def __init__(self):
    # Use a separate test database when debugging.
    if crawler_config.debug:
        self.db = MongoDB('test_stock_crawler')
    else:
        self.db = MongoDB('stock_crawler')
    self.url = ""
def __init__(self):
    self.mongo = MongoDB()
    self.news_url_queue = news_url_Queue()   # holds news URLs for the crawler threads
    self.news_html_queue = news_url_Queue()  # holds fetched news HTML
    # self.log = Logging('../helloword/static/sina').get_logging()
    self.log = Logging('../Sina/sina.txt').get_logging()
def read_data_from_coupon_db(self, id=None):
    print("id =", id)
    # With no id, return the whole collection; otherwise scan for a match.
    if not id:
        return MongoDB().read_from_coupon_collection()
    reqd_data = None
    for data_entry in MongoDB().read_from_coupon_collection():
        if id in data_entry.values():
            print("data_entry =", data_entry)
            reqd_data = data_entry
    if reqd_data is not None:
        del reqd_data['_id']  # drop Mongo's ObjectId so the result is JSON-serializable
        return reqd_data
    return 'Data not available'
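# If read_from_coupon_collection() wraps a pymongo Collection, the Python-side
# scan above can be pushed into MongoDB itself. A minimal alternative sketch,
# assuming a pymongo Collection handle `coupons` and that the id is stored
# under a field named 'coupon_id' (both names are assumptions, not from the
# code above):
def read_coupon(coupons, coupon_id):
    # find_one filters server-side; the {'_id': 0} projection drops the
    # non-serializable ObjectId instead of del-ing it afterwards.
    return coupons.find_one({'coupon_id': coupon_id}, {'_id': 0}) or 'Data not available'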
def crawl_zhilian(self, city, keyword):
    # url_list = []  # TODO: turn url_list into a stack
    begin_url = 'https://fe-api.zhaopin.com/c/i/sou?start={page}&pageSize=90&cityId={city}&salary=0,0&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw={keyword}&kt=3'
    database = MongoDB('zhilian', self.keywords[keyword])
    url_list = self._get_list(begin_url, city, keyword, page_weight=90)
    print(keyword, city, 'list parser done!')
    print(len(url_list))
    self._get_content(database, url_list)
def crawl_qiancheng(self, city, keyword):
    # "&degreefrom" had been mangled to "°reefrom" by HTML-entity decoding.
    begin_url = 'https://search.51job.com/list/{city},000000,0000,00,9,99,{keyword},2,{page}.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
    database = MongoDB('qiancheng', self.keywords[keyword])
    url_list = self._get_list(begin_url, city, keyword, page_begin=1, web_name='qiancheng')
    print(keyword, city, 'list parser done!')
    if url_list:
        print(len(url_list))
        self._get_content(database, url_list, web_name='qiancheng')
def crawl_liepin(self, city, keyword):
    begin_url = "https://www.liepin.com/city-{city}/zhaopin/pn{page}/?d_pageSize=40&jobKind=2&key={keyword}"
    database = MongoDB('liepin', self.keywords[keyword])
    url_list = self._get_list(begin_url, city, keyword, page_begin=0, web_name='liepin')
    print(keyword, city, 'list parser done!')
    if url_list:
        print(len(url_list))
        self._get_content(database, url_list, web_name='liepin')
def main():
    m = MongoDB()
    tasks = []
    all_pairs = m.get_all_pairs()
    for address in all_pairs[:100]:
        pair = Web3.toChecksumAddress(address)
        task = log_loop(pair, 60)  # coroutine object; awaited together below
        tasks.append(task)
    print('{} Starting...'.format(len(tasks)))
    loop = asyncio.get_event_loop()
    try:
        loop.run_until_complete(asyncio.gather(*tasks))
    finally:
        loop.close()
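# Design note: calling log_loop(pair, 60) only creates a coroutine object;
# nothing runs until asyncio.gather() schedules them on the event loop, at
# which point all (up to) 100 pair watchers run concurrently rather than one
# after another.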
def get_all_status_thread(self, status_list=None, collect_name='status',
                          trim_user=True, include_entities=True):
    # Avoid a mutable default argument: a shared list would persist across calls.
    if status_list is None:
        status_list = []
    wrapper_func = handle_exception(self.get_status)
    db = MongoDB().connect()
    collect = db[collect_name]
    while len(status_list) > 0:
        status_id = status_list.pop(0)
        status_obj = wrapper_func(status_id)
        status = self.tweetobj_to_dict(status_obj)
        if not status:
            continue
        try:
            collect.insert_one(status)
        except Exception:
            continue
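# Why the default changed from status_list=[] to None: Python evaluates default
# values once, at def time, so a list default is shared by every call that
# omits the argument. A standalone demonstration of the pitfall:
def _append_default(items=[]):  # the buggy pattern, kept on purpose
    items.append('x')
    return len(items)

# _append_default() -> 1, _append_default() -> 2: the same list survived
# between calls, which is rarely what a work-queue parameter should do.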
def getCookies(weibo):
    """Log in to each Weibo account with Selenium and cache its cookies."""
    db = MongoDB()
    cookies = []
    loginURL = 'https://passport.weibo.cn/signin/login'
    if db.Cookies.count() < 10:
        print('-----------------------------------------')
        print('Start crawl cookies')
        print('-----------------------------------------')
        for elem in weibo:
            account = elem['no']
            password = elem['psw']
            item = {'account': account}
            if db.find_cookie(item):
                continue
            try:
                driver = webdriver.Chrome()
                driver.get(loginURL)
                time.sleep(2)
                failure = 0
                while "登录 - 新浪微博" in driver.title and failure < 5:  # "Log in - Sina Weibo"
                    failure += 1
                    driver.set_window_size(1920, 1080)
                    username = driver.find_element_by_id("loginName")
                    username.clear()
                    username.send_keys(account)
                    psd = driver.find_element_by_id("loginPassword")
                    psd.clear()
                    psd.send_keys(password)
                    commit = driver.find_element_by_id("loginAction")
                    commit.click()
                    time.sleep(10)
                # cookie = driver.get_cookies()
                # print(cookie)
                cookie = {}
                if "微博 - 随时随地发现新鲜事" in driver.title:  # "Weibo - discover what's new, anytime"
                    for c in driver.get_cookies():  # renamed from `elem`, which shadowed the account dict
                        cookie[c["name"]] = c["value"]
                    if len(cookie) > 0:
                        item = {'account': account, 'password': password, 'cookie': cookie}
                        db.Cookies.insert_one(item)
                        cookies.append(cookie)
                        print("*******************************")
                        print("Get Cookie Successful: %s!!!!!!" % account)
                        print("*******************************")
                        continue
                print("*******************************")
                print("Get Cookie Failed: %s!" % account)
                print("*******************************")
            except Exception as e:
                print("*******************************")
                print("%s Failure!!!!!" % account)
                print(e)
                print("*******************************")
            finally:
                # the original body was truncated here; closing the browser is
                # the assumed cleanup
                driver.quit()
# (start of parse_token truncated in the source; the fragment below is the
# tail of its try/except block)
        decimals = 18
    except Exception as e:
        traceback.print_exc()
        sys.exit(-1)
    print('{} {}({}) {}'.format(token, name, symbol, decimals))
    doc = {
        'address': token,
        'name': name,
        'symbol': symbol,
        'decimals': decimals,
    }
    return doc


if __name__ == '__main__':
    db = MongoDB()
    tokens = db.get_all_tokens()
    parse_tokens = []
    for token in tokens:
        if db.get_token(token) is None:  # only parse tokens not already stored
            parse_tokens.append(token)
        # if len(parse_tokens) == 6:
        #     break
    print(len(parse_tokens))
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = []
        for token in parse_tokens:
            futures.append(executor.submit(parse_token, token))
        for future in concurrent.futures.as_completed(futures):
            doc = future.result()
def get_user_all_timeline(self, user_id=None, collect_name="tweets_task",
                          screen_name=None, include_rts=True, exclude_replies=False):
    if user_id is None and screen_name is None:
        return None
    if user_id:
        try:
            user_id = int(user_id)  # was long() in the original Python 2 code
        except Exception as e:
            print(e)
            return None
    flag = True
    tweets = [0]
    sleep_count = 0
    db = MongoDB().connect()
    collect = db[collect_name]
    get_api = self.get_api
    while len(tweets) > 0:
        try:
            if flag:
                # first page: the user's newest 200 tweets
                tweets = get_api().GetUserTimeline(
                    user_id=user_id, screen_name=screen_name,
                    include_rts=include_rts, exclude_replies=exclude_replies,
                    trim_user=True, count=200)
                flag = False
            else:
                # later pages: everything strictly older than the last tweet seen
                tweets = get_api().GetUserTimeline(
                    user_id=user_id, screen_name=screen_name,
                    include_rts=include_rts, exclude_replies=exclude_replies,
                    trim_user=True, count=200, max_id=tweets[-1].id - 1)
        except error.TwitterError as te:
            try:
                if te.message == 'Not authorized.':
                    print('Not authorized.')
                    return
                if te.message[0]['code'] == 88:  # rate limit exceeded
                    sleep_count += 1
                    if sleep_count >= API_COUNT:
                        print("sleeping...")
                        sleep_count = 0
                        time.sleep(300)
                    continue
                else:
                    print(te)
                    break
            except Exception as ee:
                print(ee)
                break
        except Exception:
            break
        for tt in tweets:
            tweet = self.tweetobj_to_dict(tt)
            if not tweet:
                continue
            try:
                collect.insert_one(tweet)
            except Exception:
                continue
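# Design note: paging with max_id=tweets[-1].id - 1 follows Twitter's
# documented timeline cursoring. Each request returns tweets strictly older
# than those already fetched, and the loop ends when a page comes back empty.
# The "- 1" matters because max_id is inclusive: without it, the oldest tweet
# of each page would be fetched again as the newest tweet of the next page.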
def convert_data(src_collection, dst_collection):  # signature reconstructed from the call below
    logger.info(f'Converting: {src_collection.__name__} ({db.get_collection_size(src_collection)})'
                f' -> {dst_collection.__name__} ({db.get_collection_size(dst_collection)})')
    docs = src_collection.objects()
    total_count = docs.count()
    for current_count, src_doc in enumerate(docs):
        log_progress(current_count, total_count)
        try:
            mapped_doc = map_document(src_doc)
        except (DocumentConversionError, DocumentConstructionError) as e:
            logger.warning(f'Skipping: {src_doc} because of: {e}')
            continue
        mapped_doc.create_or_update()
    with db.connect():
        logger.info(f'Total {dst_collection.__name__} count: ({db.get_collection_size(dst_collection)})')
        logger.info(f'Documents.Conclusion count: ({db.get_collection_size(Documents.Conclusion)})')
        logger.info(f'Documents.License count: ({db.get_collection_size(Documents.License)})')


if __name__ == '__main__':
    logger = root_logger('convert_data', logging.INFO)
    try:
        db = MongoDB()  # credentials for MongoDB can be set up here
        convert_data(Documents.FileRaw, Documents.File)
        logger.info('Success')
    except Exception as e:
        logger.info(e, exc_info=True)
async def get_and_store(self, device):
    """Get SNMP information and add it to the database."""
    mongo = MongoDB()
    host = device.ip
    community = device.snmp_community
    port = device.snmp_port
    results = await asyncio.gather(
        asyncio.ensure_future(get_system_info(host, community, port)),
        asyncio.ensure_future(get_routes(host, community, port)),
        asyncio.ensure_future(get_ip_addr(host, community, port)),
        asyncio.ensure_future(get_interfaces(host, community, port)),
        asyncio.ensure_future(get_cdp(host, community, port)),
        # asyncio.ensure_future(get_lldp(host, community, port)),  # TODO
    )
    if all(r is None for r in results):
        logging.debug("SNMP server for device ip %s has gone down", host)
        return
    system_info = results[0]
    routes = results[1]
    ip_addrs = results[2]
    interfaces = results[3]
    cdp = results[4]
    # lldp = results[5]  # TODO
    # Attach each interface's IPv4 address and subnet by matching indexes.
    for interface in interfaces:
        for ip_addr in ip_addrs:
            if interface['index'] == ip_addr['if_index']:
                interface['ipv4_address'] = ip_addr['ipv4_address']
                interface['subnet'] = ip_addr['subnet']
                break
    # print(interfaces[0])
    my_device = mongo.db.device.find_one({'device_ip': host})
    if my_device:
        for interface in interfaces:
            for my_interface in my_device['interfaces']:
                if interface['description'] == my_interface['description']:
                    # Inbound delta since the last poll
                    in_octets = interface['in_octets'] - my_interface['in_octets']
                    in_in_time = system_info['uptime'] - my_device['uptime']
                    bw_in_usage_percent = sdn_utils.cal_bw_usage_percent(
                        in_octets, interface['speed'], in_in_time)
                    # Outbound delta since the last poll
                    out_octets = interface['out_octets'] - my_interface['out_octets']
                    out_in_time = system_info['uptime'] - my_device['uptime']
                    bw_out_usage_percent = sdn_utils.cal_bw_usage_percent(
                        out_octets, interface['speed'], out_in_time)
                    # Add information
                    interface['bw_in_usage_octets'] = in_octets
                    interface['bw_in_usage_percent'] = bw_in_usage_percent
                    interface['bw_out_usage_octets'] = out_octets
                    interface['bw_out_usage_percent'] = bw_out_usage_percent
                    interface['bw_usage_update'] = time.time()
                    logging.debug(' || BW in usage %.3f || %d bytes',
                                  bw_in_usage_percent, in_octets)
                    logging.debug(' || BW out usage %.3f || %d bytes',
                                  bw_out_usage_percent, out_octets)
                    break
    system_info['interfaces'] = interfaces
    # Clear old routes, then insert the fresh ones
    mongo.db.route.delete_many({'device_ip': host})
    mongo.db.route.insert_many(routes)
    # keyed on device_ip so the upsert targets the document found above
    mongo.db.device.update_one({'device_ip': host},
                               {'$set': system_info}, upsert=True)
    # Insert CDP neighbors
    mongo.db.cdp.update_one({'device_ip': host},
                            {'$set': {'device_ip': host, 'neighbor': cdp}},
                            upsert=True)
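# sdn_utils.cal_bw_usage_percent is not shown in this file. A plausible sketch,
# assuming the counters are octets and interface['speed'] is in bits per second
# (both assumptions about the helper, not confirmed by the code above):
def cal_bw_usage_percent(delta_octets, speed_bps, delta_seconds):
    # utilization % = bits transferred / bits the link could carry in that time
    if speed_bps <= 0 or delta_seconds <= 0:
        return 0.0
    return (delta_octets * 8) / (speed_bps * delta_seconds) * 100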
def drop_coupon_table(self):
    MongoDB().drop_coupon_collection()

def write_data_to_coupon_db(self, info):
    MongoDB().write_to_coupon_collection(info)

def drop_product_table(self):
    MongoDB().drop_product_collection()

def write_data_to_product_db(self, info):
    MongoDB().write_to_product_collection(info)
def run_server():
    global db
    db = MongoDB()
    run(host=host, port=port)  # blocks until the server is stopped
    db.clear_db()  # runs only after the server loop exits
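# Because run() blocks until the server stops, clear_db() is skipped entirely
# if run() raises. A try/finally over the same names makes the cleanup
# unconditional (a small sketch, not the original implementation):
def run_server_with_cleanup():
    global db
    db = MongoDB()
    try:
        run(host=host, port=port)
    finally:
        db.clear_db()  # runs on both clean shutdown and error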
# data['临床表现'] = data['临床表现'].str.replace(r'等.*?(?:\s|$)|[,。,.、;;]', ' ')
# data['临床表现'] = data['临床表现'].str.replace(r'或|常伴|伴有?|发生|[轻甚]则|甚至', '')
# data['临床表现'] = data['临床表现'].str.replace(r'[^\s]{9,}', ' ')
# data['临床表现'] = data['临床表现'].str.replace(r'\s+', ' ')
data['临床表现'] = data['临床表现'].str.strip()     # '临床表现' = "clinical manifestations"
data.drop_duplicates('病证', 'last', inplace=True)  # '病证' = "syndrome"; keep the last entry
# data = data['临床表现'].str.split().tolist()
# data = [j for i in data for j in i]
# counter = Counter(data)
# print(data['临床表现'])
# data.to_excel('bz2.xls', index=False)

# bz = pd.read_excel('bz.xls')[['病症', '临床表现']]
mongo = MongoDB()
# note: find_all must return a list here, since food_info is iterated twice
# below; a one-shot cursor would be exhausted by the first pass
food_info = mongo.find_all('diet_merge', projection={'name': 1, 'ingredients': 1, 'syndrome': 1})
food_info_df = pd.DataFrame(
    data=[[f['name'], f['ingredients'], f['syndrome']] for f in food_info],
    columns=['食疗方', '食材', '主治'])  # dietary remedy, ingredients, indications
food_info_df['存在关联'] = 0  # '存在关联' = "association exists" flag
# Normalize syndrome names: strip 证/型 ("syndrome"/"type") suffixes, then
# re-append a uniform 证 to every non-empty name.
food_info_df['主治'] = food_info_df['主治'].str.replace('证', '').str.replace('型', '')
food_info_df.loc[food_info_df['主治'] != '', '主治'] = food_info_df['主治'] + '证'
food_bz = [item['syndrome'] for item in food_info]
food_bz = list(filter(None, food_bz))
print('Food(Total): {}'.format(len(food_bz)))
food_bz = set([item.replace('证', '').replace('型', '') for item in food_bz])
results = []
n_valid_bz = 0
valid_bz_set = set()
for item in food_bz:
# influx.py
import time
from datetime import datetime
import concurrent.futures

from helper import DeFiContract
from web3 import Web3
from web3 import exceptions
from kfk import KafkaDB
from database import MongoDB

k = KafkaDB()
m = MongoDB()


def scrapReserves(pair_address):
    """Read the current reserves of a liquidity-pair contract."""
    pair = Web3.toChecksumAddress(pair_address)
    contract = DeFiContract(pair, 'Pair')
    r0, r1, _ = contract.getReserves()
    print('{} {} {}'.format(pair, r0, r1))
    doc = {
        'address': pair,
        'r0': r0,
        'r1': r1,
        't': datetime.utcnow().timestamp(),
    }
    return doc


while True:
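    # (loop body truncated in the source) Design note: getReserves() on a
    # Uniswap V2-style pair returns the triple (reserve0, reserve1,
    # blockTimestampLast); the trailing "_" in scrapReserves discards the
    # timestamp of the last reserve-changing block, keeping only the balances.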
# -*- coding: utf-8 -*-
from database import MongoDB
import check
import os
from telegram.ext import Updater, CommandHandler, MessageHandler, Filters

Token = "376593798:AAHMNABESGpXiFGiQ8Bg-0CnHc2EwyXD1hk"
updater = Updater(token=Token)
dispatcher = updater.dispatcher
mongodb = MongoDB()
admins = ["utkucanbykl", "vlademir92", "badgeekluck"]
users = ["utkucanbykl", "vlademir92", "badgeekluck"]


def start(bot, update):
    # "Bot çalışıyor." = "The bot is running."
    bot.sendMessage(chat_id=update.message.chat_id, text="Bot çalışıyor.")


def hello(bot, update):
    bot.sendMessage(chat_id=update.message.chat_id,
                    text="Hello " + update.message.from_user.first_name)


def echo(bot, update):
import os
import logging
import logging.handlers

from core import PastePwn
from scraping.pastebin import PastebinScraper
from database import MongoDB
# import paths for the actions/analyzers below are assumed; they were missing
# from the original snippet
from actions.telegram_action import TelegramAction
from analyzers.mail_analyzer import MailAnalyzer
from analyzers.word_analyzer import WordAnalyzer

logdir_path = os.path.dirname(os.path.abspath(__file__))
logfile_path = os.path.join(logdir_path, "logs", "pastepwn.log")
if not os.path.exists(os.path.join(logdir_path, "logs")):
    os.makedirs(os.path.join(logdir_path, "logs"))

logfile_handler = logging.handlers.WatchedFileHandler(logfile_path, "a", "utf-8")
logger = logging.getLogger(__name__)
logging.basicConfig(format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
                    level=logging.DEBUG,
                    handlers=[logfile_handler, logging.StreamHandler()])

# Framework code
database = MongoDB(ip="192.168.240.128")
pastepwn = PastePwn(database)
pastepwn.add_scraper(PastebinScraper())

telegram_action = TelegramAction(token="token", receiver="-1001348376474")
mail_analyzer = MailAnalyzer(telegram_action)
premium_analyzer = WordAnalyzer(telegram_action, "premium")

pastepwn.add_analyzer(mail_analyzer)
pastepwn.add_analyzer(premium_analyzer)
pastepwn.start()