def main():
    db = MongoDB()

    def begin_callback():
        log.info('\n********** live_app begin **********')
        db.delete('LiveApp_urls', {})
        db.update('LiveApp_anchor_info', {}, {'live_view': 0})
        db.update('LiveApp_anchor_info', {}, {'watched_count': 0})
        db.update('LiveApp_anchor_info', {}, {'read_status': 0})

    def end_callback():
        # Update keyword status: done
        log.info('\n********** live_app end **********')
        export_data.main()

    # Configure the spider
    spider = Spider(tab_urls='LiveApp_urls',
                    tab_site='LiveApp_site_info',
                    tab_content='LiveApp_anchor_info',
                    parser_count=1,
                    begin_callback=begin_callback,
                    end_callback=end_callback,
                    content_unique_key='room_id')

    # Register the parsers
    spider.add_parser(inke_parser)
    spider.add_parser(huajiao_parser)
    spider.add_parser(momo_parser)

    spider.start()
def main():
    db = MongoDB()

    while True:
        def begin_callback():
            log.info('\n********** proxies begin **********')
            db.delete('proxies_urls')

        def end_callback():
            log.info('\n********** proxies end **********')
            # Update task status: done
            # Export data
            # export_data = ExportData(source_table='', aim_table='', key_map='', unique_key='')
            # export_data.export_to_oracle()

        # Configure the spider
        spider = Spider(tab_urls='proxies_urls',
                        tab_site='proxies_site_info',
                        tab_content='proxies_content_info',
                        parser_count=1,
                        begin_callback=begin_callback,
                        end_callback=end_callback,
                        parser_params={},
                        content_unique_key='ip')

        # Register the parser
        spider.add_parser(gaoni_parser)

        spider.start()

        # time.sleep(60)
        break
def main():
    db = MongoDB()

    def begin_callback():
        log.info('\n********** wp begin **********')
        db.delete('WP_urls', {})

    def end_callback():
        # Update keyword status: done
        log.info('\n********** wp end **********')
        export_data.main()

    # Configure the spider
    spider = Spider(tab_urls='WP_urls',
                    tab_site='WP_site_info',
                    tab_content='WP_content_info',
                    parser_count=20,
                    begin_callback=begin_callback,
                    end_callback=end_callback,
                    content_unique_key='title')

    # Register the parsers
    spider.add_parser(dongmanla_parser)
    # spider.add_parser(zx_novel_parser)
    # spider.add_parser(jisu_cartoon_parser)
    # spider.add_parser(ximalaya_parser)

    spider.start()
def main():
    search_keyword1 = ['成龙']  # e.g. Jackie Chan
    search_keyword2 = []
    search_keyword3 = []

    def begin_callback():
        log.info('\n********** VA begin **********')

    def end_callback():
        # Update keyword status: done
        log.info('\n********** VA end **********')

    # Configure the spider
    spider = Spider(tab_urls='VA_urls',
                    tab_site='VA_site_info',
                    tab_content='VA_content_info',
                    parser_count=1,
                    begin_callback=begin_callback,
                    end_callback=end_callback,
                    search_keyword1=search_keyword1,
                    search_keyword2=search_keyword2,
                    search_keyword3=search_keyword3)

    # Register the parsers
    # spider.add_parser(baidu_parser)
    # spider.add_parser(magnet_parser)
    # spider.add_parser(netdisk_parser)
    # spider.add_parser(weibo_parser)
    # spider.add_parser(wechat_parser)
    # spider.add_parser(soubaidupan_parser)
    spider.add_parser(douban_parser)

    spider.start()
def main():
    while True:
        if task_status.is_doing:
            log.debug('still working, sleeping %ss...' % SLEEP_TIME)
            time.sleep(SLEEP_TIME)
            continue

        task_status.is_doing = True
        keywords = Keywords().get_keywords()

        def begin_callback():
            log.info('\n********** spider_main begin **********')

        def end_callback():
            log.info('\n********** spider_main end **********')
            task_status.is_doing = False

        # Configure the spider
        spider = Spider(tab_list,
                        tab_unique_key_list,
                        tab_ensure_index_list,
                        parser_count=1,
                        site_parsers=parser_siteid_list,
                        begin_callback=begin_callback,
                        end_callback=end_callback,
                        parser_params=keywords)

        # Register the parsers
        for parser in parser_list:
            spider.add_parser(parser)

        spider.start()
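# `task_status` above is shared module state whose source is not shown in this
# file. A minimal sketch of what it presumably is (an assumption, not the
# project's actual module): a plain object whose `is_doing` flag serializes
# crawl rounds.

class TaskStatus:
    """Single-flag guard so only one crawl round runs at a time."""

    def __init__(self):
        self.is_doing = False  # True while a spider round is in flight


task_status = TaskStatus()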
def main():
    db = OracleDB()
    mongodb = MongoDB()

    sql = ('select t.KEYWORD, t.monitor_type from TAB_MVMS_SEARCH_INFO t '
           'where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time '
           'and search_type = 702')
    result_list = db.find(sql, fetch_one=False)
    if not result_list:
        log.debug('no task, exiting')
        return

    parser_params = {'result_list': result_list}
    # parser_params = []
    # for i in result:
    #     parser_params.extend(str(i[0]).split(','))

    def begin_callback():
        log.info('\n********** WWA_weibo_user begin **********')
        mongodb.delete('WWA_weibo_user_urls')

    def end_callback():
        # Export data
        key_map = {
            'id': 'int__id',
            'name': 'str_name',
            'sex': 'int_sex',
            'summary': 'str_summary',
            'fans_count': 'int_fans_count',
            'blog_verified': 'str_blog_verified',
            'is_verified': 'int_is_verified',
            'account_url': 'str_url',
            'follow_count': 'int_follow_count',
            'image_url': 'str_image_url',
            'monitor_status': 'vint_401',
            'SEARCH_TYPE': 'vint_702',
            'region': 'str_area'
        }

        export = ExportData('WWA_weibo_user_info', 'tab_mvms_weibo_info',
                            key_map, 'account_url')
        export.export_to_oracle()
        log.info('\n********** WWA_weibo_user end **********')

    # Configure the spider
    spider = Spider(tab_urls='WWA_weibo_user_urls',
                    tab_site='WWA_site_info',
                    tab_content='WWA_weibo_user_info',
                    parser_count=1,
                    begin_callback=begin_callback,
                    end_callback=end_callback,
                    parser_params=parser_params)

    # Register the parser
    spider.add_parser(weibo_user_parser)

    spider.start()
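# The key_map values above appear to encode a type prefix plus the source
# field: the dict key is the target Oracle column, and the value is
# '<type>_<mongo_field>' ('int_', 'str_', 'date_', 'clob_'), while values
# like 'vint_401' seem to write the literal 401. This reading is inferred
# from the names, not from ExportData's source. A minimal sketch of the split:

def parse_key_map_value(value):
    """Split 'str_url' -> ('str', 'url'); 'vint_702' -> ('vint', '702')."""
    prefix, _, field = value.partition('_')
    return prefix, field


assert parse_key_map_value('str_url') == ('str', 'url')
assert parse_key_map_value('int__id') == ('int', '_id')
assert parse_key_map_value('vint_702') == ('vint', '702')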
def main():
    while True:
        if task_status.is_doing:
            log.debug('still working, not fetching a new task')
            tools.delay_time(SEARCH_TASK_SLEEP_TIME)
            continue

        task_status.is_doing = True

        # Fetch a task from the master
        get_task_url = MASTER_ADDRESS + '/task/get_task'
        print(get_task_url)
        update_task_url = MASTER_ADDRESS + '/task/update_task'

        data = tools.get_json_by_requests(get_task_url)
        # tasks = [[209690, '百度新闻', 11, 'http://news.baidu.com/?tn=news', 3]]
        print(data)
        tasks = data.get('tasks', [])
        parser_count = data.get('thread_count')

        def begin_callback():
            log.info('\n********** news begin **********')
            # Update task status: doing
            data = {'tasks': str(tasks), 'status': 602}
            if tools.get_json_by_requests(update_task_url, data=data):
                log.debug('task status updated: in progress...')

        def end_callback():
            log.info('\n********** news end **********')
            task_status.is_doing = False
            # Update task status: done
            data = {'tasks': str(tasks), 'status': 603}
            if tools.get_json_by_requests(update_task_url, data=data):
                log.debug('task status updated: done!')

        # Configure the spider
        spider = Spider(tab_urls='news:news_urls',
                        parser_count=parser_count,
                        begin_callback=begin_callback,
                        end_callback=end_callback,
                        parser_params=tasks,
                        delete_tab_urls=False)

        # Register the parser
        spider.add_parser(news_parser)

        spider.start()
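# tools.get_json_by_requests is an in-house helper whose source is not shown
# here. A sketch of what it presumably does, assuming it wraps the `requests`
# library and returns {} on failure (assumptions, not the real implementation):

import requests


def get_json_by_requests(url, data=None):
    """GET the url (or POST when form data is given) and return parsed JSON."""
    try:
        if data is None:
            response = requests.get(url, timeout=30)
        else:
            response = requests.post(url, data=data, timeout=30)
        return response.json()
    except (requests.RequestException, ValueError):
        return {}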
def main():
    db = MongoDB()
    db.set_unique_key('WWA_app_vioation_content_info', 'url')
    db.set_ensure_index('WWA_app_vioation_content_info', 'read_status')

    def begin_callback():
        log.info('\n********** WWA_APP begin **********')
        db.delete('WWA_app_urls', {})

    def end_callback():
        export_data.main()
        log.info('\n********** WWA_APP end **********')

    # Configure the spider
    spider = Spider(tab_urls='WWA_app_urls',
                    tab_site='WWA_app_site_info',
                    tab_content='WWA_app_content_info',
                    parser_count=1,
                    begin_callback=begin_callback,
                    end_callback=end_callback,
                    parser_params={})

    # Register the parsers
    spider.add_parser(headline_parser)
    spider.add_parser(kuaibao_parser)

    spider.start()
def main():
    search_task_sleep_time = int(
        tools.get_conf_value('config.conf', 'task', 'search_task_sleep_time'))

    # Update task status: reset in-progress tasks back to waiting
    while True:
        # If a task is already in progress, sleep and continue
        # TODO
        search_keyword1 = ['hi']
        search_keyword2 = ['hello']
        search_keyword3 = ['hello, hi']
        task_id = 1

        # If no task is available, sleep and continue
        # TODO

        def begin_callback():
            log.info('\n********** template begin **********')
            # Update task status: doing

        def end_callback():
            log.info('\n********** template end **********')
            # Update task status: done
            # Export data
            # export_data = ExportData(source_table='', aim_table='', key_map='', unique_key='')
            # export_data.export_to_oracle()

        # Configure the spider
        # spider = Spider(tab_urls='template_urls', tab_site='template_site_info', tab_content='', parser_count=1, begin_callback=begin_callback, end_callback=end_callback)
        spider = Spider(tab_urls='template_urls',
                        tab_site='template_site_info',
                        tab_content='template_content_info',
                        parser_count=1,
                        begin_callback=begin_callback,
                        end_callback=end_callback,
                        search_keyword1=search_keyword1,
                        search_keyword2=search_keyword2,
                        search_keyword3=search_keyword3)

        # Register the parsers
        spider.add_parser(xxx_parser)
        spider.add_parser(yyy_parser)

        spider.start()

        # time.sleep(search_task_sleep_time)
        break
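# The two TODOs above describe the same polling the other runners implement.
# A sketch of how they could be filled in, with hypothetical `get_task` and
# `is_doing` callables standing in for whatever task source the template is
# wired to:

import time


def wait_for_task(get_task, is_doing, sleep_time):
    """Block until no run is active and the task source yields a task."""
    while True:
        if is_doing():
            time.sleep(sleep_time)  # a previous run is still in flight
            continue
        task = get_task()
        if task:
            return task
        time.sleep(sleep_time)      # no task available, poll again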
def main():
    def begin_callback():
        log.info('\n********** VA_APP begin **********')
        db = MongoDB()
        db.delete('VAApp_urls', {})

    def end_callback():
        export_data.main()
        log.info('\n********** VA_APP end **********')

    # Configure the spider
    spider = Spider(tab_urls='VAApp_urls',
                    tab_site='VAApp_site_info',
                    tab_content='VAApp_content_info',
                    parser_count=1,
                    begin_callback=begin_callback,
                    end_callback=end_callback,
                    parser_params={})

    # Register the parsers
    spider.add_parser(headline_parser)
    spider.add_parser(kuaibao_parser)

    spider.start()
def main():
    oracledb = OracleDB()

    sql = ('select t.KEYWORD, t.monitor_type from TAB_MVMS_SEARCH_INFO t '
           'where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time '
           'and search_type = 701')
    result_list = oracledb.find(sql)  # [(keywords, monitor_type), ...]
    if not result_list:
        log.debug('no task, exiting')
        return

    # print(result_list)
    # keywords = []
    # for result in result_list:
    #     keywords.extend(result[0].split(','))

    def begin_callback():
        log.info('\n********** WWA_wechat_account begin **********')
        db = MongoDB()
        db.delete('WWA_wechat_account_url', {})

    def end_callback():
        log.info('\n********** WWA_wechat_account end **********')
        export_data.account_main()

    parser_params = {'result_list': result_list}

    # Configure the spider
    spider = Spider(tab_urls='WWA_wechat_account_url',
                    tab_site='WWA_wechat_site_info',
                    tab_content='WWA_wechat_official_accounts',
                    content_unique_key='account_id',
                    parser_count=1,
                    begin_callback=begin_callback,
                    end_callback=end_callback,
                    parser_params=parser_params)

    # Register the parser
    spider.add_parser(wechat_account_parser)

    spider.start()
def main():
    oracledb = OracleDB()

    sql = 'select t.account_id, t.monitor_type from TAB_MVMS_WECHAT_INFO t where monitor_status = 402'
    result_list = oracledb.find(sql)
    if not result_list:
        log.debug('no task, exiting')
        return

    # keywords = []
    # for result in result_list:
    #     keywords.append(result[0])

    def begin_callback():
        log.info('\n********** WWA_wechat_article begin **********')
        db = MongoDB()
        db.delete('WWA_wechat_article_url', {})

    def end_callback():
        log.info('\n********** WWA_wechat_article end **********')
        export_data.article_main()

    parser_params = result_list

    # Configure the spider
    spider = Spider(tab_urls='WWA_wechat_article_url',
                    tab_site='WWA_wechat_site_info',
                    tab_content='WWA_wechat_article',
                    content_unique_key='title',
                    parser_count=1,
                    begin_callback=begin_callback,
                    end_callback=end_callback,
                    parser_params=parser_params)

    # Register the parser
    spider.add_parser(wechat_article_parser)

    spider.start()
def main():
    def begin_callback():
        # mongo_db = MongoDB()
        # mongo_db.update('ZHEJIANG_APP_urls', {'depth': 0}, {'status': 0})
        log.info('\n********** spider_main begin **********')

    def end_callback():
        log.info('\n********** spider_main end **********')

    # Configure the spider
    spider = Spider(tab_list,
                    tab_unique_key_list,
                    tab_ensure_index_list,
                    parser_count=1,
                    site_parsers=parser_siteid_list,
                    begin_callback=begin_callback,
                    end_callback=end_callback,
                    parser_params={})

    # Register the parsers
    for parser in parser_list:
        spider.add_parser(parser)

    spider.start()
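# tab_list, tab_unique_key_list, tab_ensure_index_list, parser_siteid_list and
# parser_list are module-level configuration imported by this runner. The
# shapes below are an illustration inferred from the keyword-style runners
# (which pass urls/site/content tables and per-table keys), not the project's
# real config:

tab_list = ['ZHEJIANG_APP_urls',          # url queue table
            'ZHEJIANG_APP_site_info',     # site info table
            'ZHEJIANG_APP_content_info']  # content table
tab_unique_key_list = [None, None, 'url']            # dedup key per table
tab_ensure_index_list = [None, None, 'read_status']  # secondary index per table
parser_siteid_list = []  # site ids, aligned with parser_list
parser_list = []         # parser modules to register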
def main():
    db = OracleDB()

    sql = '''
        select t.program_id, c.chan_name, program_name, d.name, t.image_url, t.official_blog
          from TAB_MMS_PROGRAM t
          left join tab_mam_chan c on c.chan_id = t.chan_id
          left join tab_mms_dictionary d on t.type = d.id and d.type = 2
    '''
    # where t.program_id = 226
    program_info = db.find(sql)

    def begin_callback():
        log.info('\n********** news begin **********')
        # Update task status: doing

    def end_callback():
        log.info('\n********** news end **********')

    # Configure the spider
    spider = Spider(tab_urls='mms_urls',
                    begin_callback=begin_callback,
                    end_callback=end_callback,
                    delete_tab_urls=True,
                    parser_params=program_info)

    # Register the parsers
    # spider.add_parser(iqiyi_hot_parser)
    spider.add_parser(iqiyi_search_parser)
    # spider.add_parser(weibo_user_parser)
    # spider.add_parser(weibo_article_parser)

    spider.start()
def main():
    search_task_sleep_time = int(
        tools.get_conf_value('config.conf', 'task', 'search_task_sleep_time'))

    db = OracleDB()

    # Mark tasks within their monitoring window as not done
    sql = ('update tab_ivms_task_info t set t.task_status = 501 '
           'where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time')
    db.update(sql)

    # Mark the keywords of those tasks as not done
    sql = ('update tab_ivms_task_keyword k set k.finish_status = 601 '
           'where k.task_id in (select t.task_id from tab_ivms_task_info t '
           'where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time)')
    db.update(sql)

    while True:
        # Look for a pending task
        log.debug('querying for tasks...')
        sql = ('select t.task_id from TAB_IVMS_TASK_INFO t '
               'where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time '
               'and t.task_status = 501')
        result = db.find(sql, fetch_one=True)
        if not result:
            break

        task_id = result[0]

        while True:
            # If a keyword of this task is still being processed, wait
            sql = 'select t.* from TAB_IVMS_TASK_KEYWORD t where t.task_id = %d and finish_status = 602' % task_id
            do_task = db.find(sql, fetch_one=True)
            if do_task:
                time.sleep(search_task_sleep_time)
                continue

            sql = 'select t.* from TAB_IVMS_TASK_KEYWORD t where t.task_id = %d and finish_status = 601' % task_id
            result = db.find(sql, fetch_one=True)
            if not result:
                break

            keyword_id = result[0]
            task_id = result[1]
            search_keyword1 = []
            search_keyword2 = result[2].split(',') if result[2] else []
            search_keyword3 = result[3].split(',') if result[3] else []

            def begin_callback():
                log.info('\n********** VA begin **********')
                # Mark the task as in progress
                sql = 'update TAB_IVMS_TASK_INFO set task_status = 502 where task_id = %d' % task_id
                db.update(sql)
                # Mark the keyword as in progress
                sql = 'update tab_ivms_task_keyword set finish_status = 602 where id = %d' % keyword_id
                db.update(sql)

            def end_callback():
                # Mark the keyword as done
                sql = 'update tab_ivms_task_keyword set finish_status = 603 where id = %d' % keyword_id
                db.update(sql)

                # If every keyword of this task is done, mark the task as done
                sql = 'select t.* from tab_ivms_task_keyword t where task_id = %d and finish_status = 601' % task_id
                results = db.find(sql)
                if not results:
                    # Export data
                    key_map = {
                        'program_id': 'vint_sequence.nextval',
                        'search_type': 'int_search_type',
                        'program_name': 'str_title',
                        'program_url': 'str_url',
                        'release_date': 'date_release_time',
                        'image_url': 'str_image_url',
                        'program_content': 'str_content',
                        'task_id': 'vint_%d' % task_id,
                        'keyword': 'str_keyword',
                        'keyword_count': 'int_keyword_count',
                        'check_status': 'vint_202'
                    }

                    export = ExportData('VA_content_info', 'tab_ivms_program_info',
                                        key_map, 'program_url')
                    export.export_to_oracle()

                    # Mark the task as done
                    sql = 'update TAB_IVMS_TASK_INFO set task_status = 503 where task_id = %d' % task_id
                    db.update(sql)

                log.info('\n********** VA end **********')

            # Configure the spider
            spider = Spider(tab_urls='VA_urls',
                            tab_site='VA_site_info',
                            tab_content='VA_content_info',
                            parser_count=1,
                            begin_callback=begin_callback,
                            end_callback=end_callback,
                            search_keyword1=search_keyword1,
                            search_keyword2=search_keyword2,
                            search_keyword3=search_keyword3)

            # Register the parsers
            spider.add_parser(baidu_parser)
            spider.add_parser(magnet_parser)
            spider.add_parser(netdisk_parser)
            spider.add_parser(weibo_parser)
            spider.add_parser(wechat_parser)
            spider.add_parser(soubaidupan_parser)
            spider.add_parser(douban_parser)

            spider.start()
            time.sleep(search_task_sleep_time)
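# The numeric literals above are status codes threaded through this runner's
# SQL. Naming them makes the flow easier to follow (the names are mine; the
# values come straight from the queries above):

TASK_WAITING, TASK_DOING, TASK_DONE = 501, 502, 503           # tab_ivms_task_info.task_status
KEYWORD_WAITING, KEYWORD_DOING, KEYWORD_DONE = 601, 602, 603  # tab_ivms_task_keyword.finish_status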
def main():
    def begin_callback():
        log.info('\n********** spider_article begin **********')

    def end_callback():
        log.info('\n********** spider_article end **********')

    # Configure the spider
    spider = Spider(tab_urls='article_urls',
                    tab_site='article_site_info',
                    tab_content='article_text_info',
                    parser_count=40,
                    begin_callback=begin_callback,
                    end_callback=end_callback,
                    parser_params={})

    # Register the parsers
    spider.add_parser(cctv_parser)
    spider.add_parser(ifeng_parser)
    spider.add_parser(xinhua_parser)
    spider.add_parser(tencent_parser)
    spider.add_parser(sohu_parser)
    spider.add_parser(wangyi_parser)
    spider.add_parser(people_parser)
    spider.add_parser(sina_parser)

    spider.start()
def main():
    db = MongoDB()

    def begin_callback():
        log.info('\n********** template begin **********')
        db.delete('op_urls', {})
        db.delete('op_content_info', {})

    def end_callback():
        log.info('\n********** template end **********')
        # Update task status: done
        # Export data
        # export_data = ExportData(source_table='', aim_table='', key_map='', unique_key='')
        # export_data.export_to_oracle()

    # Configure the spider
    spider = Spider(tab_urls='op_urls',
                    tab_site='op_site_info',
                    tab_content='op_content_info',
                    parser_count=20,
                    begin_callback=begin_callback,
                    end_callback=end_callback,
                    parser_params={})

    # Register the parsers
    spider.add_parser(luzhou_parser)
    spider.add_parser(longmatan_parser)
    spider.add_parser(naxi_parser)
    spider.add_parser(luxian_parser)
    spider.add_parser(hejiang_parser)
    spider.add_parser(gulin_parser)
    spider.add_parser(luzhouzhiye_parser)
    spider.add_parser(sichuanhuagong_parser)
    spider.add_parser(luzhougaozhong_parser)
    spider.add_parser(xuyong_parser)
    spider.add_parser(jiangyang_parser)
    spider.add_parser(luzhoutianli_parser)
    spider.add_parser(sichuanluxian_parser)
    spider.add_parser(sichuan_police_parser)
    spider.add_parser(sichuanyikeda_parser)
    spider.add_parser(luzhoubaidu_parser)

    spider.start()
def main():
    db = MongoDB()
    oracle = OracleDB()

    def begin_callback():
        # db.update('WWA_app_urls', {'depth': 0}, {'status': 0})
        db.delete('WWA_search_app_urls')
        log.info('\n********** wwa begin **********')

    def end_callback():
        log.info('\n********** wwa end **********')
        export_data.main()

    result_list = oracle.find(
        'select keyword from TAB_MVMS_SEARCH_INFO '
        'where MONITOR_START_TIME <= sysdate AND MONITOR_END_TIME >= sysdate '
        'and search_type=703')
    if not result_list:
        log.debug('no task, exiting')
        return

    keywords = []
    for result in result_list:
        keywords.extend(result[0].split(','))
    parser_params = {'keywords': keywords}

    # Configure the spider
    spider = Spider(tab_urls='WWA_search_app_urls',
                    tab_site='WWA_search_app_site_info',
                    tab_content='WWA_search_app_content_info',
                    content_unique_key='title',
                    begin_callback=begin_callback,
                    end_callback=end_callback,
                    parser_params=parser_params)

    # Register the parsers
    spider.add_parser(yingyongbao_parser)
    spider.add_parser(android_market_parser)
    spider.add_parser(baidu_mobile_assistant_parser)
    spider.add_parser(mobile360_assistant_parser)

    spider.start()
def main():
    db = MongoDB()

    def begin_callback():
        db.update('GameApp_urls', {'depth': 0}, {'status': 0})
        log.info('\n********** game_app begin **********')

    def end_callback():
        log.info('\n********** game_app end **********')

    # Configure the spider
    spider = Spider(tab_urls='GameApp_urls',
                    tab_site='GameApp_site_info',
                    tab_content='GameApp_content_info',
                    begin_callback=begin_callback,
                    end_callback=end_callback)

    # Register the parsers
    spider.add_parser(yingyongbao_parser)
    spider.add_parser(android_market_parser)
    spider.add_parser(baidu_mobile_assistant_parser)
    spider.add_parser(mobile360_assistant_parser)

    spider.start()
def main():
    db = OracleDB()
    mongodb = MongoDB()

    sql = 'select t.ID, t.monitor_type from TAB_MVMS_WEIBO_INFO t where monitor_status = 402'
    result_list = db.find(sql, fetch_one=False)
    if not result_list:
        log.debug('no task, exiting')
        return

    parser_params = result_list
    # for i in result:
    #     parser_params.extend(str(i[0]).split(','))

    def begin_callback():
        log.info('\n********** WWA_weibo_info begin **********')
        mongodb.delete('WWA_weibo_info_urls')

    def end_callback():
        # Export data
        key_map = {
            'id': 'int__id',
            'release_time': 'date_release_time',
            'come_from': 'str_come_from',
            'content': 'clob_content',
            'image_url': 'str_image_url',
            'video_url': 'str_video_url',
            'transpond_count': 'int_transpond_count',
            'praise_count': 'int_praise_count',
            'check_status': 'vint_301',
            'weibo_id': 'int_weibo_id',
            'article_url': 'str_url',
            'violate_status': 'int_violate_id',
            'sensitive_id': 'int_sensitive_id',
            'record_time': 'date_record_time',
            'SEXY_IMAGE_STATUS': 'str_sexy_image_status'
        }

        export = ExportData('WWA_weibo_info_info', 'tab_mvms_weibo_article_info',
                            key_map, unique_key='ARTICLE_url',
                            condition={'read_status': 0, 'image_pron_status': 2})
        export.export_to_oracle()
        log.info('\n********** WWA_weibo_info end **********')

    # Configure the spider
    spider = Spider(tab_urls='WWA_weibo_info_urls',
                    tab_site='WWA_site_info',
                    tab_content='WWA_weibo_info_info',
                    parser_count=1,
                    begin_callback=begin_callback,
                    end_callback=end_callback,
                    parser_params=parser_params)

    # Register the parser
    spider.add_parser(weibo_info_parser)

    spider.start()