class ProvinceFilter():
    """Holds the area names of one province (or every province name when no
    province is configured) and reports which of them occur in a text."""

    def __init__(self, province_name=PROVINCE):
        # Candidate names that find_contain_air() matches against.
        self._province_airs = []
        self._db = OracleDB()

        if province_name:
            self._province_airs.append(province_name)
            province_id = self.load_province_id(province_name)
            if province_id:
                self._province_airs.extend(
                    air[0] for air in self.load_province_air(province_id))
        else:
            # No province given: fall back to every province name nationwide.
            self._province_airs.extend(
                province[0] for province in self.load_province())
        # was: a leftover debug print() of the full name list here; removed

    def load_province_id(self, province_name):
        """Return the id of the given province, or None when it is unknown."""
        # NOTE(review): the name is interpolated into the SQL text; acceptable
        # for trusted configuration values only, not for untrusted input.
        sql = "select t.id from TAB_MANAGE_PROVINCE_INFO t where t.province_name like '%{province_name}%'".format(
            province_name=province_name)
        result = self._db.find(sql)
        province_id = result[0][0] if result else None
        if not province_id:
            log.debug('TAB_MANAGE_PROVINCE_INFO 无 %s 省份' % province_name)
        return province_id

    def load_province(self):
        """Return every (province_name,) row."""
        sql = "select province_name from TAB_MANAGE_PROVINCE_INFO"
        return self._db.find(sql)

    def load_province_air(self, province_id):
        """Return every (area_name,) row belonging to the province."""
        sql = "select t.area_name from TAB_MANAGE_AREA_INFO t where t.province_id = %s" % province_id
        return self._db.find(sql)

    def load_province_town(self, province_id):
        """Return every (town_name,) row belonging to the province."""
        sql = "select t.town_name from TAB_MANAGE_TOWN_INFO t where t.province_id = %s" % province_id
        return self._db.find(sql)

    def find_contain_air(self, text):
        """Return the distinct loaded area names that appear in text."""
        # was: append-to-list then list(set(...)); a set comprehension gives
        # the same distinct result in one pass
        return list({air for air in self._province_airs if air in text})
class VipChecked(Singleton):
    """Singleton that loads the VIP site list (clue zero_id = 7) once and
    checks whether a given text refers to one of those sites."""

    def __init__(self):
        super(VipChecked, self).__init__()
        # Singleton: only initialise on the first construction.
        if not hasattr(self, '_vip_sites'):
            self._vip_sites = set()
            self._oracledb = OracleDB()
            self.load_vip_site()

    def load_vip_site(self):
        """Fill self._vip_sites from the comma-separated keyword2 CLOBs."""
        sql = 'select to_char(t.keyword2) from TAB_IOPM_CLUES t where zero_id = 7'
        rows = self._oracledb.find(sql)
        for row in rows:
            # was: the inner loop reused the name `site`, shadowing the row
            for site in row[0].split(','):
                if site:
                    self._vip_sites.add(site)

    def is_vip(self, content):
        """Return 1 when content matches a VIP site (substring either way),
        else 0. Falsy content is never a match."""
        if not content:
            return 0
        for site in self._vip_sites:
            if site in content or content in site:
                return 1
        return 0
class Keywords():
    """Loads clue keywords from TAB_IOPM_CLUES and flattens them to a list."""

    def __init__(self):
        self._oracledb = OracleDB()
        self._clues = self.get_clues()

    def get_clues(self):
        # zero_id 7 marks propagation channels, which carry no clue keywords.
        sql = 'select t.id clues_id,to_char(t.keyword2),to_char(t.keyword3),t.zero_id, FIRST_ID, second_id from TAB_IOPM_CLUES t where zero_id != 7'
        return self._oracledb.find(sql)

    def get_keywords(self):
        """Return every formatted keyword from the keyword2 column, with the
        '&' combiner replaced by a space."""
        return [key.replace('&', ' ')
                for clue in self._clues
                for key in format_keywords(clue[1])]
def get_clues():
    """Export clue names and keywords to clues/clues.csv and return them.

    Returns a dict {message, status, data} where each data item carries the
    clue id, include/exclude keywords and the clue name. Also opens the
    output folder (Windows only).
    """
    db = OracleDB()
    # zero_id 7 marks propagation channels, which carry no clue keywords.
    sql = 'select t.id clues_id,to_char(t.keyword2),to_char(t.keyword3),t.name from TAB_IOPM_CLUES t where zero_id != 7'
    results = db.find(sql)

    clues_json = {
        "message": "查询成功",
        "status": 1,
        "data": []
    }

    # was: open() without a context manager, so the handle leaked on error
    with open('clues/clues.csv', 'w+', encoding='utf8') as file:
        file.write("线索,关键词\n")
        for result in results:
            print(result)
            # Legacy cleaning rule: a field ending with ',' gets '"'->'“',
            # '、' removed and the trailing comma dropped; otherwise '"' is
            # simply deleted.
            data = {
                "线索id": result[0] if result[0] else "",
                "包含": "%s" % (result[1].replace('"', '“').replace('、', '')[:-1] if result[1][-1] == ',' else result[1].replace('"', '')) if result[1] else "",
                "不包含": "%s" % (result[2].replace('"', '“').replace('、', '')[:-1] if result[2][-1] == ',' else result[2].replace('"', '')) if result[2] else "",
                "线索": result[3] if result[3] else ""
            }
            print(data)
            clues_json["data"].append(data)
            file.write('"%s","%s"\n' % (data['线索'], data['包含']))

    os.system('start clues\\')  # Windows-only: open the output folder
    return clues_json
class CompareKeywords():
    """Matches clue keywords against a text and reports which clues hit."""

    def __init__(self):
        self._oracledb = OracleDB()
        self._clues = self.get_clues()

    def get_clues(self):
        # zero_id 7 marks propagation channels, which carry no clue keywords.
        sql = 'select t.id clues_id,to_char(t.keyword2),to_char(t.keyword3),t.zero_id, FIRST_ID, second_id from TAB_IOPM_CLUES t where zero_id != 7'
        return self._oracledb.find(sql)

    def get_contained_keys(self, text):
        '''
        @summary: find every clue whose keyword expression matches text
        ---------
        @param text: text to match against
        ---------
        @result: (keywords, clues_ids, zero_ids, first_ids, second_ids,
            keyword_clues) — the first five are comma-joined distinct values,
            keyword_clues maps each matched unit keyword to its clue id
        '''
        keywords = []
        clues_ids = []
        zero_ids = []
        first_ids = []
        second_ids = []
        keyword_clues = {}

        for clue in self._clues:
            clue_id, key2, _key3, zero_id, first_id, second_id = clue[:6]
            for key in format_keywords(key2):  # e.g. ['新闻节目', '总理&主席']
                # '总理&主席' matches only when every '&'-joined part occurs.
                unit_keys = key.split('&')
                if all(unit_key in text for unit_key in unit_keys):
                    keywords.extend(unit_keys)
                    clues_ids.append(str(clue_id))
                    zero_ids.append(str(zero_id))
                    first_ids.append(str(first_id))
                    second_ids.append(str(second_id))
                    for unit_key in unit_keys:
                        keyword_clues[unit_key] = clue_id

        return (','.join(set(keywords)), ','.join(set(clues_ids)),
                ','.join(set(zero_ids)), ','.join(set(first_ids)),
                ','.join(set(second_ids)), keyword_clues)
class SyncES():
    """Incrementally syncs Oracle tables into Elasticsearch, persisting the
    max exported id per table so later runs resume where they stopped."""

    def __init__(self):
        import ast  # stdlib; parses the persisted str(dict) snapshot
        self._es = ES()
        self._db = OracleDB()
        self._max_id = tools.read_file(STO_MAX_ID_FILE)
        # was: eval() on file content; literal_eval parses the same
        # str(dict) snapshot without executing arbitrary code
        self._max_id = ast.literal_eval(self._max_id) if self._max_id else {}

    def get_data(self, sql):
        return self._db.find(sql, to_json=True)

    def export_to_es(self, table, data, data_id):
        self._es.add(table=table, data=data, data_id=data_id)

    def sync_data(self, table, step=20):
        '''
        @summary: sync one table; `id` must be set as its primary key first
        ---------
        @param table: table name to sync
        @param step: rows fetched per batch
        ---------
        @result:
        '''
        max_id = self._max_id.get(table, 0)
        self._db.set_primary_key(table)
        while True:
            inner_sql = 'select * from %s where id > %d and rownum <= %d order by id' % (
                table, max_id, step)
            # was: referenced the module-level `sync_es` object instead of
            # self, breaking any instance not bound to that global name
            datas = self.get_data(inner_sql)
            if not datas:
                self.close()
                break
            for data in datas:
                data_id = data['ID']
                data = tools.dumps_json(data)
                max_id = data_id
                self.export_to_es(table, data, data_id)
            self._max_id[table] = max_id

    def close(self):
        # Persist progress so the next run resumes after the last exported id.
        tools.write_file(STO_MAX_ID_FILE, str(self._max_id))
def main():
    """Configure and launch the weibo-user spider for active 702 search tasks."""
    db = OracleDB()
    mongodb = MongoDB()

    # Only tasks whose monitoring window covers the current time.
    sql = 'select t.KEYWORD, t.monitor_type from TAB_MVMS_SEARCH_INFO t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time and search_type = 702'
    result_list = db.find(sql, fetch_one=False)
    if not result_list:
        log.debug('无任务 结束')
        return

    def begin_callback():
        log.info('\n********** WWA_weibo_user begin **********')
        mongodb.delete('WWA_weibo_user_urls')

    def end_callback():
        # Export crawled user info back into Oracle.
        key_map = {
            'id': 'int__id',
            'name': 'str_name',
            'sex': 'int_sex',
            'summary': 'str_summary',
            'fans_count': 'int_fans_count',
            'blog_verified': 'str_blog_verified',
            'is_verified': 'int_is_verified',
            'account_url': 'str_url',
            'follow_count': 'int_follow_count',
            'image_url': 'str_image_url',
            'monitor_status': 'vint_401',
            'SEARCH_TYPE': 'vint_702',
            'region': 'str_area'
        }
        exporter = ExportData('WWA_weibo_user_info', 'tab_mvms_weibo_info',
                              key_map, 'account_url')
        exporter.export_to_oracle()
        log.info('\n********** WWA_weibo_user end **********')

    spider = Spider(tab_urls='WWA_weibo_user_urls',
                    tab_site='WWA_site_info',
                    tab_content='WWA_weibo_user_info',
                    parser_count=1,
                    begin_callback=begin_callback,
                    end_callback=end_callback,
                    parser_params={'result_list': result_list})
    spider.add_parser(weibo_user_parser)
    spider.start()
class EventFilter(threading.Thread):
    """Background thread holding the event knowledge base in memory and
    refreshing it from Oracle once per hour."""

    def __init__(self):
        super(EventFilter, self).__init__()
        self._db = OracleDB()
        self._event_knowledges = self.load_event_knowledges()

    def run(self):
        while True:
            tools.delay_time(60 * 60)  # refresh once per hour
            print('更新事件知识库...')
            self._event_knowledges = self.load_event_knowledges()
            print('更新事件知识库完毕')

    def load_event_knowledges(self):
        '''
        @summary: load (keyword, type) rows; event type ids:
            801 current politics  802 livelihood     803 education reform
            804 health care       805 sci-tech       806 ideology (unused)
            807 policy & law      808 economy (unused) 809 ecology
            810 sports (unused)   811 emergencies (unused)
        ---------
        ---------
        @result: list of (keyword, type) tuples
        '''
        sql = 'select t.keyword, t.type from TAB_IOPM_EVENT_KNOWLEDEGE t'
        return self._db.find(sql)

    def find_contain_event(self, text):
        """Return the distinct event-type ids (as strings) whose keyword
        occurs in text."""
        return list({str(event_type)
                     for keyword, event_type in self._event_knowledges
                     if keyword in text})
def main(): db = OracleDB() sql = 'select t.id clues_id,to_char(t.keyword1),to_char(t.keyword2),to_char(t.keyword3),t.zero_id from TAB_IOPM_CLUES t' results = db.find(sql) clues_json = { "message": "查询成功", "status": 1, "data": [{ "clues_id": 104, "keyword1": "", "keyword2": "", "keyword3": "", "zero_id": 2 }] } clues_json['data'] = [] for result in results: data = { "clues_id": result[0] if result[0] else "", "keyword1": "%s" % (result[1].replace('"', '“').replace('、', '')[:-1] if result[1][-1] == ',' else result[1].replace('"', '')) if result[1] else "", "keyword2": "%s" % (result[2].replace('"', '“').replace('、', '')[:-1] if result[2][-1] == ',' else result[2].replace('"', '')) if result[2] else "", "keyword3": "%s" % (result[3].replace('"', '“').replace('、', '')[:-1] if result[3][-1] == ',' else result[3].replace('"', '')) if result[3] else "", "zero_id": result[4] if result[4] else "" } clues_json["data"].append(data) clues_json = tools.dumps_json(clues_json) print(clues_json) tools.write_file('./clues.txt', clues_json)
def main():
    """Launch the app-store search spiders for active 703 search tasks."""
    db = MongoDB()
    oracle = OracleDB()

    def begin_callback():
        db.delete('WWA_search_app_urls')
        log.info('\n********** wwa begin **********')

    def end_callback():
        log.info('\n********** wwa end **********')
        export_data.main()

    # Only tasks whose monitoring window covers the current time.
    result_list = oracle.find(
        'select keyword from TAB_MVMS_SEARCH_INFO where MONITOR_START_TIME <= sysdate AND MONITOR_END_TIME >= sysdate and search_type=703'
    )
    if not result_list:
        log.debug('无任务 结束')
        return

    # was: `keywords = []` assigned twice (the first was dead code)
    keywords = []
    for result in result_list:
        keywords.extend(result[0].split(','))
    parser_params = {'keywords': keywords}

    spider = Spider(tab_urls='WWA_search_app_urls',
                    tab_site='WWA_search_app_site_info',
                    tab_content='WWA_search_app_content_info',
                    content_unique_key='title',
                    begin_callback=begin_callback,
                    end_callback=end_callback,
                    parser_params=parser_params)
    spider.add_parser(yingyongbao_parser)
    spider.add_parser(android_market_parser)
    spider.add_parser(baidu_mobile_assistant_parser)
    spider.add_parser(mobile360_assistant_parser)
    spider.start()
def main():
    """Launch the wechat official-account spider for active 701 search tasks."""
    oracledb = OracleDB()
    # Rows look like [(keys, monitor_type), ...].
    sql = 'select t.KEYWORD, t.monitor_type from TAB_MVMS_SEARCH_INFO t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time and search_type = 701'
    result_list = oracledb.find(sql)
    if not result_list:
        log.debug('无任务 结束')
        return

    def begin_callback():
        log.info('\n********** WWA_wechat_account begin **********')
        MongoDB().delete('WWA_wechat_account_url', {})

    def end_callback():
        log.info('\n********** WWA_wechat_account end **********')
        export_data.account_main()

    spider = Spider(tab_urls='WWA_wechat_account_url',
                    tab_site='WWA_wechat_site_info',
                    tab_content='WWA_wechat_official_accounts',
                    content_unique_key='account_id',
                    parser_count=1,
                    begin_callback=begin_callback,
                    end_callback=end_callback,
                    parser_params={'result_list': result_list})
    spider.add_parser(wechat_account_parser)
    spider.start()
def main():
    """Launch the wechat article spider for accounts flagged 402 (monitored)."""
    oracledb = OracleDB()
    sql = 'select t.account_id, t.monitor_type from TAB_MVMS_WECHAT_INFO t where monitor_status = 402'
    tasks = oracledb.find(sql)
    if not tasks:
        log.debug('无任务 结束')
        return

    def begin_callback():
        log.info('\n********** WWA_wechat_article begin **********')
        MongoDB().delete('WWA_wechat_article_url', {})

    def end_callback():
        log.info('\n********** WWA_wechat_article end **********')
        export_data.article_main()

    spider = Spider(tab_urls='WWA_wechat_article_url',
                    tab_site='WWA_wechat_site_info',
                    tab_content='WWA_wechat_article',
                    content_unique_key='title',
                    parser_count=1,
                    begin_callback=begin_callback,
                    end_callback=end_callback,
                    parser_params=tasks)
    spider.add_parser(wechat_article_parser)
    spider.start()
def main():
    """Launch the program-info spider for every program defined in MMS."""
    db = OracleDB()
    # One row per program: id, channel name, program name, type name,
    # image url and official blog.
    sql = '''
        select t.program_id, c.chan_name, program_name, d.name, t.image_url, t.official_blog
        from TAB_MMS_PROGRAM t
        left join tab_mam_chan c on c.chan_id = t.chan_id
        left join tab_mms_dictionary d on t.type = d.id and d.type = 2
        '''
    program_info = db.find(sql)

    def begin_callback():
        log.info('\n********** news begin **********')

    def end_callback():
        log.info('\n********** news end **********')

    spider = Spider(tab_urls='mms_urls',
                    begin_callback=begin_callback,
                    end_callback=end_callback,
                    delete_tab_urls=True,
                    parser_params=program_info)
    spider.add_parser(iqiyi_search_parser)
    spider.start()
def main():
    """Launch the weibo-article spider for weibo accounts flagged 402."""
    db = OracleDB()
    mongodb = MongoDB()
    sql = 'select t.ID, t.monitor_type from TAB_MVMS_WEIBO_INFO t where monitor_status = 402'
    result_list = db.find(sql, fetch_one=False)
    if not result_list:
        log.debug('无任务 结束')
        return

    def begin_callback():
        log.info('\n********** WWA_weibo_info begin **********')
        mongodb.delete('WWA_weibo_info_urls')

    def end_callback():
        # Export crawled articles into Oracle; only unread rows whose images
        # passed the porn check (image_pron_status == 2) are exported.
        key_map = {
            'id': 'int__id',
            'release_time': 'date_release_time',
            'come_from': 'str_come_from',
            'content': 'clob_content',
            'image_url': 'str_image_url',
            'video_url': 'str_video_url',
            'transpond_count': 'int_transpond_count',
            'praise_count': 'int_praise_count',
            'check_status': 'vint_301',
            'weibo_id': 'int_weibo_id',
            'article_url': 'str_url',
            'violate_status': 'int_violate_id',
            'sensitive_id': 'int_sensitive_id',
            'record_time': 'date_record_time',
            'SEXY_IMAGE_STATUS': 'str_sexy_image_status'
        }
        exporter = ExportData('WWA_weibo_info_info',
                              'tab_mvms_weibo_article_info',
                              key_map,
                              unique_key='ARTICLE_url',
                              condition={'read_status': 0, "image_pron_status": 2})
        exporter.export_to_oracle()
        log.info('\n********** WWA_weibo_info end **********')

    spider = Spider(tab_urls='WWA_weibo_info_urls',
                    tab_site='WWA_site_info',
                    tab_content='WWA_weibo_info_info',
                    parser_count=1,
                    begin_callback=begin_callback,
                    end_callback=end_callback,
                    parser_params=result_list)
    spider.add_parser(weibo_info_parser)
    spider.start()
class CheckNewArticle():
    """Polls wechat official accounts and flags those that published a new
    article so the wechat spider will re-crawl them."""

    def __init__(self):
        self._oracledb = OracleDB()
        self._redisdb = RedisDB()
        self._wechat_sogo = WechatSogou()

    def get_wait_check_account(self):
        '''
        @summary: return the accounts to re-check for new articles
        ---------
        @param :
        ---------
        @result: rows of (id, domain, name, last_release_time, biz)
        '''
        # Finished accounts (spider_status = 603) whose last article is at
        # least two hours old get re-checked.
        two_hours_ago = tools.timestamp_to_date(
            tools.get_current_timestamp() - 60 * 60 * 2)
        sql = '''
            select t.id, t.domain, t.name, to_char(t.last_article_release_time, 'yyyy-mm-dd hh24:mi:ss'), t.biz
            from TAB_IOPM_SITE t
            where t.biz is not null and mointor_status = 701 and t.spider_status = 603
            and (t.last_article_release_time is null or t.last_article_release_time <= to_date('{}', 'yyyy-mm-dd hh24:mi:ss'))
            '''.format(two_hours_ago)
        accounts = self._oracledb.find(sql)

        # Nothing finished and no pending redis tasks: the unfinished
        # (non-603) accounts are probably lost tasks — re-issue them.
        if not accounts and not self._redisdb.sget_count('wechat:account'):
            sql = '''
                select t.id, t.domain, t.name, to_char(t.last_article_release_time, 'yyyy-mm-dd hh24:mi:ss'), t.biz
                from TAB_IOPM_SITE t
                where t.biz is not null and mointor_status = 701 and t.spider_status != 603
                '''
            accounts = self._oracledb.find(sql)

        return accounts

    def check_new_article(self, account):
        """Check one account; if a newer article exists, mark the site for
        crawling (spider_status 601) and push the task into redis."""
        oracle_id, account_id, account_name, last_release_time, biz = account
        release_time = self._wechat_sogo.get_article_release_time(
            account_id=account_id, account=account_name)
        print(release_time)
        if not release_time:
            return

        last_release_time = last_release_time or ''
        # Only articles published today and strictly newer than the last
        # known one count as new.
        if release_time < tools.get_current_date('%Y-%m-%d') or release_time <= last_release_time:
            return

        print('{} 有新文章发布,等待抓取。 发布时间:{}'.format(account_name, release_time))
        sql = '''
            update TAB_IOPM_SITE t set t.spider_status = 601,
            t.last_article_release_time = to_date('{}', 'yyyy-mm-dd hh24:mi:ss')
            where id = {}
            '''.format(release_time, oracle_id)
        # Multi-threaded caller: every thread holds its own connection.
        oracledb = OracleDB()
        oracledb.update(sql)
        oracledb.close()

        # Enqueue into redis as a task for the wechat crawler.
        data = (oracle_id, account_id, account_name, last_release_time, biz)
        self._redisdb.sadd('wechat:account', data)
class TaskManager():
    """Feeds news-site crawl tasks from Oracle into the redis url pool and
    keeps per-depth url dupe-filter statistics."""

    # Depth-specific dupefilter tables are named <base>0 .. <base>4.
    TOTAL_DEPTH = 5

    def __init__(self):
        self._oracledb = OracleDB()
        self._redisdb = RedisDB()
        self._news_url_table = 'news:news_urls'
        self._news_urls_dupefilter = 'news:news_urls_dupefilter'

    def get_task_count(self):
        '''
        @summary: number of urls still waiting in redis
        ---------
        ---------
        @result: int
        '''
        return self._redisdb.zget_count(self._news_url_table)

    def get_ever_depth_count(self, total_depth=5):
        '''
        @summary: url counts per crawl depth plus the total
        ---------
        @param total_depth: number of depths to report (client-facing depths
            are 1-based; the redis keys are 0-based)
        ---------
        @result: dict of per-depth counts and the total
        '''
        depth_count_info = {}
        total_count = 0
        for depth in range(total_depth):
            key = '第%s层url数' % (depth + 1)
            depth_count_info[key] = self._redisdb.sget_count(
                self._news_urls_dupefilter + str(depth))
            total_count += depth_count_info[key]
        depth_count_info['总url数'] = total_count
        return depth_count_info

    def get_task_from_oracle(self):
        """Page through TAB_IOPM_SITE and build the url task dicts."""
        tasks = []
        offset = 0
        while True:
            task_sql = '''
                select * from (select t.id, t.name, t.position, t.url, t.depth, rownum r
                from TAB_IOPM_SITE t where classify = 1 and t.mointor_status = 701 and (t.position != 35 or t.position is null) and rownum < {page_size})
                where r >= {offset}
                '''.format(page_size=offset + ONE_PAGE_SIZE, offset=offset)
            results = self._oracledb.find(task_sql)
            offset += ONE_PAGE_SIZE
            if not results:
                break

            for task in results:
                website_id = task[0]
                website_name = task[1]
                website_position = task[2]
                website_url = task[3]
                website_domain = tools.get_domain(website_url)
                spider_depth = task[4]
                remark = {
                    'website_name': website_name,
                    'website_position': website_position,
                    'website_url': website_url,
                    'website_domain': website_domain,
                    'spider_depth': spider_depth
                }
                # NOTE(review): site_id is hard-coded to 1 while website_id
                # is fetched but unused — confirm this is intended.
                url_dict = {
                    'site_id': 1,
                    'url': website_url,
                    'depth': 0,
                    'remark': remark,
                    'retry_times': 0
                }
                tasks.append(url_dict)
        return tasks

    def add_task_to_redis(self, tasks):
        """Add unseen task urls to the redis pool (dupe-checked via sha1)."""
        for task in tasks:
            url = task.get('url')
            if not url:
                continue
            url_id = tools.get_sha1(url)
            if self._redisdb.sadd(self._news_urls_dupefilter, url_id):
                self._redisdb.zadd(self._news_url_table, task, prioritys=0)
                # Depth-0 url-count bookkeeping table
                # (was: a hard-coded copy of the base key name).
                self._redisdb.sadd(self._news_urls_dupefilter + '0', url_id)

    def clear_task(self):
        """Drop the url fingerprint table and every per-depth stats table."""
        self._redisdb.sdelete(self._news_urls_dupefilter)
        # was: five copy-pasted sdelete calls for depths 0-4
        for depth in range(TaskManager.TOTAL_DEPTH):
            self._redisdb.sdelete(self._news_urls_dupefilter + str(depth))
class WechatSogou():
    # Looks up wechat official-account metadata (name, account id, __biz)
    # by scraping sogou's wechat search pages.

    def __init__(self):
        self._db = OracleDB()

    def deal_null_biz(self):
        # Backfill the biz column for wechat sites (classify = 2) lacking it.
        sql = 'select id, name, domain from TAB_IOPM_SITE t where classify = 2 and t.biz is null'
        accounts_info = self._db.find(sql)
        for account_info in accounts_info:
            print(account_info)
            _id = account_info[0]
            account = account_info[1]
            account_id = account_info[2]
            account_info = self.get_account_info(account_id, account)
            log.debug(tools.dumps_json(account_info))
            if account_info.get('__biz'):
                # Prefer existing DB values; fill gaps from the scrape result.
                account = account or account_info.get('account')
                account_id = account_id or account_info.get('account_id')
                __biz = account_info.get('__biz') or ''
                # NOTE(review): values are interpolated straight into SQL; a
                # name containing a quote would break this statement.
                sql = "update TAB_IOPM_SITE set name = '%s', domain = '%s', biz = '%s' where id = %s"%(account, account_id, __biz, _id)
                log.debug(sql)
                self._db.update(sql)
            elif not account_info.get('check_info'):
                # No captcha and no biz: account genuinely not found.
                log.debug('查无此公众号 :%s'% account)
            # Throttle between lookups to avoid sogou's anti-bot captcha.
            tools.delay_time(60)

    def get_account_info(self, account_id = '', account = ''):
        # Scrape sogou wechat search for one account.
        # Returns {'account', 'account_id', '__biz'} on success, or
        # {'check_info': <captcha prompt>} when sogou demands a captcha.
        keyword = account_id or account  # the account id takes priority
        keyword = keyword.lower()
        log.debug('search keywords ' + keyword)

        headers = {
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "Accept-Encoding": "gzip, deflate",
            "Cookie": "IPLOC=CN1100; ld=4yllllllll2zj$kYlllllVo3$xklllllWT89eyllll9lllllRklll5@@@@@@@@@@; SUV=00E3555B7B7CC4C55A0AA8195254D871; CXID=150E3ABE3C35F9E55217835F7720E719; ABTEST=8|1510801558|v1; LSTMV=418%2C28; LCLKINT=2070; ad=8kllllllll2zRlPflllllVoSynYlllllWT89eyllllwlllll9Cxlw@@@@@@@@@@@; SUID=C5C47C7B1508990A000000005A0AA818; weixinIndexVisited=1; JSESSIONID=aaa-1KvS1lhung8pB9v8v; sct=20; PHPSESSID=k3c9psast34njs32vjm3pas3l1; SUIR=E8E851562D28732A6B711C802DECBC6F; seccodeErrorCount=1|Tue, 28 Nov 2017 11:11:05 GMT; SNUID=A1A0181864613C6A610582E26446EC9A; successCount=1|Tue, 28 Nov 2017 11:11:22 GMT",
            "Host": "weixin.sogou.com"
        }
        proxies = ip_proxies.get_proxies()
        headers["User-Agent"] = ip_proxies.get_user_agent()
        url = 'http://weixin.sogou.com/weixin?type=1&s_from=input&query=%s&ie=utf8&_sug_=n&_sug_type_='%(keyword)
        html, request = tools.get_html_by_requests(url, headers = headers, proxies = proxies)

        # Account info blocks are delimited by <!-- a --> ... <!-- z -->.
        regex = '<!-- a -->(.*?)<!-- z -->'
        account_blocks = tools.get_info(html, regex)

        # Captcha page detection.
        regex = '<input type=text name="c" value="" placeholder="(.*?)" id="seccodeInput">'
        check_info = tools.get_info(html, regex, fetch_one = True)
        if check_info:
            log.debug('''取公众号列表 : %s
                url : %s
                '''%(check_info, url))

        account_info = {'check_info' : check_info}
        for account_block in account_blocks:
            regex = '<a.*?account_name.*?>(.*?)</a>'
            account = tools.get_info(account_block, regex, fetch_one = True)
            account = tools.del_html_tag(account)
            regex = '<label name="em_weixinhao">(.*?)</label>'
            account_id = tools.get_info(account_block, regex, fetch_one = True)
            regex = '<a.*?account_name.*?href="(.*?)">'
            account_url = tools.get_info(account_block, regex, fetch_one = True)
            # NOTE(review): this replace is a no-op ('&' -> '&'); it likely
            # meant to decode the '&amp;' HTML entity — confirm against the
            # original source.
            account_url = account_url.replace('&',"&")

            __biz = ''
            if account.lower() == keyword or account_id.lower() == keyword:
                # Exact match: fetch the account page to extract __biz.
                headers = {
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                    "Accept-Language": "zh-CN,zh;q=0.8",
                    "Host": "mp.weixin.qq.com",
                    "Connection": "keep-alive",
                    "Referer": "http://weixin.sogou.com/weixin?type=1&s_from=input&query=%E6%B3%B8%E5%B7%9E%E7%94%B5%E8%A7%86%E5%8F%B0&ie=utf8&_sug_=n&_sug_type_=",
                    "Cookie": "RK=XbmCLga7Pm; pgv_pvi=9492080640; noticeLoginFlag=1; ua_id=D8NYmIGpieSNub9rAAAAAGNz-Z1l4qe4x5WdelXsnmk=; xid=f3e1fb8a5fe8452b1d60a4059706017a; openid2ticket_opcqcjrNnRf62olc2Aj4PIU2hq9E=iNiYDe6xyIQ59zJxdOH0fmku4sXhFTq299CHyxYNJH8=; mm_lang=zh_CN; uin=o0564773807; skey=@Q46eRUFUE; pt2gguin=o0564773807; ptisp=cnc; ptcz=8deaf5ec9f0b3c27516ab6b735a6f3af99bc3517b922f52917b0ed5c6d82002f; o_cookie=564773807; pgv_info=ssid=s5664129956; pgv_pvid=8949522462; pac_uid=1_564773807; sig=h017174242e513ba3ec2450e63ac7a82981b57f85995f81aa47747b23e28ab077954627089b9d7fc947; pgv_si=s7924323328",
                    "Accept-Encoding": "gzip, deflate, br",
                    "Cache-Control": "max-age=0",
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
                    "Upgrade-Insecure-Requests": "1"
                }
                proxies = ip_proxies.get_proxies()
                headers["User-Agent"] = ip_proxies.get_user_agent()
                html, request = tools.get_html_by_requests(account_url, proxies = proxies)
                print(html)
                # __biz appears in the account page's inline javascript.
                regex = 'var biz = "(.*?)"'
                __biz = tools.get_info(html, regex, fetch_one = True)

            log.debug('''
                公众号名称          %s
                公众号账号          %s
                账号url             %s
                __biz               %s
                '''%(account, account_id, account_url, __biz))

            account_info = {
                'account' : account,
                'account_id' : account_id,
                '__biz' : __biz,
            }
        # NOTE(review): only the last scraped block survives; if no blocks
        # and no captcha were found, account_info is {'check_info': None}.
        return account_info
def add_anchor_info(table, site_id, title='', name='', image_url='', room_id='', room_url='', video_path='', watched_count='', fans_count='', sex='', age='', address='', live_view=1, watched_count_url=''):
    '''
    @summary: store one live-stream anchor record, tagging it against the
        violation knowledge base first
    ---------
    @param table: destination table
    @param site_id: site id
    @param name: anchor name
    @param image_url: image url
    @param room_id: room number
    @param room_url: room page url
    @param video_path: room video stream url
    @param watched_count: viewer count
    @param fans_count: fan count
    @param sex: gender
    @param age: age
    @param address: anchor location (city)
    @param live_view: live status (0 offline, 1 live)
    @param watched_count_url: realtime viewer-count url
    ---------
    @result:
    '''
    # Match the anchor name against the active violation knowledge base.
    task_id = 0
    violate_content = ''
    from db.oracledb import OracleDB
    oracle_db = OracleDB()
    sql = 'select t.name, t.keyword, t.task_id from tab_nbsp_violate_knowledge t where t.monitor_start_time <= sysdate and sysdate <= t.monitor_end_time'
    # Rows look like [('色情低俗', '性感,枪支,格斗,脱衣,透视,胸器', 1)].
    for knowledge_name, keywords, knowledge_task_id in oracle_db.find(sql):
        for keyword in keywords.split(','):
            if keyword in name:
                task_id = knowledge_task_id
                # NOTE(review): this stores the anchor's name rather than
                # knowledge_name (the violation category, which is otherwise
                # unused) — looks suspicious; confirm the intent.
                violate_content = name

    anchor_info_dict = {
        'site_id': site_id,
        'title': title,
        'task_id': task_id,
        'violate_content': violate_content,
        'name': name,
        'image_url': image_url,
        'sex': sex,
        'age': age,
        'address': address,
        'fans_count': fans_count,
        'watched_count': watched_count,
        'room_id': room_id,
        'room_url': room_url,
        'video_path': video_path,
        'live_view': live_view,
        'record_time': tools.get_current_date(),
        'watched_count_url': watched_count_url,
        'read_status': 0,
        'sexy_image_status': '',
        'sexy_image_url': '',
        'image_pron_status': 0
    }

    # Insert; when the room already exists, fall back to an update keyed on
    # room_id, dropping the fields the image-check pipeline owns (plus the
    # '_id' the failed insert attached).
    if not db.add(table, anchor_info_dict):
        for stale_key in ('_id', 'sexy_image_status', 'sexy_image_url', 'image_pron_status'):
            anchor_info_dict.pop(stale_key)
        db.update(table, {'room_id': room_id}, anchor_info_dict)
def add_WWA_search_app_info(table, site_id, url, title='', summary='', update_info='', score='', author='', app_url='', image_url='', software_size='', tag='', platform='', download_count='', release_time='', language='', sensitive_id='', read_status=0):
    '''
    @summary: store one app search result, keeping only apps whose text
        matches a configured 703 search keyword
    ---------
    @param table: destination table
    @param site_id: site id
    @param url: source page url
    @param title: title
    @param summary: description
    @param update_info: changelog
    @param score: rating
    @param author: author
    @param app_url: app download url
    @param image_url: image urls (comma separated)
    @param software_size: size
    @param tag: version
    @param platform: platform (ios / android)
    @param download_count: download count
    @param release_time: release time
    @param language: '中文'/'英文'/other, mapped to 601/602/603
    @param sensitive_id: sensitive info ids (comma separated)
    @param read_status: read flag (0 unread, 1 read)
    ---------
    @result:
    '''
    # Drop apps whose text never mentions a monitored 703 keyword.
    from db.oracledb import OracleDB
    oracle_db = OracleDB()
    sql = 'select keyword from TAB_MVMS_SEARCH_INFO t where search_type = 703'
    results = oracle_db.find(sql)  # e.g. [('天天快报,今日头条,黑龙江',)]

    text_content = title + summary + update_info + author
    # was: nested loops with a manual flag and double break
    is_usefull = any(keyword in text_content
                     for result in results
                     for keyword in result[0].split(','))
    if not is_usefull:
        return

    # Map the language label to its dictionary code.
    if language == '中文':
        language = 601
    elif language == '英文':
        language = 602
    else:
        language = 603

    title = tools.del_html_tag(title)
    gameApp_info_dict = {
        'site_id': site_id,
        'url': url,
        'summary': tools.del_html_tag(summary, except_line_break=True),
        'title': title,
        'update_info': tools.del_html_tag(update_info, except_line_break=True),
        'score': score,
        'author': author,
        'app_url': app_url,
        'image_url': image_url,
        'software_size': software_size,
        'tag': tag,
        'platform': platform,
        'download_count': download_count,
        'release_time': release_time,
        'record_time': tools.get_current_date(),
        'language': language,
        'sensitive_id': sensitive_id,
        # was: hard-coded 0, silently ignoring the read_status parameter
        'read_status': read_status,
        'sexy_image_status': '',
        'sexy_image_url': '',
        'image_pron_status': 0
    }
    db.add(table, gameApp_info_dict)
def main(): db = OracleDB() # 查文章 sql = ''' select * from (select rownum r, id, title from tab_iopm_article_info where rownum >= 1) where r <= 100000 ''' articles = db.find(sql) # 查热点 sql = 'select id, title from tab_iopm_hot_info' hots = db.find(sql) for article in articles: max_similar = { 'similarity': 0, 'hot_id': -1, 'article_id': -1, 'hot_title': '' } # 最相似的文章 similarity表示相似度(0~1) article_id = article[1] article_text = article[2] for hot in hots: hot_id = hot[0] hot_text = hot[1] similarity = compare_text(hot_text, article_text) # print(''' # article_text %s # hot_text %s # similarity %s # '''%(article_text, hot_text, similarity)) if similarity > max_similar['similarity']: max_similar['similarity'] = similarity max_similar['hot_id'] = hot_id max_similar['article_id'] = article_id max_similar['hot_title'] = article_text if len(hot_text) > len( article_text) else hot_text if max_similar['similarity'] > SIMILARITY: sql = 'update tab_iopm_article_info set hot_id = %s where id = %s' % ( max_similar['hot_id'], max_similar['article_id']) db.update(sql) sql = "update tab_iopm_hot_info set hot = hot + 1, title = '%s' where id = %s" % ( max_similar['hot_title'], max_similar['hot_id']) db.update(sql) else: sql = 'select sequence.nextval from dual' hot_id = db.find(sql)[0][0] sql = "insert into tab_iopm_hot_info (id, title, hot) values (%s, '%s', 1)" % ( hot_id, article_text) db.add(sql) sql = 'update tab_iopm_article_info set hot_id = %s where id = %s' % ( hot_id, article_id) db.update(sql) sql = 'select id, title from tab_iopm_hot_info' hots = db.find(sql)
def main():
    # Orchestrates VA search tasks: for every task whose monitoring window
    # is active, runs one spider per keyword row until all keywords finish.
    search_task_sleep_time = int(
        tools.get_conf_value('config.conf', 'task', 'search_task_sleep_time'))
    db = OracleDB()

    # Reset task status to 501 (todo) for tasks inside their window.
    sql = 'update tab_ivms_task_info t set t.task_status = 501 where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time'
    db.update(sql)
    # Reset keyword status to 601 (todo) for the same tasks.
    sql = 'update tab_ivms_task_keyword k set k.finish_status = 601 where k.task_id in (select t.task_id from tab_ivms_task_info t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time)'
    db.update(sql)

    while True:
        # Pick one todo task (status 501).
        log.debug('查询任务...')
        sql = 'select t.task_id from TAB_IVMS_TASK_INFO t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time and t.task_status = 501'
        result = db.find(sql, fetch_one=True)
        if not result:
            break
        task_id = result[0]

        while True:
            # Wait while another keyword of this task is in progress (602).
            sql = 'select t.* from TAB_IVMS_TASK_KEYWORD t where t.task_id = %d and finish_status = 602' % task_id
            do_task = db.find(sql, fetch_one=True)
            if do_task:
                time.sleep(search_task_sleep_time)
                continue

            # Next todo keyword row (601) of this task.
            sql = 'select t.* from TAB_IVMS_TASK_KEYWORD t where t.task_id = %d and finish_status = 601' % task_id
            result = db.find(sql, fetch_one=True)
            if not result:
                break
            keyword_id = result[0]
            task_id = result[1]
            search_keyword1 = []
            search_keyword2 = result[2].split(',') if result[2] else []
            search_keyword3 = result[3].split(',') if result[3] else []

            def begin_callback():
                log.info('\n********** VA begin **********')
                # Mark the task (502) and this keyword (602) as in progress.
                sql = 'update TAB_IVMS_TASK_INFO set task_status = 502 where task_id = %d' % task_id
                db.update(sql)
                sql = 'update tab_ivms_task_keyword set finish_status = 602 where id = %d' % keyword_id
                db.update(sql)

            def end_callback():
                # Mark this keyword as done (603).
                sql = 'update tab_ivms_task_keyword set finish_status = 603 where id = %d' % keyword_id
                db.update(sql)
                # When no keyword of this task is left todo, export the
                # results and mark the whole task done (503).
                sql = 'select t.* from tab_ivms_task_keyword t where task_id = %d and finish_status = 601' % task_id
                results = db.find(sql)
                if not results:
                    key_map = {
                        'program_id': 'vint_sequence.nextval',
                        'search_type': 'int_search_type',
                        'program_name': 'str_title',
                        'program_url': 'str_url',
                        'release_date': 'date_release_time',
                        'image_url': 'str_image_url',
                        'program_content': 'str_content',
                        'task_id': 'vint_%d' % task_id,
                        'keyword': 'str_keyword',
                        'keyword_count': 'int_keyword_count',
                        'check_status': 'vint_202'
                    }
                    export = ExportData('VA_content_info',
                                        'tab_ivms_program_info', key_map,
                                        'program_url')
                    export.export_to_oracle()
                    sql = 'update TAB_IVMS_TASK_INFO set task_status = 503 where task_id = %d' % task_id
                    db.update(sql)
                log.info('\n********** VA end **********')

            # Configure and run one spider for this keyword row.
            # NOTE(review): the callbacks close over task_id / keyword_id,
            # which this loop rebinds; safe only because spider.start() runs
            # to completion within the same iteration — confirm.
            spider = Spider(tab_urls='VA_urls',
                            tab_site='VA_site_info',
                            tab_content='VA_content_info',
                            parser_count=1,
                            begin_callback=begin_callback,
                            end_callback=end_callback,
                            search_keyword1=search_keyword1,
                            search_keyword2=search_keyword2,
                            search_keyword3=search_keyword3)
            spider.add_parser(baidu_parser)
            spider.add_parser(magnet_parser)
            spider.add_parser(netdisk_parser)
            spider.add_parser(weibo_parser)
            spider.add_parser(wechat_parser)
            spider.add_parser(soubaidupan_parser)
            spider.add_parser(douban_parser)
            spider.start()
            time.sleep(search_task_sleep_time)
class WechatService():
    # Round-robin provider of wechat accounts to crawl, plus ES persistence
    # for crawled articles and account profiles.
    # NOTE: this state is class-level, so it is shared by every instance.
    _todo_accounts = collections.deque()
    _rownum = 1
    _is_done = False

    def __init__(self):
        self._db = OracleDB()
        self._es = ES()
        self.__load_todo_account()

    def __load_todo_account(self):
        # Refill the shared queue with the next SIZE accounts, paged by
        # Oracle rownum.
        if not WechatService._todo_accounts:
            sql = '''
                select * from (select rownum r, t.id, t.domain, t.biz from TAB_IOPM_SITE t where t.biz is not null and rownum < {size}) where r >= {rownum}
                '''.format(rownum=WechatService._rownum, size=WechatService._rownum + SIZE)
            results = self._db.find(sql)
            if not results:
                # End of table: mark one full round as done and wrap around.
                # NOTE(review): if the table has no qualifying rows at all,
                # this recursion never terminates — confirm acceptable.
                WechatService._is_done = True
                WechatService._rownum = 1
                self.__load_todo_account()
            else:
                WechatService._todo_accounts = collections.deque(
                    results)  # convert rows to a queue
                WechatService._rownum += SIZE

    def get_next_account(self):
        '''
        @summary: pop the next account to crawl
        ---------
        ---------
        @result: (account_id, biz, is_done) — is_done is True exactly once
            per completed round over the whole table, e.g. (biz, True)
        '''
        if not WechatService._todo_accounts:
            self.__load_todo_account()

        next_account_info = WechatService._todo_accounts.popleft()
        next_account_id = next_account_info[2]   # the domain column
        next_account_biz = next_account_info[3]  # the biz column
        next_account = next_account_id, next_account_biz, WechatService._is_done

        # Reset the round-done flag once it has been reported.
        WechatService._is_done = False
        return next_account

    def is_exist(self, table, data_id):
        # True when a document with this id already exists in ES.
        if self._es.get(table, data_id=data_id, doc_type=table):
            return True
        else:
            return False

    def add_article_info(self, article_info):
        '''
        @summary: persist one crawled article document to ES,
            keyed by its article_id
        ---------
        @param article_info: article document (dict)
        ---------
        @result:
        '''
        log.debug('''
            -----文章信息-----
            %s''' % tools.dumps_json(article_info))

        self._es.add('wechat_article', article_info,
                     article_info.get('article_id'))

    def add_account_info(self, account_info):
        # Persist one account document to ES, keyed by its __biz.
        log.debug('''
            -----公众号信息-----
            %s''' % tools.dumps_json(account_info))

        self._es.add('wechat_account', account_info, account_info.get('__biz'))