def get_clues():
    """Export clue keywords from TAB_IOPM_CLUES to clues/clues.csv and return them.

    Rows with zero_id == 7 (propagation-channel clues) are excluded.
    Returns a dict shaped like {"message": ..., "status": 1, "data": [...]}.
    """

    def _clean(keyword):
        # Normalize a raw keyword value. Mirrors the original inline
        # expression exactly: when the value ends with a comma, escape double
        # quotes to '“', strip '、' and drop the trailing comma; otherwise
        # just strip double quotes.
        if not keyword:
            return ""
        if keyword[-1] == ',':
            return keyword.replace('"', '“').replace('、', '')[:-1]
        return keyword.replace('"', '')

    db = OracleDB()
    sql = 'select t.id clues_id,to_char(t.keyword2),to_char(t.keyword3),t.name from TAB_IOPM_CLUES t where zero_id != 7'  # 7 = propagation channel
    results = db.find(sql)

    clues_json = {
        "message": "查询成功",
        "status": 1,
        "data": []  # was seeded with a dummy entry that got overwritten; start empty
    }

    # "with" guarantees the CSV handle is closed even if a row raises.
    with open('clues/clues.csv', 'w+', encoding='utf8') as file:
        file.write("线索,关键词\n")
        for result in results:
            print(result)
            data = {
                "线索id": result[0] if result[0] else "",
                "包含": _clean(result[1]),
                "不包含": _clean(result[2]),
                "线索": result[3] if result[3] else ""
            }
            print(data)
            clues_json["data"].append(data)
            file.write('"%s","%s"\n' % (data['线索'], data['包含']))

    os.system('start clues\\')  # Windows-only: open the output folder
    return clues_json
def __init__(self):
    super(VipChecked, self).__init__()
    # Singleton-style guard: only build state on the first construction,
    # so repeated instantiations reuse the already-loaded VIP site set.
    if not hasattr(self,'_vip_sites'):
        # Set of VIP site keywords loaded from the clues table.
        self._vip_sites = set()
        self._oracledb = OracleDB()
        self.load_vip_site()
class SyncES():
    """Incrementally syncs rows from an Oracle table into Elasticsearch,
    persisting the last exported id per table so runs can resume."""

    def __init__(self):
        import ast  # stdlib, used for safe state-file parsing below

        self._es = ES()
        self._db = OracleDB()
        # STO_MAX_ID_FILE holds str({table: max_id}) written by close().
        # Parse it with literal_eval instead of eval: the content is data,
        # not code, so arbitrary-code execution is avoided.
        self._max_id = tools.read_file(STO_MAX_ID_FILE)
        self._max_id = ast.literal_eval(self._max_id) if self._max_id else {}

    def get_data(self, sql):
        # to_json=True: rows come back as dicts keyed by column name.
        return self._db.find(sql, to_json=True)

    def export_to_es(self, table, data, data_id):
        self._es.add(table=table, data=data, data_id=data_id)

    def sync_data(self, table, step=20):
        '''
        @summary: the table must have id set as its primary key first
        ---------
        @param table: source Oracle table name
        @param step: batch size — rows fetched per round trip
        ---------
        @result: None; progress is tracked in self._max_id and flushed
                 to disk by close() when the table is exhausted
        '''
        max_id = self._max_id.get(table, 0)
        self._db.set_primary_key(table)
        while True:
            inner_sql = 'select * from %s where id > %d and rownum <= %d order by id' % (
                table, max_id, step)
            # Bug fix: was sync_es.get_data(...) — a module-level global —
            # which broke any instance not bound to that exact name.
            datas = self.get_data(inner_sql)
            if not datas:
                self.close()
                break
            for data in datas:
                data_id = data['ID']
                data = tools.dumps_json(data)
                print(data)
                print(data_id)
                max_id = data_id
                self.export_to_es(table, data, data_id)
            self._max_id[table] = max_id

    def close(self):
        # Persist the resume point as the repr of the {table: max_id} dict.
        tools.write_file(STO_MAX_ID_FILE, str(self._max_id))
def __init__(self, province_name=PROVINCE):
    # Names used for region matching: the province itself plus its
    # areas and towns (each loader returns rows of 1-tuples).
    self._province_airs = []
    if province_name:
        # NOTE(review): self._db is only created when a province_name is
        # given; other methods would fail without one — confirm callers.
        self._db = OracleDB()
        self._province_airs.append(province_name)
        province_id = self.load_province_id(province_name)
        if province_id:
            self._province_airs.extend(
                air[0] for air in self.load_province_air(province_id))
            self._province_airs.extend(
                town[0] for town in self.load_province_town(province_id))
    print(self._province_airs)
class ProvinceFilter():
    """Collects region names for one province (or every province when no
    name is given) and reports which of them occur in a piece of text."""

    def __init__(self, province_name=PROVINCE):
        self._province_airs = []
        self._db = OracleDB()
        if province_name:
            self._province_airs.append(province_name)
            province_id = self.load_province_id(province_name)
            if province_id:
                self._province_airs.extend(
                    row[0] for row in self.load_province_air(province_id))
        else:
            # Nationwide mode: match against every province name.
            self._province_airs.extend(
                row[0] for row in self.load_province())
        print(self._province_airs)

    def load_province_id(self, province_name):
        """Resolve a province name (LIKE match) to its id, or None."""
        sql = "select t.id from TAB_MANAGE_PROVINCE_INFO t where t.province_name like '%{province_name}%'".format(
            province_name=province_name)
        rows = self._db.find(sql)
        province_id = rows[0][0] if rows else None
        if not province_id:
            log.debug('TAB_MANAGE_PROVINCE_INFO 无 %s 省份' % province_name)
        return province_id

    def load_province(self):
        """All province names, as rows of 1-tuples."""
        return self._db.find("select province_name from TAB_MANAGE_PROVINCE_INFO")

    def load_province_air(self, province_id):
        """Area names belonging to the province, as rows of 1-tuples."""
        return self._db.find(
            "select t.area_name from TAB_MANAGE_AREA_INFO t where t.province_id = %s" % province_id)

    def load_province_town(self, province_id):
        """Town names belonging to the province, as rows of 1-tuples."""
        return self._db.find(
            "select t.town_name from TAB_MANAGE_TOWN_INFO t where t.province_id = %s" % province_id)

    def find_contain_air(self, text):
        """Return the de-duplicated region names that occur in *text*."""
        return list({air for air in self._province_airs if air in text})
def main():
    """Build and run the WWA weibo-user spider for active 702 search tasks."""
    db = OracleDB()
    mongodb = MongoDB()
    # Weibo-user search tasks (type 702) currently inside their monitor window.
    sql = 'select t.KEYWORD, t.monitor_type from TAB_MVMS_SEARCH_INFO t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time and search_type = 702'
    result_list = db.find(sql, fetch_one=False)
    if not result_list:
        log.debug('无任务 结束')
        return

    parser_params = {'result_list': result_list}
    # parser_params = []
    # for i in result:
    #     parser_params.extend(str(i[0]).split(','))

    def begin_callback():
        # Clear the url queue before a fresh run.
        log.info('\n********** WWA_weibo_user begin **********')
        mongodb.delete('WWA_weibo_user_urls')

    def end_callback():
        # Export the crawled user info from mongo into Oracle.
        key_map = {
            'id': 'int__id',
            'name': 'str_name',
            'sex': 'int_sex',
            'summary': 'str_summary',
            'fans_count': 'int_fans_count',
            'blog_verified': 'str_blog_verified',
            'is_verified': 'int_is_verified',
            'account_url': 'str_url',
            'follow_count': 'int_follow_count',
            'image_url': 'str_image_url',
            'monitor_status': 'vint_401',
            'SEARCH_TYPE' : 'vint_702',
            'region' : 'str_area'
        }
        export = ExportData('WWA_weibo_user_info', 'tab_mvms_weibo_info',
                            key_map, 'account_url')
        export.export_to_oracle()
        log.info('\n********** WWA_weibo_user end **********')

    # Configure the spider.
    spider = Spider(tab_urls = 'WWA_weibo_user_urls',
                    tab_site = 'WWA_site_info',
                    tab_content = 'WWA_weibo_user_info',
                    parser_count = 1,
                    begin_callback = begin_callback,
                    end_callback = end_callback,
                    parser_params = parser_params)
    # Register the parser.
    spider.add_parser(weibo_user_parser)
    spider.start()
class VipChecked(Singleton):
    """Singleton that loads VIP site keywords from the clues table and
    checks whether a given text matches any of them."""

    def __init__(self):
        super(VipChecked, self).__init__()
        # Singleton returns the same instance on every construction, so
        # only set up state the first time through.
        if not hasattr(self, '_vip_sites'):
            self._vip_sites = set()
            self._oracledb = OracleDB()
            self.load_vip_site()

    def load_vip_site(self):
        # zero_id = 7 rows hold comma-separated VIP site keywords.
        sql = 'select to_char(t.keyword2) from TAB_IOPM_CLUES t where zero_id = 7'
        for row in self._oracledb.find(sql):
            for name in row[0].split(','):
                if name:
                    self._vip_sites.add(name)

    def is_vip(self, content):
        """Return 1 when *content* and some VIP site name contain each
        other (substring match in either direction), else 0."""
        if not content:
            return 0
        for site in self._vip_sites:
            if site in content or content in site:
                return 1
        return 0
class Keywords():
    """Flattens the clue keyword table into a plain list of search keywords."""

    def __init__(self):
        self._oracledb = OracleDB()
        self._clues = self.get_clues()

    def get_clues(self):
        # zero_id = 7 marks propagation-channel clues, which are skipped.
        sql = 'select t.id clues_id,to_char(t.keyword2),to_char(t.keyword3),t.zero_id, FIRST_ID, second_id from TAB_IOPM_CLUES t where zero_id != 7'
        return self._oracledb.find(sql)

    def get_keywords(self):
        """Return every formatted keyword2 entry; '&'-joined units become
        space-separated inside a single keyword string."""
        keywords = []
        for clue in self._clues:
            # Only keyword2 (clue[1]) feeds the keyword list here.
            for key in format_keywords(clue[1]):
                keywords.append(key.replace('&', ' '))
        return keywords
def export_to_oracle(self,
                     source_table='',
                     aim_table='',
                     key_map='',
                     unique_key=None,
                     unique_key_mapping_source_key=None,
                     update_read_status=True,
                     condition=None,
                     datas=[],
                     callback='',
                     sync_to_es=False):
    """Configure this exporter for an Oracle target table and run the export.

    @param source_table: mongo source collection ('' to export `datas` directly)
    @param aim_table: Oracle target table; no-op when empty
    @param key_map: target-column -> typed-source-key mapping
    @param unique_key: column used to deduplicate in the target
    @param unique_key_mapping_source_key: {target_col: typed_source_key} for updates
    @param update_read_status: mark source rows read after export
                               (forced False when `datas` is supplied)
    @param condition: mongo filter for source rows; defaults to {'read_status': 0}
    @param datas: explicit rows to export instead of reading the source table
    @param callback: hook invoked by the export machinery
    @param sync_to_es: also mirror exported rows into Elasticsearch
    @result: whatever self.__export() returns (None when aim_table is empty)
    """
    # Fix: `condition` previously used a mutable default ({'read_status': 0})
    # shared across calls — mutating it in one export leaked into the next.
    if condition is None:
        condition = {'read_status': 0}
    if aim_table:
        if self._aim_table != aim_table:
            # Target changed: force unique-key setup and rebuild connections.
            self._is_set_unique_key = False
            self._es = ES() if sync_to_es else ''
            self._mongodb = MongoDB() if source_table else ''
        self._source_table = source_table
        self._aim_table = aim_table
        self._key_map = key_map
        self._unique_key = unique_key
        self._export_count = 0
        self._update_count = 0
        self._unique_key_mapping_source_key = unique_key_mapping_source_key
        self._update_read_status = update_read_status if not datas else False
        self._condition = condition
        self._datas = datas
        self._callback = callback
        self._sync_to_es = sync_to_es
        # NOTE(review): this unconditionally overwrites the ES instance
        # created above — confirm whether _es should survive here.
        self._es = None
        self._aim_db = OracleDB()
        self._is_oracle = True
        return self.__export()
def main():
    """Dump every clue row (keyword1-3 + zero_id) from TAB_IOPM_CLUES to
    ./clues.txt as JSON."""

    def _clean(keyword):
        # Normalize a raw keyword value. Mirrors the (previously triplicated)
        # inline expression exactly: a trailing comma triggers quote escaping
        # to '“', '、' stripping and removal of the final comma; otherwise
        # double quotes are simply removed.
        if not keyword:
            return ""
        if keyword[-1] == ',':
            return keyword.replace('"', '“').replace('、', '')[:-1]
        return keyword.replace('"', '')

    db = OracleDB()
    sql = 'select t.id clues_id,to_char(t.keyword1),to_char(t.keyword2),to_char(t.keyword3),t.zero_id from TAB_IOPM_CLUES t'
    results = db.find(sql)

    clues_json = {
        "message": "查询成功",
        "status": 1,
        "data": []  # was seeded with a dummy entry that got overwritten; start empty
    }
    for result in results:
        data = {
            "clues_id": result[0] if result[0] else "",
            "keyword1": _clean(result[1]),
            "keyword2": _clean(result[2]),
            "keyword3": _clean(result[3]),
            "zero_id": result[4] if result[4] else ""
        }
        clues_json["data"].append(data)

    clues_json = tools.dumps_json(clues_json)
    print(clues_json)
    tools.write_file('./clues.txt', clues_json)
def main():
    """Build and run the app-store search spider for active 703 tasks."""
    db = MongoDB()
    oracle = OracleDB()

    def begin_callback():
        # Clear the url queue before a fresh run.
        #db.update('WWA_app_urls',{'depth':0}, {'status':0})
        db.delete('WWA_search_app_urls')
        log.info('\n********** wwa begin **********')

    def end_callback():
        log.info('\n********** wwa end **********')
        export_data.main()

    keywords = []
    result_list = oracle.find(
        'select keyword from TAB_MVMS_SEARCH_INFO where MONITOR_START_TIME <= sysdate AND MONITOR_END_TIME >= sysdate and search_type=703'
    )
    if not result_list:
        log.debug('无任务 结束')
        return
    # NOTE(review): keywords is re-initialised here, making the assignment
    # above redundant.
    keywords = []
    for result in result_list:
        # Each row holds a comma-separated keyword list.
        keywords.extend(result[0].split(','))
    parser_params = {'keywords': keywords}

    # Configure the spider.
    spider = Spider(tab_urls='WWA_search_app_urls',
                    tab_site='WWA_search_app_site_info',
                    tab_content='WWA_search_app_content_info',
                    content_unique_key='title',
                    begin_callback=begin_callback,
                    end_callback=end_callback,
                    parser_params=parser_params)
    # Register one parser per app store.
    spider.add_parser(yingyongbao_parser)
    spider.add_parser(android_market_parser)
    spider.add_parser(baidu_mobile_assistant_parser)
    spider.add_parser(mobile360_assistant_parser)
    spider.start()
def main():
    """Export anchor info and violation records from mongo into Oracle."""
    db = OracleDB()
    # Reset live_view (0 = not live, per the collector) before the export.
    sql = 'update tab_nbsp_anchor_info t set t.live_view = 0'
    db.update(sql)

    # Export data
    # Anchor info
    key_map = {
        'id': 'vint_sequence.nextval',
        'room_id': 'int_room_id',
        'name': 'str_name',
        'sex': 'int_sex',
        'age': 'int_age',
        'address': 'str_address',
        'image_url': 'str_image_url',
        'fans_count': 'int_fans_count',
        'watched_count': 'int_watched_count',
        'room_url': 'str_room_url',
        'video_path': 'str_video_path',
        'site_id': 'int_site_id',
        'record_time': 'date_record_time',
        'live_view': 'int_live_view',
        'monitor_status': 'vint_401',
        'json_data_url': 'str_watched_count_url'
    }
    export_data = ExportData(source_table = 'LiveApp_anchor_info',
                             aim_table = 'tab_nbsp_anchor_info',
                             key_map = key_map,
                             unique_key = 'room_id',
                             update_read_status = False,
                             unique_key_mapping_source_key = {'room_id':'int_room_id'})
    export_data.export_to_oracle()

    # Violation info: only rows with a non-empty violate_content, unread.
    key_map = {
        'id' : 'vint_sequence.nextval',
        'TASK_ID':'int_task_id',
        'ANCHOR_ID':'int_room_id',
        'FOUND_TIME':'date_record_time',
        'CONTENT':'str_violate_content',
        'VIOLATE_IMAGE_STATUS':'str_sexy_image_status',
        'VIOLATE_IMAGE_URL':'str_sexy_image_url'
    }
    export_data = ExportData(source_table = 'LiveApp_anchor_info',
                             aim_table = 'tab_nbsp_violate_anchor_info',
                             key_map = key_map,
                             unique_key = 'ANCHOR_ID',
                             update_read_status = True,
                             condition = {'violate_content' : {'$ne':''}, 'read_status':0},
                             unique_key_mapping_source_key = {'ANCHOR_ID':'int_room_id'})
    export_data.export_to_oracle()
class CompareKeywords():
    # Matches clue keywords against text and reports which clues hit.

    def __init__(self):
        self._oracledb = OracleDB()
        # Clue rows cached once at construction.
        self._clues = self.get_clues()

    def get_clues(self):
        # zero_id = 7 rows are propagation channels and are excluded.
        sql = 'select t.id clues_id,to_char(t.keyword2),to_char(t.keyword3),t.zero_id, FIRST_ID, second_id from TAB_IOPM_CLUES t where zero_id != 7'
        clues = self._oracledb.find(sql)
        return clues

    def get_contained_keys(self, text):
        '''
        @summary: find which clue keywords are contained in the text
        ---------
        @param text: text to search in
        ---------
        @result: (keywords, clues_ids, zero_ids, first_ids, second_ids,
                  keyword_clues) — the first five are comma-joined,
                  de-duplicated strings; the last maps keyword -> clue id
        '''
        keywords = []
        clues_ids = []
        zero_ids = []
        first_ids = []
        second_ids = []
        keyword_clues = {}
        for clue in self._clues:
            clue_id = clue[0]
            key2 = clue[1]
            key3 = clue[2]  # currently unused
            zero_id = clue[3]
            first_id = clue[4]
            second_id = clue[5]
            keys = format_keywords(key2)  # normalise the raw clue keywords
            for key in keys:  # e.g. ['新闻节目', '总理&主席', 'the xi factor']
                # '&'-joined units must ALL be present for the key to match.
                unit_keys = key.split('&')  # e.g. ['总理', '主席']
                for unit_key in unit_keys:
                    if unit_key not in text:
                        break
                else:
                    # for/else: runs only when no unit was missing above.
                    keywords.extend(unit_keys)
                    clues_ids.append(str(clue_id))
                    zero_ids.append(str(zero_id))
                    first_ids.append(str(first_id))
                    second_ids.append(str(second_id))
                    for unit_key in unit_keys:
                        keyword_clues[unit_key] = clue_id
        return ','.join(set(keywords)), ','.join(set(clues_ids)), ','.join(
            set(zero_ids)), ','.join(set(first_ids)), ','.join(
                set(second_ids)), keyword_clues
def check_new_article(self, account):
    # account: (oracle_id, account_id, account_name,
    #           last_article_release_time, biz)
    oralce_id, account_id, account_name, last_article_release_time, biz = account
    article_release_time = self._wechat_sogo.get_article_release_time(
        account_id=account_id, account=account_name)
    print(article_release_time)
    if article_release_time:
        last_article_release_time = last_article_release_time or ''
        # "New" means: released today (lexicographic compare against the
        # '%Y-%m-%d' prefix works for ISO-style date strings) AND strictly
        # newer than the last release time we recorded.
        if article_release_time >= tools.get_current_date(
                '%Y-%m-%d'
        ) and article_release_time > last_article_release_time:
            print('{} 有新文章发布,等待抓取。 发布时间:{}'.format(account_name,
                                                   article_release_time))
            sql = '''
                update TAB_IOPM_SITE t set t.spider_status = 601,
                    t.last_article_release_time = to_date('{}', 'yyyy-mm-dd hh24:mi:ss')
                where id = {}
                '''.format(article_release_time, oralce_id)
            # Runs on worker threads: each thread holds its own connection.
            oracledb = OracleDB()
            oracledb.update(sql)
            oracledb.close()
            # Push into redis as a task for the wechat article spider.
            data = (oralce_id, account_id, account_name,
                    last_article_release_time, biz)
            self._redisdb.sadd('wechat:account', data)
class TaskService():
    """Pages website rows out of TAB_IOPM_SITE through a shared ring buffer.

    All state is class-level, so every instance shares one buffer, one DB
    connection and one paging offset; access is serialized with an RLock.
    """
    _task_ring_buff = RingBuff(TASK_BUFFER_SIZE)
    _offset = 1
    _lock = threading.RLock()
    _db = OracleDB()

    def __init__(self):
        pass

    def load_task(self):
        # Fetch the next page [offset, offset + TASK_BUFFER_SIZE) of sites.
        task_sql = '''
            select * from
            (select t.id, t.name, t.position, t.url, t.domain, rownum r from TAB_IOPM_SITE t
             where classify = 1 and t.mointor_status = 701 and t.position != 35 and rownum < {page_size})
            where r >= {offset}
        '''.format(page_size=TaskService._offset + TASK_BUFFER_SIZE,
                   offset=TaskService._offset)
        TaskService._offset += TASK_BUFFER_SIZE
        print(task_sql)

        tasks = TaskService._db.find(task_sql)
        if not tasks:
            # Paged past the end of the table: wrap around to the start.
            # NOTE(review): if the table is empty this still recurses
            # without bound, as it always has — confirm desired behavior.
            TaskService._offset = 1
            self.load_task()
            # Bug fix: return here so the empty `tasks` of THIS call is not
            # pushed into the ring buffer on top of what the recursive call
            # just loaded.
            return
        TaskService._task_ring_buff.put_data(tasks)

    def get_task(self, count=TASK_COUNT):
        # `with` guarantees the lock is released even if load_task raises
        # (the bare acquire/release pair could deadlock other callers).
        with TaskService._lock:
            tasks = TaskService._task_ring_buff.get_data(count)
            if not tasks:
                self.load_task()
                tasks = TaskService._task_ring_buff.get_data(count)
            return tasks

    def update_task_status(self, tasks, status):
        # Stamp spider_time with "now" and set the given spider_status.
        with TaskService._lock:
            for task in tasks:
                website_id = task[0]
                sql = "update tab_iopm_site t set t.spider_time = to_date('%s', 'yyyy-mm-dd :hh24:mi:ss'), t.spider_status = %s where id = %s" % (
                    tools.get_current_date(), status, website_id)
                TaskService._db.update(sql)
def main():
    """Build and run the wechat-account search spider for active 701 tasks."""
    oracledb = OracleDB()
    # Account search tasks (type 701) currently inside their monitor window.
    sql = 'select t.KEYWORD, t.monitor_type from TAB_MVMS_SEARCH_INFO t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time and search_type = 701'
    result_list = oracledb.find(sql)  #[(keys, monitor_type),()]
    if not result_list:
        log.debug('无任务 结束')
        return

    # print(result_list)
    # keywords = []
    # for result in result_list:
    #     keywords.extend(result[0].split(','))

    def begin_callback():
        # Clear the url queue before a fresh run.
        log.info('\n********** WWA_wechat_account begin **********')
        db = MongoDB()
        db.delete('WWA_wechat_account_url', {})

    def end_callback():
        log.info('\n********** WWA_wechat_account end **********')
        export_data.account_main()

    parser_params = {'result_list': result_list}
    # Configure the spider.
    spider = Spider(tab_urls='WWA_wechat_account_url',
                    tab_site='WWA_wechat_site_info',
                    tab_content='WWA_wechat_official_accounts',
                    content_unique_key='account_id',
                    parser_count=1,
                    begin_callback=begin_callback,
                    end_callback=end_callback,
                    parser_params=parser_params)
    # Register the parser.
    spider.add_parser(wechat_account_parser)
    spider.start()
def main():
    """Build and run the wechat-article spider for monitored accounts."""
    oracledb = OracleDB()
    # Accounts currently under monitoring (monitor_status 402).
    sql = 'select t.account_id, t.monitor_type from TAB_MVMS_WECHAT_INFO t where monitor_status = 402'
    result_list = oracledb.find(sql)
    if not result_list:
        log.debug('无任务 结束')
        return

    # keywords = []
    # for result in result_list:
    #     keywords.append(result[0])

    def begin_callback():
        # Clear the url queue before a fresh run.
        log.info('\n********** WWA_wechat_article begin **********')
        db = MongoDB()
        db.delete('WWA_wechat_article_url', {})

    def end_callback():
        log.info('\n********** WWA_wechat_article end **********')
        export_data.article_main()

    parser_params = result_list
    # Configure the spider.
    spider = Spider(tab_urls='WWA_wechat_article_url',
                    tab_site='WWA_wechat_site_info',
                    tab_content='WWA_wechat_article',
                    content_unique_key='title',
                    parser_count=1,
                    begin_callback=begin_callback,
                    end_callback=end_callback,
                    parser_params=parser_params)
    # Register the parser.
    spider.add_parser(wechat_article_parser)
    spider.start()
def main():
    """Build and run the program-news spider over all TAB_MMS_PROGRAM rows."""
    db = OracleDB()
    # Program list joined with its channel name and type-dictionary label.
    sql = '''
        select t.program_id, c.chan_name, program_name, d.name, t.image_url, t.official_blog
        from TAB_MMS_PROGRAM t
        left join tab_mam_chan c on c.chan_id = t.chan_id
        left join tab_mms_dictionary d on t.type = d.id and d.type = 2
        '''
    # where t.program_id = 226
    program_info = db.find(sql)

    def begin_callback():
        log.info('\n********** news begin **********')
        # Update task status to "doing"

    def end_callback():
        log.info('\n********** news end **********')

    # Configure the spider.
    spider = Spider(tab_urls='mms_urls',
                    begin_callback=begin_callback,
                    end_callback=end_callback,
                    delete_tab_urls=True,
                    parser_params=program_info)
    # Register parser(s); only iqiyi search is active.
    # spider.add_parser(iqiyi_hot_parser)
    spider.add_parser(iqiyi_search_parser)
    # spider.add_parser(weibo_user_parser)
    # spider.add_parser(weibo_article_parser)
    spider.start()
def main():
    """Reset WEIGHT to 0 for a fixed set of article ids in the ES index."""
    oracledb = OracleDB()
    esdb = ES()
    # Originally the ids came from TAB_IOPM_USER_ACTION (action_type=301,
    # msg_type=502, last 24h):
    # sql = 'select MSG_ID from TAB_IOPM_USER_ACTION t where action_type=301 and msg_type = 502 and record_time>=sysdate-1'
    # article_ids = oracledb.find(sql)
    article_ids = [8888515, 8888293, 8891299]
    for article_id in article_ids:
        # article_id = article_id[0]
        body = {"WEIGHT": 0}
        print(article_id)
        esdb.update_by_id('tab_iopm_article_info', article_id, body)
class EventFilter(threading.Thread):
    """Background thread that keeps an event-keyword knowledge base fresh
    and tags text with the event types whose keywords it contains."""

    def __init__(self):
        super(EventFilter, self).__init__()
        self._db = OracleDB()
        self._event_knowledges = self.load_event_knowledges()

    def run(self):
        # Refresh the knowledge base once an hour, forever.
        while True:
            tools.delay_time(60 * 60)
            print('更新事件知识库...')
            self._event_knowledges = self.load_event_knowledges()
            print('更新事件知识库完毕')

    def load_event_knowledges(self):
        '''
        @summary: load (keyword, type) rows. Type codes:
            801 current politics, 802 social livelihood, 803 education
            reform, 804 healthcare, 805 sci-tech, 806 ideology (unused),
            807 policies & regulations, 808 economy (unused), 809 ecology,
            810 sports (unused), 811 emergencies (unused)
        ---------
        ---------
        @result: rows of (keyword, type)
        '''
        sql = 'select t.keyword, t.type from TAB_IOPM_EVENT_KNOWLEDEGE t'
        return self._db.find(sql)

    def find_contain_event(self, text):
        """Return the distinct event-type codes (as strings) whose keyword
        occurs anywhere in *text*."""
        return list({str(event_type)
                     for keyword, event_type in self._event_knowledges
                     if keyword in text})
def __init__(self):
    # One DB connection per instance; clue rows are cached once here.
    self._oracledb = OracleDB()
    self._clues = self.get_clues()
# -*- coding: utf-8 -*- ''' Created on 2017-07-26 19:04 --------- @summary: --------- @author: Boris ''' import sys sys.path.append('../') from db.oracledb import OracleDB import utils.tools as tools oracledb = OracleDB() def main(): url = 'http://192.168.60.38:8001/hotspot_al/interface/getHotAnalysis_self' json = tools.get_json_by_requests(url) # print(json) hot_list = [] datas = json['data'] for data in datas: clus_id = list(data.keys())[0] sql = 'select t.name from TAB_IOPM_CLUES t where id = ' + clus_id name = oracledb.find(sql)[0][0] hot_infos = data[clus_id]['data']
def __init__(self):
    self._es = ES()
    self._db = OracleDB()
    # STO_MAX_ID_FILE stores str({table: max_id}) written back elsewhere.
    self._max_id = tools.read_file(STO_MAX_ID_FILE)
    # NOTE(review): eval on file content — acceptable only because the file
    # is self-written state; ast.literal_eval would be safer. Confirm the
    # file cannot be tampered with.
    self._max_id = self._max_id and eval(self._max_id) or {}
def __init__(self):
    super(EventFilter, self).__init__()
    self._db = OracleDB()
    # (keyword, type) rows loaded once at construction.
    self._event_knowledges = self.load_event_knowledges()
@author: Boris
'''
import sys
sys.path.append('..')
import init
import utils.tools as tools
from utils.log import log
from db.oracledb import OracleDB
from base.wechat_public_platform import WechatPublicPlatform
from base.wechat_sogou import WechatSogou

if __name__ == '__main__':
    db = OracleDB()
    # wechat_public_platform = WechatPublicPlatform()
    wechat_sogou = WechatSogou()

    # Fetch wechat account names.
    # sql = 'select t.name, t.keyword2 from TAB_IOPM_CLUES t where t.zero_id = 7 and t.first_id = 137 and t.second_id = 183'
    # accounts = db.find(sql)
    accounts = ['骨朵网络影视']  # hard-coded account list for now

    for account in accounts:
        account_id = ''
        account_name = account
        # Resolve the account's biz id via sogou wechat search.
        biz = wechat_sogou.get_biz(account_id=account_id, account=account_name)
        if biz:
            # Register the account as a spider site (classify 2, status 701).
            # NOTE(review): account_name/biz are interpolated into the SQL
            # directly — confirm inputs are trusted.
            sql = "insert into TAB_IOPM_SITE t (t.id, t.name, t.position, t.classify, t.mointor_status, t.biz, t.priority) values (seq_iopm_site.nextval, '{name}', 1, 2, 701, '{biz}', 1)".format(
                name=account_name, biz=biz)
            print(sql)
            db.add(sql)
def main(): db = OracleDB() # 查文章 sql = ''' select * from (select rownum r, id, title from tab_iopm_article_info where rownum >= 1) where r <= 100000 ''' articles = db.find(sql) # 查热点 sql = 'select id, title from tab_iopm_hot_info' hots = db.find(sql) for article in articles: max_similar = { 'similarity': 0, 'hot_id': -1, 'article_id': -1, 'hot_title': '' } # 最相似的文章 similarity表示相似度(0~1) article_id = article[1] article_text = article[2] for hot in hots: hot_id = hot[0] hot_text = hot[1] similarity = compare_text(hot_text, article_text) # print(''' # article_text %s # hot_text %s # similarity %s # '''%(article_text, hot_text, similarity)) if similarity > max_similar['similarity']: max_similar['similarity'] = similarity max_similar['hot_id'] = hot_id max_similar['article_id'] = article_id max_similar['hot_title'] = article_text if len(hot_text) > len( article_text) else hot_text if max_similar['similarity'] > SIMILARITY: sql = 'update tab_iopm_article_info set hot_id = %s where id = %s' % ( max_similar['hot_id'], max_similar['article_id']) db.update(sql) sql = "update tab_iopm_hot_info set hot = hot + 1, title = '%s' where id = %s" % ( max_similar['hot_title'], max_similar['hot_id']) db.update(sql) else: sql = 'select sequence.nextval from dual' hot_id = db.find(sql)[0][0] sql = "insert into tab_iopm_hot_info (id, title, hot) values (%s, '%s', 1)" % ( hot_id, article_text) db.add(sql) sql = 'update tab_iopm_article_info set hot_id = %s where id = %s' % ( hot_id, article_id) db.update(sql) sql = 'select id, title from tab_iopm_hot_info' hots = db.find(sql)
def add_anchor_info(table, site_id, title='', name='', image_url='', room_id='', room_url='', video_path='', watched_count='', fans_count='', sex='', age='', address='', live_view=1, watched_count_url=''): ''' @summary: --------- @param table: 表名 @param site_id: 网站id @param name: 主播名 @param image_url: 贴图地址 @param room_id: 房间号 @param room_url: 房间网页的url @param video_path: 房间视频流地址 @param watched_count: 观众数 @param fans_count: 粉丝数 @param sex: 性别 @param age: 年龄 @param address: 主播所在地址(城市) @param live_view: 直播状态(0 未直播 1 直播) @param watched_count_url: 实时观众数地址 --------- @result: ''' #违规知识库检索 task_id = 0 violate_content = '' #-交验-- from db.oracledb import OracleDB oracle_db = OracleDB() sql = 'select t.name, t.keyword, t.task_id from tab_nbsp_violate_knowledge t where t.monitor_start_time <= sysdate and sysdate <= t.monitor_end_time' results = oracle_db.find(sql) #[('色情低俗', '性感,枪支,格斗,脱衣,透视,胸器', 1)] for result in results: name_, keywords, task_id_ = result keywords = keywords.split(',') for keyword in keywords: if name.find(keyword) != -1: task_id = task_id_ violate_content = name anchor_info_dict = { 'site_id': site_id, 'title': title, 'task_id': task_id, 'violate_content': violate_content, 'name': name, 'image_url': image_url, 'sex': sex, 'age': age, 'address': address, 'fans_count': fans_count, 'watched_count': watched_count, 'room_id': room_id, 'room_url': room_url, 'video_path': video_path, 'live_view': live_view, 'record_time': tools.get_current_date(), 'watched_count_url': watched_count_url, 'read_status': 0, 'sexy_image_status': '', 'sexy_image_url': '', 'image_pron_status': 0 } if not db.add(table, anchor_info_dict): anchor_info_dict.pop('_id') anchor_info_dict.pop('sexy_image_status') anchor_info_dict.pop('sexy_image_url') anchor_info_dict.pop('image_pron_status') db.update(table, {'room_id': room_id}, anchor_info_dict)
@author: Boris
'''
from cluster.compare_text import compare_text
from db.oracledb import OracleDB
import utils.tools as tools

# Similarity clustering threshold: similarity > n groups as one topic, 0<=n<=1
SIMILARITY = 0.45
CLUSTER_BUFFER_ZISE = 100
PAGE_SIZE = 1000

# hot_id -> {'title': 'xxxx', 'article_ids': [1,2,3,4], 'article_count': 0}
cluster_buffer = {
}

db = OracleDB()


def deal_cluster_buffer():
    # Flush the in-memory cluster buffer into the database.
    global cluster_buffer

    cluster_buffer_tota_count = len(cluster_buffer)
    cluster_buffer_deal_count = 0
    for hot_id, data in cluster_buffer.items():
        tools.print_loading(
            '缓存到达最大限制 正在向数据库中写数据 %d/%d' %
            (cluster_buffer_deal_count, cluster_buffer_tota_count))
        article_ids = data['article_ids']
        article_count = data['article_count']
        # Escape single quotes for the SQL literal built later.
        hot_title = data['title'].replace("'", "''")
        # update the hot id linked to the clues
        # NOTE(review): the function body appears truncated here in this
        # fragment; the remainder is not visible.
def main():
    """Drive VA search tasks: for each active task, process its keywords one
    at a time with a Spider run, exporting results when the task finishes."""
    search_task_sleep_time = int(
        tools.get_conf_value('config.conf', 'task', 'search_task_sleep_time'))
    db = OracleDB()

    # Reset task status to "todo" (501) for tasks inside their monitor window.
    sql = 'update tab_ivms_task_info t set t.task_status = 501 where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time'
    db.update(sql)
    # Reset those tasks' keywords to "todo" (601).
    sql = 'update tab_ivms_task_keyword k set k.finish_status = 601 where k.task_id in (select t.task_id from tab_ivms_task_info t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time)'
    db.update(sql)

    while True:
        # Find a task still marked "todo".
        log.debug('查询任务...')
        sql = 'select t.task_id from TAB_IVMS_TASK_INFO t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time and t.task_status = 501'
        result = db.find(sql, fetch_one=True)
        if not result:
            break
        task_id = result[0]

        while True:
            # If one of this task's keywords is in progress (602), wait.
            sql = 'select t.* from TAB_IVMS_TASK_KEYWORD t where t.task_id = %d and finish_status = 602' % task_id
            do_task = db.find(sql, fetch_one=True)
            if do_task:
                time.sleep(search_task_sleep_time)
                continue

            # Next keyword row still marked "todo" (601).
            sql = 'select t.* from TAB_IVMS_TASK_KEYWORD t where t.task_id = %d and finish_status = 601' % task_id
            result = db.find(sql, fetch_one=True)
            if not result:
                break
            keyword_id = result[0]
            task_id = result[1]
            search_keyword1 = []
            search_keyword2 = result[2].split(',') if result[2] else []
            search_keyword3 = result[3].split(',') if result[3] else []

            def begin_callback():
                log.info('\n********** VA begin **********')
                # Mark the task "doing" (502).
                sql = 'update TAB_IVMS_TASK_INFO set task_status = 502 where task_id = %d' % task_id
                db.update(sql)
                # Mark the keyword "doing" (602).
                sql = 'update tab_ivms_task_keyword set finish_status = 602 where id = %d' % keyword_id
                db.update(sql)

            def end_callback():
                # Mark the keyword "done" (603).
                sql = 'update tab_ivms_task_keyword set finish_status = 603 where id = %d' % keyword_id
                db.update(sql)
                # When every keyword of the task is done, export and mark
                # the whole task "done" (503).
                sql = 'select t.* from tab_ivms_task_keyword t where task_id = %d and finish_status = 601' % task_id
                results = db.find(sql)
                if not results:
                    # Export the collected programs.
                    key_map = {
                        'program_id': 'vint_sequence.nextval',
                        'search_type': 'int_search_type',
                        'program_name': 'str_title',
                        'program_url': 'str_url',
                        'release_date': 'date_release_time',
                        'image_url': 'str_image_url',
                        'program_content': 'str_content',
                        'task_id': 'vint_%d' % task_id,
                        'keyword': 'str_keyword',
                        'keyword_count': 'int_keyword_count',
                        'check_status': 'vint_202'
                    }
                    export = ExportData('VA_content_info',
                                        'tab_ivms_program_info', key_map,
                                        'program_url')
                    export.export_to_oracle()
                    sql = 'update TAB_IVMS_TASK_INFO set task_status = 503 where task_id = %d' % task_id
                    db.update(sql)
                log.info('\n********** VA end **********')

            # Configure the spider for this keyword.
            spider = Spider(tab_urls='VA_urls',
                            tab_site='VA_site_info',
                            tab_content='VA_content_info',
                            parser_count=1,
                            begin_callback=begin_callback,
                            end_callback=end_callback,
                            search_keyword1=search_keyword1,
                            search_keyword2=search_keyword2,
                            search_keyword3=search_keyword3)
            # Register one parser per source.
            spider.add_parser(baidu_parser)
            spider.add_parser(magnet_parser)
            spider.add_parser(netdisk_parser)
            spider.add_parser(weibo_parser)
            spider.add_parser(wechat_parser)
            spider.add_parser(soubaidupan_parser)
            spider.add_parser(douban_parser)
            spider.start()
            time.sleep(search_task_sleep_time)
def add_WWA_search_app_info(table, site_id, url, title='', summary='',
                            update_info='', score='', author='', app_url='',
                            image_url='', software_size='', tag='', platform='',
                            download_count='', release_time='', language='',
                            sensitive_id='', read_status=0):
    '''
    @summary: store one searched-app record, after filtering it against the
              active 703 search keywords
    ---------
    @param table: collection name
    @param site_id: site id
    @param url: original page url
    @param title: title
    @param summary: short description
    @param update_info: changelog text
    @param score: rating
    @param author: author
    @param app_url: app download url
    @param image_url: image urls (comma separated)
    @param software_size: size
    @param tag: version
    @param platform: platform (ios / android)
    @param download_count: download count
    @param release_time: release time
    @param language: display language; mapped to 601 中文 / 602 英文 / 603 other
    @param sensitive_id: sensitive info ids (comma separated)
    @param read_status: read flag (0 unread, 1 read)
    ---------
    @result: None (returns early when no keyword matches)
    '''
    # Keep only apps whose text mentions at least one active 703 keyword.
    from db.oracledb import OracleDB
    oracle_db = OracleDB()
    sql = 'select keyword from TAB_MVMS_SEARCH_INFO t where search_type = 703'
    results = oracle_db.find(sql)  #[('天天快报,今日头条,黑龙江',)]

    is_usefull = False
    text_content = title + summary + update_info + author
    for result in results:
        keywords = result[0]
        keywords = keywords.split(',')
        for keyword in keywords:
            if keyword in text_content:
                is_usefull = True
                break
        if is_usefull:
            break
    if not is_usefull:
        return

    # Map the display language to its dictionary code.
    if language == '中文':
        language = 601
    elif language == '英文':
        language = 602
    else:
        language = 603

    title = tools.del_html_tag(title)
    gameApp_info_dict = {
        'site_id': site_id,
        'url': url,
        'summary': tools.del_html_tag(summary, except_line_break=True),
        'title': title,
        'update_info': tools.del_html_tag(update_info, except_line_break=True),
        'score': score,
        'author': author,
        'app_url': app_url,
        'image_url': image_url,
        'software_size': software_size,
        'tag': tag,
        'platform': platform,
        'download_count': download_count,
        'release_time': release_time,
        'record_time': tools.get_current_date(),
        'language': language,
        'sensitive_id': sensitive_id,
        # Bug fix: was hard-coded to 0, silently ignoring the read_status
        # parameter; the default is still 0, so existing callers behave
        # identically.
        'read_status': read_status,
        'sexy_image_status': '',
        'sexy_image_url': '',
        'image_pron_status': 0
    }
    db.add(table, gameApp_info_dict)