def run(self):
    try:
        urls_article = [
            'https://mp.weixin.qq.com/s?src=11&timestamp=1541559601&ver=1229&signature=ixTsG-RvK8H58t6D-CpW6olWI8hA52Wz-FRb12ZcrNG-lxR20YutoyLYUr-RB3w8WHjE1petjDcbbxZVxTChvPWM27qszWu0Z3zonjx8SEQB5mmgm1O9Eu*5qsFhnBCH&new=1'
        ]
        entity = None
        backpack_list = []
        ftp_list = []
        ftp_info = None
        for page_count, url in enumerate(urls_article):
            # if page_count < 15:
            #     continue
            html = requests.get(url)
            # Determine account info
            name = pq(html.text)('#js_name').text()
            account_name = pq(html.text)('.profile_meta_value').eq(0).text()
            log('---{}---{}---'.format(name, account_name))
            account = Account()
            account.name = name
            account.account = account_name
            account.get_account_id()
            article = Article()
            try:
                article.create(url, account)
            except RuntimeError as run_error:
                log('Browser not reachable: {}'.format(run_error))
            log('Article #{}, title: {}'.format(page_count, article.title))
            log('Current article url: {}'.format(url))
            entity = JsonEntity(article, account)
            log('Current article ID: {}'.format(entity.id))
            # if entity.id in ids:
            #     log('Article already exists, skipping')
            #     continue
            backpack = Backpack()
            backpack.create(entity)
            backpack_list.append(backpack.create_backpack())
            # self.save_to_mysql(entity)
            self.save_to_mongo(entity.to_dict())
            # Build the FTP package
            ftp_info = Ftp(entity)
            name_xml = ftp_info.hash_md5(ftp_info.url)
            log('Current article xml: {}'.format(name_xml))
            self.create_xml(ftp_info.ftp_dict(), name_xml)
            ftp_list.append(name_xml)
            # if page_count >= 3:
            #     break
        log('Sending packages')
        # TODO: package upload times out; adjust the MTU
        if ftp_info is not None:
            entity.uploads_ftp(ftp_info, ftp_list)
        if entity:
            # entity.uploads(backpack_list)
            entity.uploads_datacenter_relay(backpack_list)
            entity.uploads_datacenter_unity(backpack_list)
        log('Packages sent')
    except Exception as e:
        log('Error parsing official account: {}'.format(e))
        if 'chrome not reachable' in str(e):
            raise RuntimeError('chrome not reachable')
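# The `log` helper used above is defined elsewhere in the project. Call sites
# pass either one pre-formatted string or several positional arguments, so a
# minimal sketch, assuming it is just a timestamped print wrapper (the real
# implementation may differ):
import datetime

def log(*args):
    # Prefix each message with a timestamp so long crawl runs stay readable.
    print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), *args)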
def run(self):
    # self.set_name()
    # while True:
    account_list = ['有看投', ]
    entity = None
    backpack_list = []
    for name in account_list:
        self.name = name
        html_account = self.account_homepage()
        if html_account:
            html, account_of_homepage = html_account
        else:
            continue
        log('start official account: ', self.name)
        urls_article = self.urls_article(html)
        account = Account()
        account.name = self.name
        account.account = account_of_homepage
        account.get_account_id()
        # account.account_id = 126774646
        for page_count, url in enumerate(urls_article):
            # if page_count < 35:
            #     continue
            article = Article()
            article.create(url, account)
            log('Article title:', article.title)
            log('Item #{}'.format(page_count))
            entity = JsonEntity(article, account)
            backpack = Backpack()
            backpack.create(entity)
            backpack_list.append(backpack.create_backpack())
            import pymongo
            conn = pymongo.MongoClient('mongo')  # note: `conn` is unused leftover
            # Upload to the database
            sql = '''
                INSERT INTO account_http(article_url, addon, account, account_id, author, id, title)
                VALUES (%s, %s, %s, %s, %s, %s, %s)
            '''
            _tuple = (
                article.url,
                datetime.datetime.now(),
                entity.account,
                entity.account_id,
                entity.author,
                entity.id,
                entity.title
            )
            uploads_mysql(config_mysql, sql, _tuple)
            if page_count == 4:
                break
    log('Sending packages')
    if entity:
        # entity.uploads(backpack_list)
        # entity.uploads_datacenter_relay(backpack_list)
        entity.uploads_datacenter_unity(backpack_list)
    print('end')
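# `uploads_mysql(config_mysql, sql, _tuple)` above is a project helper that is
# not shown here. A minimal sketch under the assumption that it is a thin
# pymysql wrapper and `config` carries the usual connection keys
# (host/user/password/database):
import pymysql

def uploads_mysql(config, sql, params):
    conn = pymysql.connect(**config)
    try:
        with conn.cursor() as cursor:
            # Parameterized execution: pymysql substitutes the %s placeholders.
            cursor.execute(sql, params)
        conn.commit()
    finally:
        conn.close()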
def run(self):
    count = 0
    while True:
        # ADD_COLLECTION: supplementary accounts; get_account(): daily collection.
        # account_list handles both a single account and a list of accounts.
        account_list = ADD_COLLECTION if ADD_COLLECTION else self.get_account()
        # length = len(threading.enumerate())  # enumerate() returns a list
        # log.info('Current thread count: {}'.format(threading.active_count()))
        count += 1
        log.info('Round {}'.format(count))
        if account_list is None:
            log.info('Schedule queue is empty, sleeping 5 seconds')
            time.sleep(5)
            continue
        for account_name in account_list:
            try:
                self.search_name = account_name
                html_account = self.account_homepage()
                if html_account:
                    html = html_account
                else:
                    log.info('{}|WeChat account not found'.format(account_name))
                    continue
                urls_article = self.urls_article(html)
                # Determine account info
                account = Account()
                account.name = self.name
                account.account = account_name
                account.tags = self.get_tags()
                account.get_account_id()
                # Deduplication
                ids = self.dedup(account_name) if JUDEG else ''
                entity = None
                backpack_list = []
                ftp_list = []
                ftp_info = None
                for page_count, url in enumerate(urls_article):
                    # if page_count < 15:
                    #     continue
                    article = Article()
                    try:
                        article.create(url, account)
                    except RuntimeError as run_error:
                        log.info('WeChat captcha error {}'.format(run_error))
                    log.info('Article #{}, title: {}'.format(page_count, article.title))
                    log.info('Current article url: {}'.format(url))
                    entity = JsonEntity(article, account)
                    log.info('Current article ID: {}'.format(entity.id))
                    if entity.id in ids and JUDEG is True:
                        log.info('Article already exists, skipping')
                        # if page_count >= 20:
                        #     log.info('More than 20 articles, breaking out')
                        #     break
                        continue
                    backpack = Backpack()
                    backpack.create(entity)
                    backpack_list.append(backpack.create_backpack())
                    # self.save_to_mysql(entity)
                    # self.save_to_mongo(entity.to_dict())
                    # Build the FTP package
                    ftp_info = Ftp(entity)
                    name_xml = ftp_info.hash_md5(ftp_info.url)
                    log.info('Current article xml: {}'.format(name_xml))
                    self.create_xml(ftp_info.ftp_dict(), name_xml)
                    ftp_list.append(name_xml)
                    # break
                log.info('Start sending packages')
                # TODO: package upload times out; adjust the MTU
                if ftp_info is not None:
                    entity.uploads_ftp(ftp_info, ftp_list)
                    log.info('FTP packages sent')
                if entity and backpack_list:
                    # entity.uploads(backpack_list)
                    entity.uploads_datacenter_relay(backpack_list)
                    entity.uploads_datacenter_unity(backpack_list)
                    log.info('Data center (three-in-one) packages sent')
            except Exception as e:
                log.exception('Error parsing official account {}'.format(e))
                if 'chrome not reachable' in str(e):
                    raise RuntimeError('chrome not reachable')
        if ADD_COLLECTION:
            break
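# `self.dedup(account_name)` is not defined in this file. Judging by its use
# (`entity.id in ids`), it returns the article ids already collected for the
# account. A hypothetical sketch backed by MongoDB; the client URI, database
# name `wechat`, and collection name `article` are all assumptions:
import pymongo

def dedup(self, account_name):
    client = pymongo.MongoClient('mongo')
    docs = client['wechat']['article'].find({'account': account_name}, {'id': 1})
    return {doc['id'] for doc in docs}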
def run(self):
    count = 0
    while True:
        # ADD_COLLECTION: supplementary accounts; get_account(): daily collection.
        # account_list handles both a single account and a list of accounts.
        account_list = ADD_COLLECTION if ADD_COLLECTION else self.get_account()
        # length = len(threading.enumerate())  # enumerate() returns a list
        log.info('Current thread count: {}'.format(threading.active_count()))
        log.info('Current process: {}'.format(multiprocessing.current_process().name))
        count += 1
        log.info('Round {}'.format(count))
        if account_list is None:
            log.info('Schedule queue is empty, sleeping 5 seconds')
            time.sleep(5)
            continue
        for account_name in account_list:
            try:
                self.search_name = account_name
                html_account = self.account_homepage()
                if html_account:
                    html = html_account
                else:
                    log.info('{}|WeChat account not found'.format(account_name))
                    continue
                urls_article = self.urls_article(html)
                # Determine account info
                account = Account()
                account.name = self.name
                account.account = account_name
                account.tags = self.get_tags()
                account.get_account_id()
                # Deduplication against the underlying store
                # ids = self.dedup(account_name) if JUDEG else ''
                # Deduplication via Redis
                sentenced_keys = account.account + ' ' + str(account.account_id)
                keys = hash_md5(sentenced_keys)
                log.info('keys: {}'.format(keys))
                dedup_result = self.dedup_redis(keys)
                post_dedup_urls = []
                entity = None
                backpack_list = []
                ftp_list = []
                ftp_info = None
                for page_count, url in enumerate(urls_article):
                    try:
                        # if page_count > 5:
                        #     break
                        article = Article()
                        article.create(url, account, self.proxies)
                        log.info('Article #{}, title: {}'.format(page_count, article.title))
                        log.info('Current article url: {}'.format(url))
                        entity = JsonEntity(article, account)
                        log.info('Current article ID: {}'.format(entity.id))
                        article_date = datetime.datetime.fromtimestamp(int(str(article.time)[:-3]))
                        day_diff = datetime.date.today() - article_date.date()
                        if day_diff.days > 15:
                            log.info('Skipping articles older than the 15-day collection window; {} articles collected'.format(page_count))
                            self.count_articles(page_count)
                            break
                        if dedup_result:
                            # title_time_str = entity.title + str(entity.time)
                            # title_time_md5 = hash_md5(title_time_str)
                            if entity.id in dedup_result:
                                log.info('Article already exists, skipping')
                                continue
                            else:
                                post_dedup_urls.append(entity.id)
                        else:
                            post_dedup_urls.append(entity.id)
                        # dedup_result = self.dedup_redis(entity)
                        # if dedup_result:
                        #     log.info('Article already exists, skipping')
                        # ids = ids.append({'key': entity.id, 'urls': entity.url})
                        # if entity.id in ids and JUDEG is True:
                        #     log.info('Article already exists, skipping')
                        #     continue
                        backpack = Backpack()
                        backpack.create(entity)
                        backpack_list.append(backpack.create_backpack())
                        # self.save_to_mysql(entity)
                        # self.save_to_mongo(entity.to_dict())
                        # Build the FTP package
                        ftp_info = Ftp(entity)
                        name_xml = ftp_info.hash_md5(ftp_info.url)
                        log.info('Current article xml: {}'.format(name_xml))
                        self.create_xml(ftp_info.ftp_dict(), name_xml)
                        ftp_list.append(name_xml)
                    except Exception as run_error:
                        log.info('Error parsing WeChat article {}'.format(run_error))
                        continue
                log.info('Start sending packages')
                if entity and backpack_list:
                    # Send directly to the underlying store
                    # entity.uploads(backpack_list)
                    entity.uploads_datacenter_relay(backpack_list)
                    entity.uploads_datacenter_unity(backpack_list)
                    log.info('Data center (three-in-one) packages sent')
                else:
                    log.info('Backpack list is empty, not sending data')
                    continue
                # TODO: package upload times out; adjust the MTU
                if ftp_info is not None:
                    entity.uploads_ftp(ftp_info, ftp_list)
                    log.info('FTP packages sent')
                if post_dedup_urls:
                    log.info('Uploading to dedup center, key: {} urls: {}'.format(keys, post_dedup_urls))
                    url = 'http://47.100.53.87:8008/Schedule/CacheWx'
                    data = [{
                        "key": keys,
                        "sourceNodes": "1",
                        "sourceType": "2",
                        "urls": post_dedup_urls
                    }]
                    r = requests.post(url, data=json.dumps(data), timeout=self.timeout)
                    log.info('Dedup center upload result {}'.format(r.status_code))
            except Exception as e:
                log.exception('Error parsing official account {}'.format(e))
                time.sleep(30)
                if ('chrome not reachable' in str(e)) or ('Message: timeout' in str(e)):
                    raise RuntimeError('chrome not reachable')
        if ADD_COLLECTION:
            break
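# `hash_md5` (used both as a module-level function and as an Ftp method above)
# is, judging by its name and its use for building dedup keys and xml file
# names, a standard MD5 hex-digest helper. A minimal sketch:
import hashlib

def hash_md5(text):
    # Encode to bytes first; hashlib only accepts bytes-like input.
    return hashlib.md5(text.encode('utf-8')).hexdigest()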
def run(self):
    count = 0
    while True:
        count += 1
        log.info('Round {}'.format(count))
        account_list = ADD_COLLECTION if ADD_COLLECTION else self.account_list()
        # if account_list:
        #     continue
        # for account_name in account_list:
        try:
            for account_name in account_list:
                log.info('Round {}'.format(count))
                self.search_name = account_name
                html_account = self.account_homepage()
                if html_account:
                    html = html_account
                else:
                    log.info('Account homepage not found: {}'.format(account_name))
                    continue
                urls_article = self.urls_article(html)
                # Determine account info
                account = Account()
                account.name = self.name
                account.account = account_name
                account.get_account_id()
                # Deduplication
                ids = self.dedup(account_name)
                entity = None
                backpack_list = []
                ftp_list = []
                ftp_info = None
                for page_count, url in enumerate(urls_article):
                    # if page_count < 15:
                    #     continue
                    article = Article()
                    try:
                        article.create(url, account)
                    except RuntimeError as run_error:
                        log.info('Browser not reachable: {}'.format(run_error))
                    log.info('Article #{}, title: {}'.format(page_count, article.title))
                    log.info('Current article url: {}'.format(url))
                    entity = JsonEntity(article, account)
                    log.info('Current article ID: {}'.format(entity.id))
                    if entity.id in ids:
                        log.info('Article already exists, skipping')
                        continue
                    backpack = Backpack()
                    backpack.create(entity)
                    backpack_list.append(backpack.create_backpack())
                    # self.save_to_mysql(entity)
                    self.save_to_mongo(entity.to_dict())
                    # Build the FTP package
                    ftp_info = Ftp(entity)
                    name_xml = ftp_info.hash_md5(ftp_info.url)
                    log.info('Current article xml: {}'.format(name_xml))
                    self.create_xml(ftp_info.ftp_dict(), name_xml)
                    ftp_list.append(name_xml)
                    # if page_count >= 3:
                    #     break
                log.info('Sending packages')
                # TODO: package upload times out; adjust the MTU
                if ftp_info is not None:
                    entity.uploads_ftp(ftp_info, ftp_list)
                if entity:
                    # entity.uploads(backpack_list)
                    entity.uploads_datacenter_relay(backpack_list)
                    entity.uploads_datacenter_unity(backpack_list)
                log.info('Packages sent')
        except Exception as e:
            log.exception('Error parsing official account {}'.format(e))
            if 'chrome not reachable' in str(e):
                raise RuntimeError('chrome not reachable')
            continue
def run(self):
    count = 0
    while True:
        count += 1
        log.info('Round {}'.format(count))
        # ADD_COLLECTION: supplementary accounts; get_account(): daily collection.
        # account_list handles both a single account and a list of accounts.
        account_list = ADD_COLLECTION if ADD_COLLECTION else self.get_account()
        if account_list is None:
            log.info('Schedule queue is empty, sleeping 5 seconds')
            time.sleep(5)
            continue  # was missing: iterating over None would raise TypeError
        for account_name in account_list:
            try:
                self.search_name = account_name
                html_account = self.account_homepage()
                if html_account:
                    html = html_account
                else:
                    log.info('{}|WeChat account not found'.format(account_name))
                    continue
                urls_article = self.urls_article(html)
                # Determine account info
                account = Account()
                account.name = self.name
                account.account = account_name
                account.tags = self.get_tags()
                account.get_account_id()
                if not account.account_id:
                    log.info('{} has no account_id'.format(self.name))
                    break
                # Deduplication
                ids = self.dedup(account_name) if JUDEG else ''
                entity = None
                backpack_list = []
                ftp_list = []
                ftp_info = None
                for page_count, url in enumerate(urls_article):
                    # if page_count < 15:
                    #     continue
                    article = Article()
                    try:
                        article.create(url, account)
                    except RuntimeError as run_error:
                        log.info('Browser not reachable: {}'.format(run_error))
                    log.info('Article #{}, title: {}'.format(page_count, article.title))
                    log.info('Current article url: {}'.format(url))
                    entity = JsonEntity(article, account)
                    log.info('Current article ID: {}'.format(entity.id))
                    if entity.id in ids and JUDEG is True:
                        log.info('Article already exists, skipping')
                        # continue
                    backpack = Backpack()
                    backpack.create(entity)
                    backpack_list.append(backpack.create_backpack())
                    # self.save_to_mysql(entity)
                    # self.save_to_mongo(entity.to_dict())
                    # if page_count >= 3:
                    #     break
                log.info('Start sending packages')
                if entity and backpack_list:
                    entity.uploads(backpack_list)
                log.info('Packages sent')
            except Exception as e:
                log.exception('Error parsing official account {}'.format(e))
                if 'chrome not reachable' in str(e):
                    raise RuntimeError('chrome not reachable')
def run(self):
    html_account = self.account_homepage()
    if html_account:
        html, account_of_homepage = html_account
    else:
        self.send_result()
        return
    log('start official account: ', self.name)
    urls_article = self.urls_article(html)
    account = Account()
    account.name = self.name
    account.account = account_of_homepage
    account.get_account_id()
    articles = []
    backpack_list = []
    positive_article = 0
    negative_article = 0
    for page_count, url in enumerate(urls_article):
        # if page_count > 2:
        #     break
        article = Article()
        log('url:', url)
        article.create(url, self.name)
        log('Article #{}, title: {}'.format(page_count, article.title))
        # Ignore articles older than 7 days
        if article.time:
            article_date = datetime.datetime.fromtimestamp(int(article.time[:-3]))
            day_diff = datetime.datetime.now().date() - article_date.date()
            if day_diff.days > 6:
                break
        # Count positive/negative sentiment per article
        count_positive, count_negative = self.emotion_judge(article.content)
        if count_positive > count_negative:
            positive_article += 1
        else:
            negative_article += 1
        entity = JsonEntity(article, account)
        backpack = Backpack()
        backpack.create(entity)
        backpack_list.append(backpack.create_backpack())
        # Collect every article
        article_info = backpack.to_dict()
        articles.append(article_info)
    log('All articles fetched')
    content_all_list = ''
    for article in articles:
        content_all_list += article.get('Content')
    log('Content length', len(content_all_list))
    # Word segmentation / named-entity recognition
    key_words_list = []
    GETNER_API_URL = 'http://221.204.232.7:40015/NER/GetNer'
    data = {
        "texts": [content_all_list],
    }
    log('Requesting segmentation')
    response = requests.post(url=GETNER_API_URL, data=data, timeout=180)
    ner_result = response.json().get('rst')[0]
    if ner_result.get('status') == 'success':
        org_dic = ner_result.get('ner').get('ORG')
        loc_dic = ner_result.get('ner').get('LOC')
        per_dic = ner_result.get('ner').get('PER')
        for dic in (org_dic, loc_dic, per_dic):
            if dic:
                key_words_list.extend(dic.items())
    # Return the 20 most frequent words (slice was [:21], off by one)
    key_words = dict()
    key_words['list'] = []
    key_words_list = sorted(key_words_list, key=lambda x: x[1], reverse=True)[:20]
    for k in key_words_list:
        key_words['list'].append({"times": k[1], "keyword": k[0]})
    log('Segmentation done')
    # Process the articles
    result = handle(articles)
    result['KeyWord'] = key_words
    result['ArtPosNeg'] = {
        'Indicate': {
            'Positive': positive_article,
            'Negative': negative_article
        }
    }
    result['Success'] = True
    result['Account'] = self.name
    result['Message'] = ''
    db['newMedia'].update({'Account': self.name}, {'$set': {'data': result}})
    log('{} crawl finished'.format(self.name))
    # Report success to the frontend
    self.status = 3
    self.send_result()
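# `self.emotion_judge(content)` returns a (positive, negative) pair of counts
# used above to classify each article. A hypothetical lexicon-counting sketch;
# the word lists are illustrative placeholders, not the project's lexicons:
POSITIVE_WORDS = ['增长', '成功', '优秀']  # placeholder positive lexicon
NEGATIVE_WORDS = ['下跌', '亏损', '失败']  # placeholder negative lexicon

def emotion_judge(self, content):
    # Count raw lexicon hits in the article body.
    count_positive = sum(content.count(w) for w in POSITIVE_WORDS)
    count_negative = sum(content.count(w) for w in NEGATIVE_WORDS)
    return count_positive, count_negative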
def run(self):
    html_account = self.account_homepage()
    if html_account:
        html, account_of_homepage = html_account
    else:
        # self.send_result()
        return
    log('start official account: ', self.name)
    urls_article = self.urls_article(html)
    account = Account()
    account.name = self.name
    account.account = account_of_homepage
    account.get_account_id()
    articles = []
    backpack_list = []
    positive_article = 0
    negative_article = 0
    for page_count, url in enumerate(urls_article):
        # if page_count > 2:
        #     break
        article = Article()
        log('url:', url)
        article.create(url, self.name)
        log('Article title:', article.title)
        log('Item #{}'.format(page_count))
        # Ignore articles older than 7 days
        if article.time:
            article_date = datetime.datetime.fromtimestamp(int(article.time[:-3]))
            day_diff = datetime.datetime.now().date() - article_date.date()
            if day_diff.days > 6:
                break
        # Count positive/negative sentiment per article
        count_positive, count_negative = self.emotion_judge(article.content)
        if count_positive > count_negative:
            positive_article += 1
        else:
            negative_article += 1
        entity = JsonEntity(article, account)
        backpack = Backpack()
        backpack.create(entity)
        backpack_list.append(backpack.create_backpack())
        # Collect every article
        article_info = backpack.to_dict()
        articles.append(article_info)
    log('All articles fetched')
    content_all_list = ''
    for article in articles:
        content_all_list += article.get('Content')
    # Word segmentation: keep nouns of length >= 2 that contain CJK characters
    key_words_list = []
    thu1 = thulac.thulac()
    seg_list = thu1.cut(content_all_list, text=False)
    for s in seg_list:
        if (len(s[0]) >= 2 and re.search('[\u4e00-\u9fff]+', s[0])
                and s[1] in ['n', 'np', 'ns', 'ni', 'nz']):
            key_words_list.append(s[0])
    # Return the 20 most frequent words
    key_words_counter = Counter(key_words_list).most_common(20)
    key_word = dict()
    key_word['list'] = []
    for k in key_words_counter:
        key_word['list'].append({"times": k[1], "keyword": k[0]})
    # Process the articles
    result = handle(articles)
    result['KeyWord'] = key_word
    result['ArtPosNeg'] = {
        'Indicate': {
            'Positive': positive_article,
            'Negative': negative_article
        }
    }
    result['Success'] = True
    result['Account'] = self.name
    result['Message'] = ''
    db['newMedia'].update({'Account': self.name}, {'$set': {'data': result}})
    log('{} crawl finished'.format(self.name))
    # Report success to the frontend
    self.status = 3