def run(self):
    """Process a fixed list of WeChat article URLs and upload the results.

    For each URL the page is downloaded with ``requests``, the account
    display name (``#js_name``) and the first ``.profile_meta_value`` text
    are read from the markup, and an ``Account`` / ``Article`` /
    ``JsonEntity`` triple is built from them. Each entity is then
    wrapped into a backpack, saved through ``self.save_to_mongo`` and
    written out as an XML file via ``self.create_xml``.

    After all URLs have been handled, the collected XML names are sent with
    ``uploads_ftp`` and the collected backpacks with
    ``uploads_datacenter_relay`` / ``uploads_datacenter_unity``.

    Raises:
        RuntimeError: when a caught exception's message contains
            ``'chrome not reachable'``. Any other exception is logged
            and swallowed.
    """
    try:
        # The query string must use '&timestamp='; a previous copy of this
        # URL had '&times' decoded as an HTML entity ('×tamp').
        urls_article = [
            'https://mp.weixin.qq.com/s?src=11&timestamp=1541559601&ver=1229&signature=ixTsG-RvK8H58t6D-CpW6olWI8hA52Wz-FRb12ZcrNG-lxR20YutoyLYUr-RB3w8WHjE1petjDcbbxZVxTChvPWM27qszWu0Z3zonjx8SEQB5mmgm1O9Eu*5qsFhnBCH&new=1'
        ]
        # `entity` and `ftp_info` keep the values from the last processed
        # article; they are used after the loop as the upload handles.
        entity = None
        backpack_list = []
        ftp_list = []
        ftp_info = None
        for page_count, url in enumerate(urls_article):
            # NOTE(review): no timeout is passed, so a stalled connection
            # blocks here indefinitely -- consider adding one.
            response = requests.get(url)

            # Determine the account information from the article page.
            # Parse the document once and query it twice.
            document = pq(response.text)
            name = document('#js_name').text()
            account_name = document('.profile_meta_value').eq(0).text()
            log('---{}---{}---'.format(name, account_name))

            account = Account()
            account.name = name
            account.account = account_name
            account.get_account_id()

            article = Article()
            try:
                article.create(url, account)
            except RuntimeError as run_error:
                # Logged only; processing continues with whatever state
                # `article` has at this point.
                log('找不到浏览器 {}'.format(run_error))
            log('第{}条 文章标题: {}'.format(page_count, article.title))
            log("当前文章url: {}".format(url))

            entity = JsonEntity(article, account)
            log('当前文章ID: {}'.format(entity.id))

            backpack = Backpack()
            backpack.create(entity)
            backpack_list.append(backpack.create_backpack())

            # Only MongoDB persistence is active here.
            self.save_to_mongo(entity.to_dict())

            # FTP package: one XML file per article, named by the hash of
            # the article URL held by the Ftp object.
            ftp_info = Ftp(entity)
            name_xml = ftp_info.hash_md5(ftp_info.url)
            log('当前文章xml: {}'.format(name_xml))
            self.create_xml(ftp_info.ftp_dict(), name_xml)
            ftp_list.append(name_xml)

        log("发包")
        # TODO: uploads time out; adjust the MTU.
        if ftp_info is not None:
            entity.uploads_ftp(ftp_info, ftp_list)
        if entity:
            entity.uploads_datacenter_relay(backpack_list)
            entity.uploads_datacenter_unity(backpack_list)
        log("发包完成")
    except Exception as e:
        log("解析公众号错误 {}".format(e))
        # A dead browser session cannot be recovered here; let the caller
        # see it as a RuntimeError.
        if 'chrome not reachable' in str(e):
            raise RuntimeError('chrome not reachable')
def run(self):
    """Crawl the configured WeChat accounts in an endless loop.

    Each round takes the account list from ``ADD_COLLECTION`` when it is
    non-empty, otherwise from ``self.account_list()``. For every account
    the homepage is fetched through ``self.account_homepage()``, the
    article URLs are extracted with ``self.urls_article``, and each
    article whose entity id is not already in ``self.dedup(account_name)``
    is saved via ``self.save_to_mongo`` and written to XML via
    ``self.create_xml``. After an account's articles are handled, the
    collected XML names and backpacks are uploaded.

    An exception raised while processing any account is logged and ends the
    current round; the next round then starts from the top of the list.

    Raises:
        RuntimeError: when a caught exception's message contains
            ``'chrome not reachable'``. This is the only way the loop
            terminates.
    """
    count = 0
    while True:
        count += 1
        log.info('第{}次'.format(count))
        account_list = ADD_COLLECTION or self.account_list()
        try:
            for account_name in account_list:
                log.info('第{}次'.format(count))
                self.search_name = account_name

                html = self.account_homepage()
                if not html:
                    # The placeholder was missing, so the account name
                    # never reached the log.
                    log.info('找到不到微信号首页: {}'.format(account_name))
                    continue
                urls_article = self.urls_article(html)

                # Determine the account information.
                account = Account()
                account.name = self.name
                account.account = account_name
                account.get_account_id()

                # De-duplication: ids that must be skipped for this account.
                ids = self.dedup(account_name)

                # `entity` and `ftp_info` keep the values from the last
                # loop iteration; they act as the upload handles below.
                entity = None
                backpack_list = []
                ftp_list = []
                ftp_info = None
                for page_count, url in enumerate(urls_article):
                    article = Article()
                    try:
                        article.create(url, account)
                    except RuntimeError as run_error:
                        # Logged only; processing continues with whatever
                        # state `article` has at this point.
                        log.info('找不到浏览器 {}'.format(run_error))
                    log.info('第{}条 文章标题: {}'.format(
                        page_count, article.title))
                    log.info("当前文章url: {}".format(url))

                    entity = JsonEntity(article, account)
                    log.info('当前文章ID: {}'.format(entity.id))
                    if entity.id in ids:
                        log.info('当前文章已存在,跳过')
                        continue

                    backpack = Backpack()
                    backpack.create(entity)
                    backpack_list.append(backpack.create_backpack())

                    # Only MongoDB persistence is active here.
                    self.save_to_mongo(entity.to_dict())

                    # FTP package: one XML file per article, named by the
                    # hash of the article URL held by the Ftp object.
                    ftp_info = Ftp(entity)
                    name_xml = ftp_info.hash_md5(ftp_info.url)
                    log.info('当前文章xml: {}'.format(name_xml))
                    self.create_xml(ftp_info.ftp_dict(), name_xml)
                    ftp_list.append(name_xml)

                log.info("发包")
                # TODO: uploads time out; adjust the MTU.
                if ftp_info is not None:
                    entity.uploads_ftp(ftp_info, ftp_list)
                # NOTE(review): `entity` is assigned before the duplicate
                # check, so this branch also runs with an empty
                # `backpack_list` when every article was skipped --
                # confirm the upload helpers tolerate that.
                if entity:
                    entity.uploads_datacenter_relay(backpack_list)
                    entity.uploads_datacenter_unity(backpack_list)
                log.info("发包完成")
        except Exception as e:
            log.exception("解析公众号错误 {}".format(e))
            # A dead browser session cannot be recovered here; let the
            # caller see it as a RuntimeError.
            if 'chrome not reachable' in str(e):
                raise RuntimeError('chrome not reachable')