def getArticleList(self, num=None):
    """Collect stored article documents for this account from MongoDB.

    :param num: when given, only the ``num`` most recently published
        articles are returned (sorted by ``p_date`` descending);
        otherwise every stored document is used.
    :return: list of dicts, one per article, restricted to the keys in
        the module-level ``l1llll1lll_wcplus_`` whitelist; any key
        missing from a document is filled with the default value ``-2``.
    """
    from pymongo import DESCENDING
    article_list = []
    col = DB(self.ori_nickname)
    if num:
        # BUG FIX: the original called the sorted cursor object
        # (``.sort(...)()``), which raises TypeError — a pymongo cursor
        # is sliced directly after sorting.
        article_datas = col.table.find().sort('p_date', DESCENDING)[:num]
    else:
        article_datas = col.get()
    begin_time = time.time()
    for doc in article_datas:
        # the document's stable id is derived from its content URL
        doc['id'] = doc['content_url']
        # keep only whitelisted fields, defaulting absent ones to -2
        item = {key: doc.get(key, -2) for key in l1llll1lll_wcplus_}
        article_list.append(item)
    logger.info('解析文章文本用时 %.3f' % (time.time() - begin_time))
    return article_list
def __init__(self):
    """Capture the crawl parameters and initialise per-request pacing state."""
    # snapshot of the captured WeChat request parameter sets
    detail_list = rd.articles_detail()
    self.l1l11ll1l_wcplus_ = detail_list
    self.nickname = detail_list[0]['nickname']
    # storage for the task list and the account's MongoDB collection
    self.l11lll11l_wcplus_ = []
    self.l11ll111l_wcplus_ = DB(self.nickname)
    # spread a fixed 3-second budget evenly across the parameter sets
    self.l11ll1l1ll_wcplus_ = 3.0
    self.l11ll1l111_wcplus_ = len(detail_list)
    self.delay = round(self.l11ll1l1ll_wcplus_ / self.l11ll1l111_wcplus_, 3)
    # timestamp of the most recent request, used for throttling
    self.l11ll1lll1_wcplus_ = time.time()
def __init__(self):
    """Capture crawl parameters and initialise counters for the list crawl."""
    self.articles_detail = rd.articles_detail()
    self.nickname = self.articles_detail[0]['nickname']
    # spread a fixed 2-second budget evenly across the parameter sets
    self.delay_criterion = 2.0
    self.delay = round(self.delay_criterion / len(self.articles_detail), 3)
    # ``length`` counts articles collected so far, so it starts at zero.
    # (The decompiled original assigned it three times — len(), then 0
    # twice; a single final assignment leaves identical state.)
    self.length = 0
    self.data = []
    self.nickname_table = DB(self.nickname)
    self.time_now = time.time()
def l1l1llll1ll_wcplus_(nickname, md5):
    """Open a previously saved article's HTML file in the local browser.

    :param nickname: official-account nickname; also the directory name
        under ``web_server/static/html`` where the HTML files live.
    :param md5: article id, used as the HTML file's basename.
    :return: ``('', 204)`` after launching the browser, or a Chinese
        error string when the article was never collected or its file
        is missing on disk.
    """
    from cmp.db.mongodb import DB
    # only articles whose reading data was collected (comment_id exists)
    # count as "saved"; otherwise refuse with the error message below
    if DB(nickname).count(id=md5, comment_id={'$exists': True}):
        from webbrowser import open
        import os
        if os_version == 'win':
            # NOTE(review): the decompiled source keeps doubled
            # backslashes here (and a single one before the basename);
            # Windows tolerates repeated separators, but confirm this
            # matches the path the saver actually writes.
            file_name = os.getcwd(
            ) + '\\\\web_server\\\\static\\\\html\\\\' + nickname + '\\' + md5 + '.html'
            if os.path.isfile(file_name):
                open(file_name)
            else:
                return '找不到该文章 可能是没有迁移到新版本的WCplus 请先从旧版本的WCplus中复制或移动到新版本的WCplus的web_server/static/html目录下'
        else:
            # POSIX: open via a file:// URL instead of a bare path
            file_name = os.getcwd(
            ) + '/web_server/static/html/' + nickname + '/' + md5 + '.html'
            if os.path.isfile(file_name):
                open('file://' + file_name)
            else:
                return '找不到该文章 可能是没有迁移到新版本的WCplus 请先从旧版本的WCplus中复制或移动到新版本的WCplus的web_server/static/html目录下'
        # browser launched — nothing to render, reply 204 No Content
        return ('', 204)
    return '未保存该文章 请先采集'
def l1ll1111l_wcplus_(_1l1l1ll1l_wcplus_, l1ll11lll_wcplus_=128, process=None):
    """Run a crawl for the explicitly named account.

    NOTE(review): identifiers are decompiler-obfuscated; the semantics
    below are inferred from usage and should be confirmed.

    :param _1l1l1ll1l_wcplus_: account nickname, used as the DB
        collection name and passed to the crawler.
    :param l1ll11lll_wcplus_: presumably a batch/limit size (default
        128) — TODO confirm against ``l1l1l1l11l_wcplus_``.
    :param process: front-end progress reporter, published via a global.
    """
    # publish this run's state through module-level globals so the
    # crawler helpers can reach it
    global l11ll111l_wcplus_
    global l1l1l1111l_wcplus_
    global l1ll1l11ll_wcplus_
    global nickname
    l1ll1l11ll_wcplus_ = process
    l1l1l1111l_wcplus_ = []
    nickname = _1l1l1ll1l_wcplus_
    l11ll111l_wcplus_ = DB(nickname)
    rc = l1ll11lll1_wcplus_()
    l1l1l1lll1_wcplus_ = rc.l1l1l1l11l_wcplus_(
        nickname,
        l1ll11lll_wcplus_=l1ll11lll_wcplus_,
        l1l1111ll1_wcplus_=1,
        l1l1llll11_wcplus_=l1l11lll11_wcplus_())
    # a falsy result means there is nothing to process — bail out early
    if not l1l1l1lll1_wcplus_:
        return
    rc.l1ll1l11l1_wcplus_()
    rc.l1l111ll11_wcplus_()
    l1l1ll1l11_wcplus_.l1l11ll11l_wcplus_()
def l1l1ll1ll_wcplus_(l1ll11lll_wcplus_=128, process=None):
    """Run a crawl for the account named in the captured request data.

    Same flow as the sibling that takes an explicit nickname, except the
    nickname is read from ``rd.articles_detail()``.

    NOTE(review): identifiers are decompiler-obfuscated; semantics are
    inferred from usage and should be confirmed.

    :param l1ll11lll_wcplus_: presumably a batch/limit size (default
        128) — TODO confirm against ``l1l1l1l11l_wcplus_``.
    :param process: front-end progress reporter, published via a global.
    """
    # publish this run's state through module-level globals so the
    # crawler helpers can reach it
    global l11ll111l_wcplus_
    global l1l1l1111l_wcplus_
    global l1ll1l11ll_wcplus_
    global nickname
    l1ll1l11ll_wcplus_ = process
    l1l1l1111l_wcplus_ = []
    from instance import rd
    # derive the nickname from the most recently captured parameters
    nickname = rd.articles_detail()[0]['nickname']
    l11ll111l_wcplus_ = DB(nickname)
    rc = l1ll11lll1_wcplus_()
    l1l1l1lll1_wcplus_ = rc.l1l1l1l11l_wcplus_(
        nickname,
        l1ll11lll_wcplus_=l1ll11lll_wcplus_,
        l1l1111ll1_wcplus_=1,
        l1l1llll11_wcplus_=l1l11lll11_wcplus_())
    # a falsy result means there is nothing to process — bail out early
    if not l1l1l1lll1_wcplus_:
        return
    rc.l1ll1l11l1_wcplus_()
    rc.l1l111ll11_wcplus_()
    l1l1ll1l11_wcplus_.l1l11ll11l_wcplus_()
# uncompyle6 version 3.2.6 # Python bytecode 3.6 (3379) # Decompiled from: Python 3.6.6 (default, Mar 29 2019, 00:03:27) # [GCC 4.8.5 20150623 (Red Hat 4.8.5-36)] # Embedded file name: instance\__init__.py """ 定义全局对象 """ from cmp.db.mongodb import DB crawler_log_table_instance = DB('crawler_log') req_data_table_instance = DB('req_data') from app.crawl_manag.params_handle import HandleParam rd = HandleParam() from app.api.settings import l11l111ll_wcplus_ l1l1111ll_wcplus_ = l11l111ll_wcplus_() from app.crawl_manag import CrawlManage crawler_manager = CrawlManage() from utils.base import osVersion os_version = osVersion()
# uncompyle6 version 3.2.6 # Python bytecode 3.6 (3379) # Decompiled from: Python 3.6.6 (default, Mar 29 2019, 00:03:27) # [GCC 4.8.5 20150623 (Red Hat 4.8.5-36)] # Embedded file name: app\api\settings.py """ 提供数据设置API """ from cmp.db.mongodb import DB setting_table_instance = DB('settings') class l11l111ll_wcplus_: def __int__(self): pass def get(self): """ :return: 获取所有的设置字段{} """ sd = setting_table_instance.get() datas_dict = {} for s in sd: datas_dict[s['key']] = s['value'] from cmp.protect import Passport from utils.network import getLocalIp datas_dict['uuid'] = Passport.getUUid() passport_expire_time = Passport.examplePassport() if not passport_expire_time:
def l111l1l1l_wcplus_(self):
    """Fetch every document stored in this account's collection."""
    return DB(self.nickname).get()
class CrawlHistoryAricleList:
    """Gracefully collect an official account's full history article list.

    If necessary, automatic phone-operation helpers are invoked directly;
    the object's lifetime ends once collection is finished.
    """

    def __init__(self):
        # captured WeChat request parameter sets (one per usable session)
        self.articles_detail = rd.articles_detail()
        self.nickname = self.articles_detail[0]['nickname']
        # spread a fixed 2-second budget evenly across the parameter sets
        self.delay_criterion = 2.0
        self.length = len(self.articles_detail)
        self.delay = round(self.delay_criterion / self.length, 3)
        # NOTE(review): ``length`` is immediately reset (twice) to serve
        # as the collected-articles counter — decompile artifact?
        self.length = 0
        self.length = 0
        self.data = []
        self.nickname_table = DB(self.nickname)
        # timestamp of the most recent request, used for throttling
        self.time_now = time.time()

    def parseHandleArticleList(self, filter=None, process=None):
        """Page through the account's history list until exhausted/filtered.

        :param filter: filter spec, e.g. by time range or by count.
        :param process: front-end progress display instance.
        :return: None; rotates through the captured WeChat parameter
            sets to fetch every history-list page.
        """
        offset = 0
        flag = 1
        cnt = 0
        # 'load_more' marks parameters captured from the history page
        if 'load_more' in self.articles_detail[0]:
            while flag:
                # throttle: wait until at least ``delay`` has elapsed
                while time.time() - self.time_now <= self.delay:
                    time.sleep(0.05)
                self.time_now = time.time()
                # NOTE(review): ``cnt % self.length`` with length == 0 on
                # the first pass would raise ZeroDivisionError — confirm
                # the decompiled statement order.
                article_list = Crawler(offset, self.articles_detail[cnt % self.length]).run()
                article_list = self.check(article_list, offset, cnt)
                flag = int(article_list['des']['can_msg_continue'])
                offset = int(article_list['des']['next_offset'])
                cnt += 1
                self.data = article_list['data']
                self.length += len(self.data)
                # checkFIlter may shrink self.data to the filtered subset
                flag = self.checkFIlter(filter)
                # NOTE(review): length is incremented again after
                # filtering — looks like double counting; verify intent.
                self.length += len(self.data)
                crawler_log_table_instance.insert('id', {'id':self.nickname, 'num':self.length, 'nickname':self.nickname, 'time':datetime.now()})
                process.reportCrawlNum(self.length)
                # 'UPDATE' from save() means we reached already-stored
                # articles — nothing newer remains
                if self.save(self.data) == 'UPDATE':
                    break
                if not flag:
                    break
                time.sleep(self.delay)
        else:
            logger.warning('没有上滑加载更多历史文章')

    def save(self, l1l11111ll_wcplus_):
        """Persist the collected page of articles; returns the insert result."""
        res = None
        res = self.nickname_table.insert('id', l1l11111ll_wcplus_)
        return res

    def checkFIlter(self, filter):
        """Decide whether collection continues based on the filter.

        :param filter: filter spec dict.
        :return: True to keep collecting, False to stop.
        """
        if filter['type'] == 'true':
            # count-based filter: 0 means unlimited
            if int(filter['num']) == 0:
                return True
            if self.length >= int(filter['num']):
                return False
            return True
        else:
            # time-range filter: keep articles inside the window and stop
            # once one older than start_time appears (list is newest-first
            # — presumably; confirm against Crawler output order)
            l11ll1llll_wcplus_ = []
            res = True
            for a in self.data:
                l11ll1l1l1_wcplus_ = a['p_date'].timestamp()
                if l11ll1l1l1_wcplus_ >= filter['start_time'] and l11ll1l1l1_wcplus_ <= filter['end_time']:
                    l11ll1llll_wcplus_.append(a)
                elif l11ll1l1l1_wcplus_ < filter['start_time']:
                    res = False
            self.data = l11ll1llll_wcplus_
            return res

    def check(self, flag, offset, cnt):
        """Pass the request result through the "security check".

        A failed request fails the check; the manager prompts the user to
        re-operate the phone, then the request is retried. The retry may
        fail again, in which case this method recurses.

        :param flag: the request result (or the string 'req_data_error').
        :param offset: the offset used for the failed request.
        :return: a (hopefully successful) request result.
        """
        if flag != 'req_data_error':
            crawler_manager.check({'crawler': '历史文章列表', 'msg': 'success'})
        else:
            crawler_manager.check({'crawler': '历史文章列表', 'msg': 'req_data_error'})
            # refresh the captured parameters; nag the user until some exist
            self.articles_detail = rd.articles_detail()
            while len(self.articles_detail) == 0:
                self.articles_detail = rd.articles_detail()
                from utils.front import sendNotification
                sendNotification('没有发现参数', '参数错误', _type='error')
                time.sleep(3)
            flag = Crawler(offset, self.articles_detail[0]).run()
            # NOTE(review): the recursive result is discarded — the retried
            # ``flag`` is returned even if still an error; verify intent.
            self.check(flag, offset, cnt)
        return flag
class l1l1ll11l_wcplus_:
    """Collect per-article reading statistics (obfuscated class name)."""

    def __init__(self):
        # captured WeChat request parameter sets (one per usable session)
        self.l1l11ll1l_wcplus_ = rd.articles_detail()
        self.nickname = self.l1l11ll1l_wcplus_[0]['nickname']
        # spread a fixed 3-second budget evenly across the parameter sets
        self.l11ll1l1ll_wcplus_ = 3.0
        self.l11ll1l111_wcplus_ = len(self.l1l11ll1l_wcplus_)
        self.delay = round(self.l11ll1l1ll_wcplus_ / self.l11ll1l111_wcplus_, 3)
        # pending task list: [index, content_url, comment_id]
        self.l11lll11l_wcplus_ = []
        self.l11ll111l_wcplus_ = DB(self.nickname)
        # timestamp of the most recent request, used for throttling
        self.l11ll1lll1_wcplus_ = time.time()

    def l1l1l1l11_wcplus_(self, process=None, mov=10):
        """Crawl reading data, rotating through the captured parameters.

        :param process: front-end progress display instance.
        :param mov: 10~17; articles with ``mov`` <= this and no
            ``read_num`` yet are selected.
        """
        # 'getappmsgext' marks parameters captured from an article page
        if 'getappmsgext' in self.l1l11ll1l_wcplus_[0]:
            # articles still lacking reading data, within the mov bound
            l11l1ll11l_wcplus_ = self.l11ll111l_wcplus_.table.find({
                '$and': [{
                    'read_num': {
                        '$exists': False
                    }
                }, {
                    'mov': {
                        '$lte': int(mov)
                    }
                }]
            })
            cnt = 0
            for a in l11l1ll11l_wcplus_:
                # only real article URLs can have reading data
                if 'mp.weixin.qq.com' in a['content_url']:
                    if 'comment_id' not in a:
                        a['comment_id'] = 0
                    self.l11lll11l_wcplus_.append(
                        [cnt, a['content_url'], a['comment_id']])
                    cnt += 1
            for l11l1ll111_wcplus_ in self.l11lll11l_wcplus_:
                # throttle: wait until at least ``delay`` has elapsed
                while time.time() - self.l11ll1lll1_wcplus_ <= self.delay:
                    time.sleep(0.05)
                self.l11ll1lll1_wcplus_ = time.time()
                # round-robin over the parameter sets by task index
                l1ll1l1l1_wcplus_ = l11lll1l1l_wcplus_(
                    l11l1ll111_wcplus_[1], l11l1ll111_wcplus_[2],
                    self.l1l11ll1l_wcplus_[l11l1ll111_wcplus_[0] % self.l11ll1l111_wcplus_]).run()
                l1ll1l1l1_wcplus_ = self.check(l1ll1l1l1_wcplus_, l11l1ll111_wcplus_)
                # key the reading data by the encrypted article URL
                l1ll1l1l1_wcplus_['id'] = encryptString(l11l1ll111_wcplus_[1])
                self.l11ll111l_wcplus_.insert('id', l1ll1l1l1_wcplus_)
                process.l11l1lll1_wcplus_(l11l1ll111_wcplus_[0] + 1,
                                          len(self.l11lll11l_wcplus_),
                                          self.delay)
        else:
            logger.warning('点击查看该公众号的任意一篇文章且出现阅读量')

    def save(self, l1ll1l1l1_wcplus_):
        """Persist the reading data (no-op; saving happens inline above)."""
        pass

    def l11111l11_wcplus_(self):
        """Yield tasks for the multithreaded mode: {'index', 'url'} dicts."""
        for item in self.l11lll11l_wcplus_:
            yield {'index': item[0], 'url': item[1]}

    def l1llllll1l_wcplus_(self, task):
        """Task handler for the multithreaded mode — fire one request."""
        l11lll1l1l_wcplus_(
            task['url'],
            self.l1l11ll1l_wcplus_[task['index'] % self.l11ll1l111_wcplus_]).run()

    def check(self, l1ll1l1l1_wcplus_, item):
        """Pass the request result through the "security check".

        A failed request fails the check; the manager prompts the user to
        re-operate the phone, then the request is retried. The retry may
        fail again, in which case this method recurses — one way or
        another a successful result is pursued and returned.
        """
        if l1ll1l1l1_wcplus_ != 'req_data_error':
            crawler_manager.check({'crawler': '阅读数据', 'msg': 'success'})
        else:
            crawler_manager.check({'crawler': '阅读数据', 'msg': 'req_data_error'})
            # refresh the captured parameters; nag the user until some exist
            self.l1l11ll1l_wcplus_ = rd.articles_detail()
            while len(self.l1l11ll1l_wcplus_) == 0:
                self.l1l11ll1l_wcplus_ = rd.articles_detail()
                from utils.front import sendNotification
                sendNotification('没有发现参数', '参数错误', _type='error')
                time.sleep(3)
            l1ll1l1l1_wcplus_ = l11lll1l1l_wcplus_(
                item[1], item[2], self.l1l11ll1l_wcplus_[0]).run()
            # NOTE(review): the recursive result is discarded — the retried
            # value is returned even if still an error; verify intent.
            self.check(l1ll1l1l1_wcplus_, item)
        return l1ll1l1l1_wcplus_