def prepare_docs(self, num=None):
    """
    :param num: if a number is given, prepare the num most recently published articles
    :return: all fields of every article (or of the latest num articles) of the account
             identified by its nickname; missing fields get a default value
    """
    from pymongo import DESCENDING
    doc_list = []
    # Fetch the article list from the database
    col = CollectionOperation(self.nickname_raw)
    if num:
        db_docs = col.table.find().sort("p_date", DESCENDING)[:num]
    else:
        db_docs = col.get()
    begin_time = time.time()
    # Build the doc list from the keys declared in doc_schema
    for doc in db_docs:
        item = {}
        doc['id'] = doc['content_url']
        for key in doc_schema:
            if key in doc:
                item[key] = doc[key]
            # Fill with -2 when the field is missing in the database
            else:
                item[key] = -2
        doc_list.append(item)
    logger.info('解析文章文本用时 %.3f' % (time.time() - begin_time))
    return doc_list
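# Usage sketch (hypothetical caller, illustrative only; assumes this method lives on an
# exporter-style object whose nickname_raw identifies the target account):
#   docs = exporter.prepare_docs(num=50)   # the 50 most recently published articles
#   docs = exporter.prepare_docs()         # every stored article
# Each returned dict carries exactly the keys in doc_schema, with -2 standing in for
# fields the database does not have yet.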
def get(self):
    """
    :return: the list of all collected official accounts
    """
    info_data = []
    gzh_num = 0
    total_article_num = 0
    gzhs = col_crawler_log.get()
    cnt = 1
    for i in gzhs:
        gzh_num += 1
        col_data = CollectionOperation(i['nickname'])
        table_line_data = {}
        # Total number of articles
        total_num = col_data.count()
        total_article_num += total_num
        # Number of articles that already have reading data
        article_num = col_data.count(read_num={'$gt': -2})
        # Build one table row
        table_line_data['id'] = cnt
        table_line_data['nickname'] = i['nickname']
        table_line_data['total_articles'] = total_num
        table_line_data['reading_data_articles'] = article_num
        table_line_data['time'] = i['time'].timestamp()
        cnt += 1
        info_data.append(table_line_data)
    return {
        'finished': info_data,
        'stat_data': {
            'gzh_num': gzh_num,
            'article_num': total_article_num
        }
    }
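# Note: this return shape is consumed elsewhere as Finished().get()['finished'], which
# iterates one row per collected account (nickname, article counts, and the last crawl
# time as a unix timestamp).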
def __init__(self):
    self.wx_req_data_list = rd.tidy()
    self.nickname = self.wx_req_data_list[0]['nickname']
    self.every_delay = 3.0
    self.wx_num = len(self.wx_req_data_list)
    self.delay = round(self.every_delay / self.wx_num, 3)
    self.articles = []
    self.col_data = CollectionOperation(self.nickname)
    self.pre_crawl_time = time.time()
def delete_like(article_info):
    """
    :param article_info: {nickname, content_url}
    :return: remove the article from favourites
    """
    # Remove it from the favourites collection
    col_like.delete(content_url=article_info['content_url'])
    # Reset the like flag in the account's original collection
    col_data = CollectionOperation(article_info['nickname'])
    article_data = col_data.get(content_url=article_info['content_url'])[0]
    article_data['like_folder'] = False
    col_data.insert(key='content_url', data=article_data)
def run(self, command, cmd_args):
    from app.api.gzh import Finished
    from cmp.db.mongo import CollectionOperation
    # Convert the 'mov' field of every stored article from str to int and rewrite it
    for gzh in Finished().get()['finished']:
        col = CollectionOperation(gzh['nickname'])
        articles_buffer = []
        for a in col.get():
            if isinstance(a['mov'], str):
                a['mov'] = int(a['mov'])
            articles_buffer.append(a)
        col.insert('id', articles_buffer)
        logger.info('转化完成 %d %s' % (len(articles_buffer), gzh['nickname']))
        command_log('转化完成 %d %s' % (len(articles_buffer), gzh['nickname']))
def add_like(article_info):
    """
    :param article_info: {nickname, content_url}
    :return: add the article to favourites
    """
    # Fetch the original record by nickname and content_url
    col_data = CollectionOperation(article_info['nickname'])
    article_data = col_data.get(content_url=article_info['content_url'])[0]
    # Mark the article as favourited in the original collection
    article_data['like_folder'] = True
    col_data.insert(key='content_url', data=article_data)
    # Record when it was favourited
    article_data['like_time'] = datetime.now()
    # Insert it into the favourites collection
    res = col_like.insert(key='content_url', data=article_data)
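# Usage sketch (illustrative values only): both helpers expect the account nickname and
# the article's content_url, which together locate the record.
#   add_like({'nickname': 'some_account', 'content_url': 'http://mp.weixin.qq.com/s?...'})
#   delete_like({'nickname': 'some_account', 'content_url': 'http://mp.weixin.qq.com/s?...'})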
def get_html_doc(nickname, md5):
    """
    :param nickname: official account nickname
    :param md5: md5 of the article url, used as the html file name
    :return: open the article's saved static html file in the browser
    """
    from cmp.db.mongo import CollectionOperation
    if CollectionOperation(nickname).count(id=md5, comment_id={'$exists': True}):
        from webbrowser import open
        import os
        if PLATFORM == 'win':
            file_name = os.getcwd() + '\\web_server\\static\\html\\' + nickname + '\\' + md5 + '.html'
            if os.path.isfile(file_name):
                open(file_name)
            else:
                return '找不到该文章 可能是没有迁移到新版本的WCplus 请先从旧版本的WCplus中复制或移动到新版本的WCplus的web_server/static/html目录下'
        else:
            file_name = os.getcwd() + '/web_server/static/html/' + nickname + '/' + md5 + '.html'
            if os.path.isfile(file_name):
                open('file://' + file_name)
            else:
                return '找不到该文章 可能是没有迁移到新版本的WCplus 请先从旧版本的WCplus中复制或移动到新版本的WCplus的web_server/static/html目录下'
        return ('', 204)
    else:
        return '未保存该文章 请先采集'
def __init__(self):
    # Request parameters obtained from the data source
    self.wx_req_data_list = rd.tidy()
    # Nickname of the target official account
    self.nickname = self.wx_req_data_list[0]['nickname']
    # Delay between two requests from the same WeChat account
    self.every_delay = 3.0
    # Number of WeChat accounts taking part in the crawl
    self.wx_num = len(self.wx_req_data_list)
    # Effective delay when several WeChat accounts are used
    self.delay = round(self.every_delay / self.wx_num, 3)
    # All articles that need to be crawled
    self.articles = []
    # Database access
    self.col_data = CollectionOperation(self.nickname)
    # Time of the previous request
    self.pre_crawl_time = time.time()
def get_article_list(self, page_info):
    """
    :param page_info: {'nickname','start','end'}
    :return: the article list of one official account, sliced by start/end
    """
    col_data = CollectionOperation(page_info['nickname'])
    info_data = []
    cnt = 1
    articles = col_data.get()[int(page_info['start']):int(page_info['end'])]
    for a in articles:
        item = {}
        item['id'] = cnt
        item['mov'] = a['mov']
        # Use '-' for reading data that has not been crawled yet
        item['read'] = a.get('read_num', '-')
        item['like'] = a.get('like_num', '-')
        item['reward'] = a.get('reward_num', '-')
        item['comment'] = a.get('comment_num', '-')
        item['date'] = a['p_date'].timestamp()
        item['title'] = a['title']
        item['url'] = a['content_url']
        item['md5'] = get_md5(a['content_url'])
        cnt += 1
        info_data.append(item)
    return info_data
def get_article_list(self, page_info, **kwargs):
    """
    :param page_info: {'nickname','start','end'}
    :param kwargs: extra MongoDB query conditions passed straight to find()
    :return: the article list of one official account, sliced by start/end
    """
    from pymongo import DESCENDING
    col_data = CollectionOperation(page_info['nickname'])
    info_data = []
    cnt = 1
    # Slice of the articles, sorted by publish date, newest first
    articles = col_data.table.find(kwargs).sort(
        "p_date", DESCENDING)[int(page_info['start']):int(page_info['end'])]
    for a in articles:
        item = {}
        item['id'] = cnt
        item['mov'] = str(a['mov'])
        # Use '-' for reading data that has not been crawled yet
        item['read'] = a.get('read_num', '-')
        item['like'] = a.get('like_num', '-')
        item['reward'] = a.get('reward_num', '-')
        item['comment'] = a.get('comment_num', '-')
        # 1 if the article has been added to favourites
        item['like_folder'] = 1 if a.get('like_folder') else 0
        item['date'] = a['p_date'].timestamp()
        item['title'] = a['title']
        item['url'] = a['content_url']
        item['md5'] = get_md5(a['content_url'])
        cnt += 1
        info_data.append(item)
    return info_data
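# Usage sketch (illustrative values; article_api stands for an instance of the enclosing
# class): fetch the 20 newest articles of one account that already have reading data,
# using the extra kwargs as a MongoDB filter on the find() call.
#   page = {'nickname': 'some_account', 'start': 0, 'end': 20}
#   rows = article_api.get_article_list(page, read_num={'$gt': -2})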
def get_html_doc(nickname, md5):
    """
    :param nickname: official account nickname
    :param md5: md5 of the article url, used as the html file name
    :return: open the article's saved static html file in the browser
    """
    from cmp.db.mongo import CollectionOperation
    if CollectionOperation(nickname).count(id=md5, comment_id={'$exists': True}):
        from webbrowser import open
        import os
        file_name = os.getcwd() + '/web_server/static/html/' + nickname + '/' + md5 + '.html'
        open(file_name)
        return ('', 204)
    return '未保存该文章 请先采集'
def get_all_article(worker_num=5, process=None):
    global article_data_buffer
    global col_data
    global front_process
    global nickname
    front_process = process
    article_data_buffer = []
    from instance import rd
    nickname = rd.tidy()[0]['nickname']
    col_data = CollectionOperation(nickname)
    rc = RequestContent()
    rc.prepare_articles(nickname, worker_num=16, ip_num=1, need_proxy=use_proxy_directly())
    rc.run_crawlers()
    rc.join_cralwers()
    TaskRecoder.print_ts()
class AricleList:
    """
    Gracefully fetch the complete history article list of one official account.
    If necessary, call the phone-automation routine directly.
    The object's lifetime ends once the crawl is finished.
    """

    def __init__(self):
        self.wx_req_data_list = rd.tidy()
        self.nickname = self.wx_req_data_list[0]['nickname']
        self.every_delay = 2.0
        self.wx_num = len(self.wx_req_data_list)
        self.delay = round(self.every_delay / self.wx_num, 3)
        self.article_num = 0
        self.all_article_num = 0
        self.current_article_list = []
        self.col_data = CollectionOperation(self.nickname)
        self.pre_crawl_time = time.time()

    def get_all_article_list(self, filter=None, process=None):
        """
        :param filter: filter, e.g. by time range or by article count
        :param process: front-end progress reporter instance
        :return: call the WeChat accounts in the list in turn to fetch the whole history article list
        """
        offset = 0
        can_msg_continue = 1
        cnt = 0
        if 'load_more' in self.wx_req_data_list[0]:
            while can_msg_continue:
                while time.time() - self.pre_crawl_time <= self.delay:
                    time.sleep(0.05)
                self.pre_crawl_time = time.time()
                list_data = Crawler(
                    offset, self.wx_req_data_list[cnt % self.wx_num]).run()
                list_data = self.check(list_data, offset, cnt)
                can_msg_continue = int(list_data['des']['can_msg_continue'])
                offset = int(list_data['des']['next_offset'])
                cnt += 1
                self.current_article_list = list_data['data']
                self.article_num += len(self.current_article_list)
                filter_res = self.filter_check(filter)
                self.all_article_num += len(self.current_article_list)
                col_crawler_log.insert(
                    'id', {
                        'id': self.nickname,
                        'num': self.all_article_num,
                        'nickname': self.nickname,
                        'time': datetime.now()
                    })
                process.new_article_list(self.all_article_num)
                if self.save(self.current_article_list) == 'UPDATE':
                    break
                if not filter_res:
                    break
                time.sleep(self.delay)
        else:
            logger.warning('没有上滑加载更多历史文章')

    def save(self, list_data):
        """
        :return: save the data
        """
        res = None
        res = self.col_data.insert('id', list_data)
        return res

    def filter_check(self, filter):
        """
        :param filter:
        :return: based on the filter, decide whether to keep collecting the
                 article list: True to continue, False to stop
        """
        if filter['type'] == 'true':
            if int(filter['num']) == 0:
                return True
            if self.article_num >= int(filter['num']):
                return False
            return True
        else:
            use_article_list = []
            res = True
            for a in self.current_article_list:
                p_date_timestamp = a['p_date'].timestamp()
                if p_date_timestamp >= filter['start_time'] and p_date_timestamp <= filter['end_time']:
                    use_article_list.append(a)
                elif p_date_timestamp < filter['start_time']:
                    res = False
            self.current_article_list = use_article_list
            return res

    def check(self, list_data, offset, cnt):
        """
        :param list_data: result returned by the request
        :param offset:
        :return: take this request's parameters and result through the checkpoint.
                 A failed request fails the check; the checkpoint asks the user to
                 operate the phone again, after which the request is retried. The
                 retry may fail as well, in which case check() calls itself again.
        """
        if list_data != 'req_data_error':
            stop_and_start.check({'crawler': '历史文章列表', 'msg': 'success'})
        else:
            stop_and_start.check({
                'crawler': '历史文章列表',
                'msg': 'req_data_error'
            })
            self.wx_req_data_list = rd.tidy()
            while len(self.wx_req_data_list) == 0:
                self.wx_req_data_list = rd.tidy()
                from utils.front import notification
                notification('没有发现参数', '参数错误', _type='error')
                time.sleep(3)
            list_data = Crawler(offset, self.wx_req_data_list[0]).run()
            # Keep the result of the recursive check so only validated data is returned
            list_data = self.check(list_data, offset, cnt)
        return list_data
# Embedded file name: instance\__init__.py
"""
Global objects shared across the application
"""
from cmp.db.mongo import CollectionOperation
col_crawler_log = CollectionOperation('crawler_log')
col_req_data = CollectionOperation('req_data')
from app.weixin_crawler.req_data import ReqData
rd = ReqData()
from app.api.settings import Settings
user_settings = Settings()
from app.weixin_crawler import Stop
stop_and_start = Stop()
from utils.base import the_platform
PLATFORM = the_platform()
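# These module-level objects are imported directly by the rest of the project,
# e.g. `from instance import rd` as in get_all_article above.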
class ReadingData:

    def __init__(self):
        self.wx_req_data_list = rd.tidy()
        self.nickname = self.wx_req_data_list[0]['nickname']
        self.every_delay = 3.0
        self.wx_num = len(self.wx_req_data_list)
        self.delay = round(self.every_delay / self.wx_num, 3)
        self.articles = []
        self.col_data = CollectionOperation(self.nickname)
        self.pre_crawl_time = time.time()

    def get_all_reading_data(self, filter=None, process=None):
        """
        :param filter:
        :return: call the WeChat parameters in wx_req_data_list in turn to
                 collect every article's reading data
        """
        if 'getappmsgext' in self.wx_req_data_list[0]:
            raw_articles = self.col_data.get(read_num={'$exists': False})
            cnt = 0
            for a in raw_articles:
                if 'mp.weixin.qq.com' in a['content_url']:
                    if 'comment_id' not in a:
                        a['comment_id'] = 0
                    self.articles.append(
                        [cnt, a['content_url'], a['comment_id']])
                    cnt += 1
            for itme in self.articles:
                while time.time() - self.pre_crawl_time <= self.delay:
                    time.sleep(0.05)
                self.pre_crawl_time = time.time()
                reading_data = Crawler(
                    itme[1], itme[2],
                    self.wx_req_data_list[itme[0] % self.wx_num]).run()
                reading_data = self.check(reading_data, itme)
                reading_data['id'] = get_md5(itme[1])
                self.col_data.insert('id', reading_data)
                process.new_reading_data(itme[0] + 1, len(self.articles),
                                         self.delay)
        else:
            logger.warning('点击查看该公众号的任意一篇文章且出现阅读量')

    def save(self, reading_data):
        """
        :param reading_data:
        :return: save the data
        """
        pass

    def prepare_task(self):
        """
        :return: prepare tasks for the multi-threaded mode
        """
        for item in self.articles:
            yield {'index': item[0], 'url': item[1]}

    def task_handler(self, task):
        """
        :return: task handler for the multi-threaded mode
        """
        Crawler(task['url'],
                self.wx_req_data_list[task['index'] % self.wx_num]).run()

    def check(self, reading_data, item):
        """
        :return: take this request's parameters and result through the checkpoint.
                 A failed request fails the check; the checkpoint asks the user to
                 operate the phone again, after which the request is retried. The
                 retry may fail as well, in which case check() calls itself again
                 until a successful response is obtained, which is what is returned.
        """
        if reading_data != 'req_data_error':
            stop_and_start.check({'crawler': '阅读数据', 'msg': 'success'})
        else:
            stop_and_start.check({'crawler': '阅读数据', 'msg': 'req_data_error'})
            self.wx_req_data_list = rd.tidy()
            while len(self.wx_req_data_list) == 0:
                self.wx_req_data_list = rd.tidy()
                from utils.front import notification
                notification('没有发现参数', '参数错误', _type='error')
                time.sleep(3)
            reading_data = Crawler(item[1], item[2],
                                   self.wx_req_data_list[0]).run()
            # Keep the result of the recursive check so only validated data is returned
            reading_data = self.check(reading_data, item)
        return reading_data
class ReadingData():

    def __init__(self):
        # Request parameters obtained from the data source
        self.wx_req_data_list = rd.tidy()
        # Nickname of the target official account
        self.nickname = self.wx_req_data_list[0]['nickname']
        # Delay between two requests from the same WeChat account
        self.every_delay = 3.0
        # Number of WeChat accounts taking part in the crawl
        self.wx_num = len(self.wx_req_data_list)
        # Effective delay when several WeChat accounts are used
        self.delay = round(self.every_delay/self.wx_num, 3)
        # All articles that need to be crawled
        self.articles = []
        # Database access
        self.col_data = CollectionOperation(self.nickname)
        # Time of the previous request
        self.pre_crawl_time = time.time()

    def get_all_reading_data(self, process=None, mov=10):
        """
        :param mov: 10~17
        :return: call the WeChat parameters in wx_req_data_list in turn to
                 collect every article's reading data
        """
        # Collect the url of every article that needs crawling, hand the url and
        # other parameters to a new crawler object, then save the data
        if 'getappmsgext' in self.wx_req_data_list[0]:
            # Fetch the list of articles to crawl from the database
            # raw_articles = self.col_data.get(read_num={"$exists": False})
            # Select articles that have no reading data yet and whose position is not above mov
            raw_articles = self.col_data.table.find({"$and": [
                {"read_num": {"$exists": False}},
                {"mov": {"$lte": int(mov)}}]})
            # Crawling reading data takes a long time; buffer the article list so the
            # database cursor is not held open for the whole run
            cnt = 0
            for a in raw_articles:
                # [cnt, url, comment_id]
                if "mp.weixin.qq.com" in a['content_url']:
                    # Reading data may be crawled before the article body, in which
                    # case there is no comment_id yet
                    if 'comment_id' not in a:
                        a['comment_id'] = 0
                    self.articles.append([cnt, a['content_url'], a['comment_id']])
                    cnt += 1
            # Crawl one article at a time
            for itme in self.articles:
                while time.time()-self.pre_crawl_time <= self.delay:
                    time.sleep(0.05)
                self.pre_crawl_time = time.time()
                reading_data = Crawler(itme[1], itme[2],
                                       self.wx_req_data_list[itme[0] % self.wx_num]).run()
                # Run the checkpoint and use its result, which is guaranteed to be valid
                reading_data = self.check(reading_data, itme)
                # Check passed
                reading_data['id'] = get_md5(itme[1])
                self.col_data.insert('id', reading_data)
                # Send progress data to the front end
                process.new_reading_data(itme[0]+1, len(self.articles), self.delay)
            # Crawling all articles at once with multiple threads proved unusable in
            # testing: it gets rate-limited easily
            # from cmp.mt import run_mt
            # run_mt(len(self.articles), self.prepare_task, self.task_handler)
        else:
            logger.warning('点击查看该公众号的任意一篇文章且出现阅读量')

    def save(self, reading_data):
        """
        :param reading_data:
        :return: save the data
        """
        pass

    def prepare_task(self):
        """
        :return: prepare tasks for the multi-threaded mode
        """
        for item in self.articles:
            yield {'index': item[0], 'url': item[1]}

    def task_handler(self, task):
        """
        :return: task handler for the multi-threaded mode
        """
        Crawler(task['url'], self.wx_req_data_list[task['index'] % self.wx_num]).run()

    def check(self, reading_data, item):
        """
        :return: take this request's parameters and result through the checkpoint.
                 A failed request fails the check; the checkpoint asks the user to
                 operate the phone again, after which the request is retried. The
                 retry may fail as well, in which case check() calls itself again
                 until a successful response is obtained, which is what is returned.
        """
        if reading_data != 'req_data_error':
            stop_and_start.check({'crawler': '阅读数据', 'msg': 'success'})
        else:
            # Go through the checkpoint first; it prompts the user to refresh the parameters
            stop_and_start.check({'crawler': '阅读数据', 'msg': 'req_data_error'})
            # Parameters refreshed: reload them from the data source and retry the request
            self.wx_req_data_list = rd.tidy()
            # The parameters may have been deleted by the user; keep checking in a loop
            while len(self.wx_req_data_list) == 0:
                self.wx_req_data_list = rd.tidy()
                from utils.front import notification
                notification('没有发现参数', '参数错误', _type='error')
                time.sleep(3)
            reading_data = Crawler(item[1], item[2], self.wx_req_data_list[0]).run()
            # Check the retried result as well, and keep the validated data it returns
            reading_data = self.check(reading_data, item)
        return reading_data
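# Usage sketch (illustrative; assumes a front-end progress object exposing
# new_reading_data(index, total, delay), as called above):
#   ReadingData().get_all_reading_data(process=front_process, mov=12)
# collects reading data only for articles that have no read_num field yet and whose
# 'mov' position index is at most 12.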
def prepare_data(self):
    data_gen = CollectionOperation(self.nickname).get()
    return data_gen
""" 提供数据设置API """ from cmp.db.mongo import CollectionOperation col_settings = CollectionOperation('settings') class Settings(): def __int__(self): pass def get(self): """ :return: 获取所有的设置字段{} """ sd = col_settings.get() settings_data = {} for s in sd: settings_data[s['key']] = s['value'] # 注入代理ip地址 from utils.network import get_ip settings_data['proxy_server'] = get_ip() return settings_data def insert(self, settings_data_dict): """ :param settings_data_dict: settings数据本质上是一个字典 :return: 插入或修改 """ # 将dict转化为list 例如 {'name':'Frank Wang', 'age':18} -> [{'key':'name', 'value':'Frank Wang'},{'key':'age', 'value':18}] settings_data_list = []
class AricleList():
    """
    Gracefully fetch the complete history article list of one official account.
    If necessary, call the phone-automation routine directly.
    The object's lifetime ends once the crawl is finished.
    """

    def __init__(self):
        # Request parameters obtained from the data source
        self.wx_req_data_list = rd.tidy()
        # Nickname of the target official account
        self.nickname = self.wx_req_data_list[0]['nickname']
        # Delay between two requests from the same WeChat account
        self.every_delay = 2.0
        # Number of WeChat accounts taking part in the crawl
        self.wx_num = len(self.wx_req_data_list)
        # Effective delay when several WeChat accounts are used
        self.delay = round(self.every_delay/self.wx_num, 3)
        # Number of articles collected so far
        self.article_num = 0
        # Total number of articles collected
        self.all_article_num = 0
        # Current batch of articles
        self.current_article_list = []
        # Database access
        self.col_data = CollectionOperation(self.nickname)
        # Time of the previous request
        self.pre_crawl_time = time.time()

    def get_all_article_list(self, filter=None, process=None):
        """
        :param filter: filter, e.g. by time range or by article count
        :param process: front-end progress reporter instance
        :return: call the WeChat accounts in the list in turn to fetch the whole history article list
        """
        offset = 0
        can_msg_continue = 1
        cnt = 0
        if 'load_more' in self.wx_req_data_list[0]:
            while can_msg_continue:
                while time.time()-self.pre_crawl_time <= self.delay:
                    time.sleep(0.05)
                self.pre_crawl_time = time.time()
                list_data = Crawler(offset, self.wx_req_data_list[cnt % self.wx_num]).run()
                # Run the checkpoint; an invalid result blocks here. Use the checked
                # result, which is guaranteed to be valid
                list_data = self.check(list_data, offset, cnt)
                # Check passed: continue processing
                can_msg_continue = int(list_data['des']['can_msg_continue'])
                offset = int(list_data['des']['next_offset'])
                cnt += 1
                self.current_article_list = list_data['data']
                self.article_num += len(self.current_article_list)
                filter_res = self.filter_check(filter)
                self.all_article_num += len(self.current_article_list)
                # Write the crawl log entry
                col_crawler_log.insert('id', {'id': self.nickname,
                                              'num': self.all_article_num,
                                              'nickname': self.nickname,
                                              'time': datetime.now()})
                process.new_article_list(self.all_article_num)
                # Save the data
                if self.save(self.current_article_list) == 'UPDATE':
                    break
                # Stop when the filter condition is no longer met
                if not filter_res:
                    break
                time.sleep(self.delay)
        else:
            logger.warning('没有上滑加载更多历史文章')

    def save(self, list_data):
        """
        :return: save the data
        """
        res = None
        res = self.col_data.insert('id', list_data)
        return res

    def filter_check(self, filter):
        """
        :param filter:
        :return: based on the filter, decide whether to keep collecting the
                 article list: True to continue, False to stop
        """
        # Count-based filtering
        if filter['type'] == 'true':
            # 0 means collect every article
            if int(filter['num']) == 0:
                return True
            # The article count already exceeds the filter setting
            if self.article_num >= int(filter['num']):
                return False
            # The article count has not reached the filter setting yet
            else:
                return True
        # Time-based filtering
        else:
            use_article_list = []
            res = True
            # Drop articles outside the date range
            for a in self.current_article_list:
                p_date_timestamp = a['p_date'].timestamp()
                # Inside the range: keep and continue
                if (p_date_timestamp >= filter['start_time']) and (p_date_timestamp <= filter['end_time']):
                    use_article_list.append(a)
                # Older than the start time: everything after this is older still, so stop
                elif p_date_timestamp < filter['start_time']:
                    res = False
            self.current_article_list = use_article_list
            return res

    def check(self, list_data, offset, cnt):
        """
        :param list_data: result returned by the request
        :param offset:
        :return: take this request's parameters and result through the checkpoint.
                 A failed request fails the check; the checkpoint asks the user to
                 operate the phone again, after which the request is retried. The
                 retry may fail as well, in which case check() calls itself again.
        """
        if list_data != 'req_data_error':
            stop_and_start.check({'crawler': '历史文章列表', 'msg': 'success'})
        else:
            # Go through the checkpoint first; it prompts the user to refresh the parameters
            stop_and_start.check({'crawler': '历史文章列表', 'msg': 'req_data_error'})
            # Parameters refreshed: reload them from the data source and retry the request
            self.wx_req_data_list = rd.tidy()
            # The parameters may have been deleted by the user; keep checking in a loop
            while len(self.wx_req_data_list) == 0:
                self.wx_req_data_list = rd.tidy()
                from utils.front import notification
                notification('没有发现参数', '参数错误', _type='error')
                time.sleep(3)
            list_data = Crawler(offset, self.wx_req_data_list[0]).run()
            # Check the retried result as well, and keep the validated data it returns
            list_data = self.check(list_data, offset, cnt)
        return list_data
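# Filter shapes accepted by filter_check above (inferred from its two branches; the
# values shown are illustrative):
#   count-based: {'type': 'true', 'num': 100}   # 'num' of 0 means collect everything
#   time-based:  {'type': 'time', 'start_time': ts0, 'end_time': ts1}  # unix timestamps;
#                any 'type' other than 'true' selects this branch
# Usage sketch (front_process stands for a hypothetical progress reporter):
#   AricleList().get_all_article_list(filter={'type': 'true', 'num': 0}, process=front_process)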