def response(self, l1lll1111l1_wcplus_):
    """Store request parameters and nickname data for matching flows.

    Every entry of the module-level URL filter mapping whose value occurs in
    the request URL is processed in turn:

    * the request is converted into a parameter structure,
    * on a ``'home'`` request the module-level account identifier is
      refreshed from those parameters,
    * the parameters are stored under ``'<identifier>.<key>.req'``,
    * ``'getappmsgext'`` responses are parsed as JSON to map the reported
      nickname to the identifier, and ``'home'`` responses are scanned for
      the ``var nickname = "..."`` assignment to store the current nickname.

    The whole call returns early, storing nothing more, as soon as the
    identifier equals ``'UNK'``.

    :param l1lll1111l1_wcplus_: flow object exposing ``.request`` and
        ``.response`` (presumably a mitmproxy HTTP flow - confirm).
    """
    # The identifier is only refreshed by 'home' requests; every other
    # request reuses the value kept in the module-level global.
    global l11l11llll_wcplus_
    flow = l1lll1111l1_wcplus_

    for key, url_fragment in l1lll111111_wcplus_.items():
        if url_fragment not in flow.request.url:
            continue

        # Returns the request parameters and a timestamp; the latter is unused.
        request_data, _ = l1lll111lll_wcplus_.l1lll1111ll_wcplus_(flow.request)
        if key == 'home':
            l11l11llll_wcplus_ = self._1ll1llll1l_wcplus_(request_data)
        # Without a usable identifier nothing is stored.
        if l11l11llll_wcplus_ == 'UNK':
            return

        key_name = '%s.%s.req' % (l11l11llll_wcplus_, key)
        l1lll11ll11_wcplus_(key_name, request_data)
        logger.debug(key_name)

        if key == 'getappmsgext':
            _, body = l1lll111lll_wcplus_.get_response(flow.response)
            payload = json.loads(body)
            nick_name = payload['nick_name'] if 'nick_name' in payload else 'UNK'
            if nick_name != 'UNK':
                l1lll11ll11_wcplus_(nick_name + '.nick_name', l11l11llll_wcplus_)
            else:
                logger.debug('没能找到微信昵称 换一篇文章点击试试看 确保文章底部阅读数据出现')
        elif key == 'home':
            _, page = l1lll111lll_wcplus_.get_response(flow.response)
            # The nickname sits between 'var nickname = "' and '" || ""'.
            after_marker = page.split('var nickname = "')[1]
            current_nickname = after_marker.split('" || ""')[0]
            logger.info('准备公众号:' + current_nickname)
            l1lll11ll11_wcplus_('current_nickname', current_nickname)
def response(self, flow):
    """Store request parameters and nickname data for matching flows.

    For each ``url_filter`` entry whose value occurs in the request URL the
    request parameters are extracted, the account identifier (``wxuin``) is
    derived from them, and the parameters are stored under
    ``'<wxuin>.<key>.req'``.  ``'getappmsgext'`` responses are additionally
    parsed as JSON to map the reported nickname to the identifier, and
    ``'home'`` responses are scanned for the ``var nickname = "..."``
    assignment to store the current nickname.

    The whole call returns early as soon as the identifier equals ``'UNK'``.

    :param flow: flow object exposing ``.request`` and ``.response``
        (presumably a mitmproxy HTTP flow - confirm).
    """
    for key, url_fragment in url_filter.items():
        if url_fragment not in flow.request.url:
            continue

        # Returns the request parameters and a timestamp; the latter is unused.
        req_data, _ = ExtractFlow.format_request_data(flow.request)
        wxuin = self._extract_wxuin(req_data)
        # Without a usable identifier nothing is stored.
        if wxuin == 'UNK':
            return

        key_name = '%s.%s.req' % (wxuin, key)
        insert_helper(key_name, req_data)
        logger.debug(key_name)

        if key == 'getappmsgext':
            _, body = ExtractFlow.get_response(flow.response)
            payload = json.loads(body)
            nick_name = payload['nick_name'] if 'nick_name' in payload else 'UNK'
            if nick_name != 'UNK':
                insert_helper(nick_name + '.nick_name', wxuin)
            else:
                logger.debug('没能找到微信昵称 换一篇文章点击试试看 确保文章底部阅读数据出现')
        elif key == 'home':
            _, page = ExtractFlow.get_response(flow.response)
            # The nickname sits between 'var nickname = "' and '" || ""'.
            after_marker = page.split('var nickname = "')[1]
            current_nickname = after_marker.split('" || ""')[0]
            logger.info('准备公众号:' + current_nickname)
            insert_helper('current_nickname', current_nickname)
def response(self, flow):
    """Store request parameters and nickname data for matching flows.

    For each ``url_filter`` entry whose value occurs in the request URL the
    request parameters are extracted and stored under
    ``'<wxuin>.<key>.req'``.  The account identifier ``wxuin`` is a
    module-level global that is only refreshed on ``'home'`` requests.
    ``'getappmsgext'`` responses are parsed as JSON to map the reported
    nickname to the identifier (this pairing is what allows several WeChat
    accounts to be collected at the same time), and ``'home'`` responses are
    scanned for the ``var nickname = "..."`` assignment to store the current
    nickname.

    The whole call returns early as soon as the identifier equals ``'UNK'``.

    :param flow: flow object exposing ``.request`` and ``.response``
        (presumably a mitmproxy HTTP flow - confirm).
    """
    # Only 'home' requests refresh the identifier; all others reuse it.
    global wxuin

    # Check whether the URL is one of those listed in the filter.
    for key, url_fragment in url_filter.items():
        if url_fragment not in flow.request.url:
            continue

        # Convert the request into its parameters plus a timestamp (unused).
        req_data, _ = ExtractFlow.format_request_data(flow.request)
        if key == 'home':
            wxuin = self._extract_wxuin(req_data)
        # No usable identifier: do not store the parameters.
        if wxuin == 'UNK':
            return

        key_name = '%s.%s.req' % (wxuin, key)
        insert_helper(key_name, req_data)
        logger.debug(key_name)

        if key == 'getappmsgext':
            # Look up the nickname of the current WeChat account.
            _, body = ExtractFlow.get_response(flow.response)
            payload = json.loads(body)
            nick_name = payload['nick_name'] if 'nick_name' in payload else 'UNK'
            if nick_name != 'UNK':
                insert_helper(nick_name + '.nick_name', wxuin)
            else:
                logger.debug('没能找到微信昵称 换一篇文章点击试试看 确保文章底部阅读数据出现')
        elif key == 'home':
            _, page = ExtractFlow.get_response(flow.response)
            # The nickname sits between 'var nickname = "' and '" || ""'.
            after_marker = page.split('var nickname = "')[1]
            current_nickname = after_marker.split('" || ""')[0]
            logger.info('准备公众号:' + current_nickname)
            insert_helper('current_nickname', current_nickname)
def create_index(self):
    """Create the index and install its mapping unless it already exists.

    Creating the index with a mapping tells Elasticsearch which fields are
    to be indexed and which are not.

    The existence check is evaluated by truthiness rather than by identity
    with ``False``: a client that reports "missing" with any falsy value
    other than the ``False`` singleton would otherwise never get its index
    created.

    :return: the result of the existence check, i.e. truthy when the index
        was already present and falsy when it has just been created.
    """
    exists = es_instance.indices.exists(self.index_name)
    if exists:
        logger.debug('index %s 已经存在' % (self.index_name))
        return exists

    es_instance.indices.create(self.index_name)
    es_instance.indices.put_mapping(
        index=self.index_name,
        doc_type=self.doc_type,
        body={'properties': doc_schema},
    )
    logger.debug('创建index %s 成功' % (self.index_name))
    return exists
def __init__(self, nickname, task_q, ip_q):
    """Set up the crawl state for one official account.

    :param nickname: nickname of the official account whose articles are
        to be crawled
    :param task_q: globally visible task queue
    :param ip_q: globally visible proxy-IP queue
    """
    self.nickname, self.task_q, self.ip_q = nickname, task_q, ip_q

    # Filled in later by other methods of the class.
    self.task_num = 0
    self.is_one_proxy = None
    self.worker_num = None

    # Articles whose record has no ``comment_id`` field yet (presumably a
    # MongoDB-style ``$exists`` query - confirm against ``col_data``).
    self.articles_gen = col_data.get(comment_id={'$exists': False})
    self.articles_num = col_data.count(comment_id={'$exists': False})
    logger.debug('%s共有%d篇文章' % (self.nickname, self.articles_num))

    # Reference point for elapsed-time reporting.
    self.begin_time = time.time()
def save_html_as_file(nickname, file_name, html_str):
    """Save an article's HTML under ``./web_server/static/html/<nickname>/``.

    Every ``data-src`` occurrence is rewritten to ``src`` before writing.
    An already existing file is left untouched and only reported in the
    debug log.

    :param nickname: nickname of the official account, used as folder name
    :param file_name: file name without the ``.html`` extension
    :param html_str: HTML content of the article
    :return: None
    """
    directory = './web_server/static/html/' + nickname + '/'
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(directory, exist_ok=True)

    whole_name = directory + file_name + '.html'
    if os.path.isfile(whole_name):
        logger.debug('HTML已存在 %s' % whole_name)
        return

    # Build the content before opening the file: if this fails, no empty
    # file is left behind that would block every later attempt to save.
    content = html_str.replace('data-src', 'src')
    # The context manager closes the handle even when the write fails.
    with codecs.open(whole_name, 'w', 'utf-8') as html_file:
        html_file.write(content)
def create_index(self):
    """Create the index and install its mapping unless it already exists.

    Creating the index with a mapping tells Elasticsearch which fields are
    to be indexed and which are not.

    The existence check is evaluated by truthiness rather than by identity
    with ``False``: a client that reports "missing" with any falsy value
    other than the ``False`` singleton would otherwise never get its index
    created.

    :return: the result of the existence check, i.e. truthy when the index
        was already present and falsy when it has just been created.
    """
    exists = l11l111ll1_wcplus_.indices.exists(self.index_name)
    if exists:
        logger.debug('index %s 已经存在' % self.index_name)
        return exists

    l11l111ll1_wcplus_.indices.create(self.index_name)
    l11l111ll1_wcplus_.indices.put_mapping(
        index=self.index_name,
        doc_type=self.doc_type,
        body={'properties': l1llll1lll_wcplus_},
    )
    logger.debug('创建index %s 成功' % self.index_name)
    return exists
def l1l11ll111_wcplus_(nickname, file_name, l1l1l11l11_wcplus_):
    """Save an article's HTML under ``./web_server/static/html/<nickname>/``.

    Every ``data-src`` occurrence is rewritten to ``src`` before writing.
    An already existing file is left untouched and only reported in the
    debug log.

    :param nickname: nickname of the official account, used as folder name
    :param file_name: file name without the ``.html`` extension
    :param l1l1l11l11_wcplus_: HTML content of the article
    :return: None
    """
    directory = './web_server/static/html/' + nickname + '/'
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(directory, exist_ok=True)

    target = directory + file_name + '.html'
    if os.path.isfile(target):
        logger.debug('HTML已存在 %s' % target)
        return

    # Build the content before opening the file: if this fails, no empty
    # file is left behind that would block every later attempt to save.
    content = l1l1l11l11_wcplus_.replace('data-src', 'src')
    # The context manager closes the handle even when the write fails.
    with codecs.open(target, 'w', 'utf-8') as html_file:
        html_file.write(content)
def __init__(self, nickname, l1l11l111l_wcplus_, l1l111l1l1_wcplus_):
    """Set up the crawl state for one official account.

    :param nickname: nickname of the official account whose articles are
        to be crawled
    :param l1l11l111l_wcplus_: globally visible task queue
    :param l1l111l1l1_wcplus_: globally visible proxy-IP queue
    """
    self.nickname = nickname
    self.l1l11l111l_wcplus_, self.l1l111l1l1_wcplus_ = (
        l1l11l111l_wcplus_, l1l111l1l1_wcplus_)

    # Filled in later by other methods of the class.
    self.l1l1l1lll1_wcplus_ = 0
    self.l1l1l111l1_wcplus_ = None
    self.l1ll11lll_wcplus_ = None

    # Articles whose record has no ``article`` field yet (presumably a
    # MongoDB-style ``$exists`` query - confirm against the collection helper).
    self.l1l1ll111l_wcplus_ = l11ll111l_wcplus_.get(article={'$exists': False})
    self.l1l1lll11l_wcplus_ = l11ll111l_wcplus_.count(article={'$exists': False})
    logger.debug('%s共有%d篇文章' % (self.nickname, self.l1l1lll11l_wcplus_))

    # Reference point for elapsed-time reporting.
    self.begin_time = time.time()
def create_task_q(self):
    """Fill the task queue with the articles that still have to be crawled.

    Only articles whose ``content_url`` contains ``mp.weixin.qq.com`` are
    queued; each task carries a 1-based running ``id``.  The total is stored
    in ``self.task_num`` and reported to ``TaskRecoder``.

    :return: number of tasks put on the queue
    """
    queued = 0
    for article in self.articles_gen:
        if 'mp.weixin.qq.com' not in article['content_url']:
            continue
        queued += 1
        task = {
            'nickname': article['nickname'],
            'title': article['title'],
            'content_url': article['content_url'],
            'id': queued,
        }
        # Blocks without a timeout until the queue accepts the task.
        self.task_q.put(task, block=True, timeout=None)

    self.task_num = queued
    logger.debug('%s共有%d篇文章需要爬取' % (self.nickname, self.task_num))
    TaskRecoder.set_total_task_num(self.task_num)
    return self.task_num
def l1l1ll1l1l_wcplus_(self):
    """Fill the task queue with the articles that still have to be crawled.

    Only articles whose ``content_url`` contains ``mp.weixin.qq.com`` are
    queued; each task carries a 1-based running ``id``.  The total is stored
    on the instance and reported to the task recorder.

    :return: number of tasks put on the queue
    """
    queued = 0
    for article in self.l1l1ll111l_wcplus_:
        if 'mp.weixin.qq.com' not in article['content_url']:
            continue
        queued += 1
        task = {
            'nickname': article['nickname'],
            'title': article['title'],
            'content_url': article['content_url'],
            'id': queued,
        }
        # Blocks without a timeout until the queue accepts the task.
        self.l1l11l111l_wcplus_.put(task, block=True, timeout=None)

    self.l1l1l1lll1_wcplus_ = queued
    logger.debug('%s共有%d篇文章需要爬取' % (self.nickname, self.l1l1l1lll1_wcplus_))
    l1l1ll1l11_wcplus_.l1l1111lll_wcplus_(self.l1l1l1lll1_wcplus_)
    return self.l1l1l1lll1_wcplus_
def l1ll1l1l11_wcplus_(self, l1l1111ll1_wcplus_=1, proxy=False):
    """Initialise the proxy-IP queue with a single shared record.

    The queue receives exactly one item, a dict of the form
    ``{'ips': [{'ip': ip, 'delay': 0, 'alive_time': 0, 'cunter': 0}, ...],
    'next_ip': 0}``.  ``ips`` lists every usable IP; ``next_ip`` is meant to
    be updated by the process that used the record last so that the next
    process knows which IP to pick.

    :param l1l1111ll1_wcplus_: number of proxy IPs to fetch
    :param proxy: whether real proxies are used; when falsy the only entry
        is ``127.0.0.1:80``
    :return: None
    """
    # The "single proxy" flag is cleared only when several proxies are fetched.
    self.l1l1l111l1_wcplus_ = True
    if not proxy:
        ips = ['127.0.0.1:80']
    elif l1l1111ll1_wcplus_ == 1:
        ips = l1ll1l1111_wcplus_()
    else:
        ips = [l1ll1l1111_wcplus_() for _ in range(l1l1111ll1_wcplus_)]
        self.l1l1l111l1_wcplus_ = False
    # A single fetched proxy may come back as a bare string.
    if isinstance(ips, str):
        ips = [ips]

    ip_queue_data = {'ips': [], 'next_ip': 0}
    for ip in ips:
        logger.debug(ip)
        ip_queue_data['ips'].append({
            'ip': ip,
            'delay': 0,
            'alive_time': 0,
            'cunter': 0,
        })
        l1l1ll1l11_wcplus_.l1l111l11l_wcplus_(
            ip=ip, l1l11lllll_wcplus_=time.time())

    self.l1l111l1l1_wcplus_.put(ip_queue_data, block=True, timeout=None)
    logger.debug(ip_queue_data)
    # Report the number of IPs, not the queue size: the queue holds a single
    # record whatever the IP count, and qsize() is not available for
    # multiprocessing queues on every platform.
    logger.debug('代理IP队列已经初始化完毕 共有%d个代理代理IP' %
                 len(ip_queue_data['ips']))
def init_proxy_ips(self, ip_num=1, proxy=False):
    """Initialise the proxy-IP queue with a single shared record.

    The queue receives exactly one item, a dict of the form
    ``{'ips': [{'ip': ip, 'delay': 0, 'alive_time': 0, 'cunter': 0}, ...],
    'next_ip': 0}``.  ``ips`` lists every usable IP; ``next_ip`` is meant to
    be updated by the process that used the record last so that the next
    process knows which IP to pick.

    :param ip_num: number of proxy IPs to fetch
    :param proxy: whether real proxies are used; when falsy the only entry
        is ``127.0.0.1:80``
    :return: None
    """
    # The "single proxy" flag is cleared only when several proxies are fetched.
    self.is_one_proxy = True
    if not proxy:
        ips = ['127.0.0.1:80']
    elif ip_num == 1:
        ips = get_proxy_ip(ips=1)
    else:
        ips = [get_proxy_ip(ips=1) for _ in range(ip_num)]
        self.is_one_proxy = False
    # A single fetched proxy may come back as a bare string.
    if isinstance(ips, str):
        ips = [ips]

    ip_queue_data = {'ips': [], 'next_ip': 0}
    for ip in ips:
        logger.debug(ip)
        ip_queue_data['ips'].append({
            'ip': ip,
            'delay': 0,
            'alive_time': 0,
            'cunter': 0,
        })
        TaskRecoder.add_ip_log(ip=ip, created_time=time.time())

    self.ip_q.put(ip_queue_data, block=True, timeout=None)
    logger.debug(ip_queue_data)
    # Report the number of IPs, not the queue size: the queue holds a single
    # record whatever the IP count, and qsize() is not available for
    # multiprocessing queues on every platform.
    logger.debug('代理IP队列已经初始化完毕 共有%d个代理代理IP' %
                 len(ip_queue_data['ips']))