def fetch_ori_page(self, page_url):
    """Fetch a single article page. Appending the parameter f=json to the
    article URL returns the data directly in JSON format.
    """
    # First resolve the Sogou redirect to the real WeChat article URL
    pre_r = get(page_url, headers=self.headers)
    wechat_url = pre_r.url.split('#')[0] + '&f=json'
    if 'mp.weixin' not in wechat_url:
        return
    r = get(wechat_url, headers=self.headers)
    self.logger.info(wechat_url)
    if self.col.find_one(dict(nick_name=self.name, url=wechat_url)):
        raise DocumentExistsException("article exists")
    if r.status_code != 200:
        return
    o = json.loads(r.text)
    self.col.update(
        {
            '_id': gid(),
            'nick_name': self.name,
            'url': wechat_url,
        },
        {'$set': {'json': o}},
        upsert=True)
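# The f=json trick above can be exercised on its own. A minimal sketch,
# assuming `requests` is available and the article URL has already been
# resolved from the Sogou redirect (so it carries a query string that
# '&f=json' can extend); `article_as_json` is an illustrative helper,
# not part of the original code:
import requests

def article_as_json(article_url, headers=None):
    # Drop any #fragment, then request the JSON view of the article
    url = article_url.split('#')[0] + '&f=json'
    r = requests.get(url, headers=headers or {})
    r.raise_for_status()
    return r.json()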
def search(self, retries=3):
    """Search Sogou WeChat for an official account and return the URL of
    its article list page, which looks like:
    http://weixin.sogou.com/gzh?openid=oIWsFt2uCBiQ3mWa2BSUtmdKD3gs&ext=p8lVKENENbkGdvuPNCqHoUqzuLEPtZheP6oyzp3YVsY_-OJEvXMz4yk2nJytyUxY
    """
    query_url = 'http://weixin.sogou.com/weixin?type=1&' + \
        urlencode({'query': self.name})
    self.logger.info('query_url: %s', query_url)
    href = None
    while retries > 0:
        self.logger.info('retry search %s %d' % (self.name, retries))
        html = get(query_url, headers=self.headers).text
        soup = BeautifulSoup(html, 'html.parser')
        a_tag_list = soup.find_all(
            attrs={'uigs': re.compile('account_name')})
        href = None
        try:
            for a_tag in a_tag_list:
                if a_tag and a_tag.text.lower() == self.name.lower():
                    href = a_tag.get('href')
                    break
        except Exception:
            self.logger.info('searching %s failed' % self.name)
            retries -= 1  # decrement here too, or a parse error loops forever
            continue
        if href is not None:
            break
        else:
            self.update_headers()
            time.sleep(random.randint(30, 60))
            retries -= 1
    return href
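# The attrs-regex lookup used above can be checked in isolation:
# BeautifulSoup matches any tag whose `uigs` attribute matches the
# regex 'account_name'. The HTML snippet below is made up purely for
# illustration; the real result page markup may differ.
import re
from bs4 import BeautifulSoup

_SAMPLE = '''
<a uigs="account_name_0" href="/gzh?openid=abc">Python</a>
<a uigs="article_title_0" href="/link">some article</a>
'''

def find_account_link(html, name):
    soup = BeautifulSoup(html, 'html.parser')
    for a_tag in soup.find_all(attrs={'uigs': re.compile('account_name')}):
        if a_tag.text.lower() == name.lower():
            return a_tag.get('href')
    return None

assert find_account_link(_SAMPLE, 'python') == '/gzh?openid=abc'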
def fetch_article_list(self, url, update=False):
    """Fetch the article list from an official account's list page. The
    data comes back as JSON from a request such as:
    http://weixin.sogou.com/gzhjs?openid=oIWsFt0qY9YvyYESHey3MOPfbNy0&ext=lA5I5al3X8CtYOmsUDOgMhZWHWk6xQhEnWXQ_8nrROTPnk351KTH-rcTJUTGDdZq&cb=sogou.weixin_gzhcb&page=3
    """
    query_url = 'http://weixin.sogou.com/gzhjs?cb=sogou.weixin_gzhcb&'
    query_dict = dict(urlparse.parse_qsl(urlparse.urlsplit(url).query))
    while True:
        page = self.page
        if update and page > 2:
            page = 1
        if not update and page > 10:
            break
        self.logger.info('fetching: %s page: %d' % (self.name, page))
        query_dict['page'] = page
        json_url = query_url + urlencode(query_dict)
        json_str = get(json_url, headers=self.headers).text
        try:
            url_list, total_pages = self.parse_list_page(json_str.strip())
        except Exception:
            traceback.print_exc()
            self.update_headers()
            continue
        if not url_list or page > min(10, total_pages):
            # Without logging in, only the first 100 articles are reachable
            self.logger.info('%s: finished crawling' % self.name)
            break
        try:
            for page_url in url_list:
                time.sleep(random.randint(3, 10))  # respect Sogou's rate limit
                self.logger.info(page_url)
                self.fetch_page(page_url)
                # self.fetch_ori_page(page_url)
        except (DocumentExistsException, DocumentExpireException):
            self.logger.info("update finished")
            break
        except Exception:
            traceback.print_exc()
            self.update_headers()
            continue
        page += 1
        self.page = page
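# parse_list_page is not shown here. The gzhjs endpoint wraps its JSON in
# the JSONP callback named in the request (cb=sogou.weixin_gzhcb), so one
# plausible sketch of the unwrapping step is to strip the callback and
# json.loads the remainder; the exact wrapper shape is an assumption.
import json

def unwrap_gzhcb(jsonp_str):
    # Strip the sogou.weixin_gzhcb(...) wrapper and parse the JSON body
    prefix = 'sogou.weixin_gzhcb('
    body = jsonp_str.strip().rstrip(';')
    if body.startswith(prefix) and body.endswith(')'):
        body = body[len(prefix):-1]
    return json.loads(body)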
def fetch_channel_json(self, channel_json_url):
    """Fetch an account's channel JSON, extract its articles, rewrite the
    links to permanent URLs, and save the new entries.
    """
    time.sleep(random.randint(30, 60))
    self.logger.info(channel_json_url)
    res = get(channel_json_url, headers=self.headers)
    # http://stackoverflow.com/questions/24027589/how-to-convert-raw-javascript-object-to-python-dictionary
    html = res.text.strip()
    o = ast.literal_eval(html)
    if not o:
        self.logger.info(pprint.pformat(html))
        self.logger.info('fetch channel_json_url: %s failed',
                         channel_json_url)
        change_ip()
        return
    nick_name = o['nick_name']
    general_msg_list = o['general_msg_list']
    article_list = ast.literal_eval(general_msg_list)['list']
    article_dict_list = []
    for article in article_list:
        app_msg_ext_info = article['app_msg_ext_info']
        comm_msg_info = article['comm_msg_info']
        ori_create_time = comm_msg_info['datetime']
        article_dict_list.append(
            self._get_articel_info(app_msg_ext_info, nick_name,
                                   ori_create_time))
        if app_msg_ext_info['is_multi']:
            # A multi-article push: collect the secondary articles as well
            for article_info in app_msg_ext_info['multi_app_msg_item_list']:
                article_dict_list.append(
                    self._get_articel_info(article_info, nick_name,
                                           ori_create_time))
    article_dict_list = self.get_remove_too_old_days_article(
        article_dict_list)
    article_dict_list = self.get_remove_mongodb_already_has_article(
        nick_name, article_dict_list)
    for article_dict in article_dict_list:
        article_dict['link'] = self.get_permanent_wechat_article_url(
            article_dict['link'])
    self.logger.info(pprint.pformat(article_dict_list))
    self.save_article_dict_list(nick_name, article_dict_list)
def fetch_channel_json(self, channel_json_url):
    """Fetch an account's channel JSON, extract its articles, and save
    the new entries.
    """
    time.sleep(random.randint(60, 120))
    self.logger.info(channel_json_url)
    res = get(channel_json_url, headers=self.headers)
    # http://stackoverflow.com/questions/24027589/how-to-convert-raw-javascript-object-to-python-dictionary
    html = res.text.strip()
    o = ast.literal_eval(html)
    if not o:
        self.logger.debug(pprint.pformat(html))
        self.logger.info('fetch channel_json_url: %s failed',
                         channel_json_url)
        change_ip()
        return
    nick_name = o['nick_name']
    general_msg_list = o['general_msg_list']
    article_list = ast.literal_eval(general_msg_list)['list']
    article_dict_list = []
    for article in article_list:
        app_msg_ext_info = article['app_msg_ext_info']
        comm_msg_info = article['comm_msg_info']
        ori_create_time = comm_msg_info['datetime']
        article_dict_list.append(
            self._get_articel_info(app_msg_ext_info, nick_name,
                                   ori_create_time))
        if app_msg_ext_info['is_multi']:
            # A multi-article push: collect the secondary articles as well
            for article_info in app_msg_ext_info['multi_app_msg_item_list']:
                article_dict_list.append(
                    self._get_articel_info(article_info, nick_name,
                                           ori_create_time))
    article_dict_list = self.get_remove_too_old_days_article(
        article_dict_list)
    article_dict_list = self.get_remove_mongodb_already_has_article(
        nick_name, article_dict_list)
    self.logger.info(pprint.pformat(article_dict_list))
    self.save_article_dict_list(nick_name, article_dict_list)
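# The Stack Overflow link cited above explains the trick used twice in
# fetch_channel_json: the endpoint returns a JavaScript object literal
# (single-quoted strings, not guaranteed to be valid JSON), and the
# general_msg_list field is itself a string holding another object
# literal, so ast.literal_eval is applied at both levels. A tiny
# illustration with made-up data:
import ast

raw = """{"nick_name": "demo", "general_msg_list": "{'list': [1, 2]}"}"""
outer = ast.literal_eval(raw)                         # outer literal -> dict
inner = ast.literal_eval(outer['general_msg_list'])   # nested literal -> dict
assert inner['list'] == [1, 2]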
def search(self, retries=3):
    """Search Sogou WeChat for an official account and return the URL of
    its article list page, which looks like:
    http://weixin.sogou.com/gzh?openid=oIWsFt2uCBiQ3mWa2BSUtmdKD3gs&ext=p8lVKENENbkGdvuPNCqHoUqzuLEPtZheP6oyzp3YVsY_-OJEvXMz4yk2nJytyUxY
    """
    if not self.name:
        return
    if self.page > 10:
        self.logger.info("finished the first 10 pages: %s" % self.name)
        return None
    query_url = 'http://weixin.sogou.com/weixin?type=1&' + \
        urlencode({'query': self.name})
    self.logger.info('query_url: %s', query_url)
    href = None
    while retries > 0:
        self.logger.info('retry search %s %d' % (self.name, retries))
        html = get(query_url, headers=self.headers).text
        soup = BeautifulSoup(html, 'html.parser')
        item_tag_li = soup.find_all('div',
                                    class_="wx-rb bg-blue wx-rb_v1 _item")
        href = None
        try:
            for item_tag in item_tag_li:
                _href = item_tag.get('href')
                _title = item_tag.find(class_='txt-box').h3.text
                # '最近文章' ("recent articles") marks an account result
                if (_title.strip() == self.name.strip()
                        and '最近文章' in item_tag.get_text()):
                    href = _href
                    break
        except Exception:
            self.logger.info('searching %s failed' % self.name)
            retries -= 1  # decrement here too, or a parse error loops forever
            continue
        if href is not None:
            break
        else:
            self.update_headers()
            time.sleep(3)
            retries -= 1
    return href
def get_cookie_str(cls):
    """Generate a Sogou WeChat cookie string and return it."""
    while True:
        time.sleep(5)
        # Search for a random letter to obtain an SNUID cookie
        url = 'http://weixin.sogou.com/weixin?query=%s' % \
            random.choice('abcdefghijklmnopqrstuvwxyz')
        cookie = get(url, headers=cls.get_headers())
        headers = cookie.headers
        try:
            cookie_str = headers.get('Set-Cookie') + '; ' + \
                SougouWechat.getSUV()
        except Exception:
            cookie_str = None
        cls.logger.info('cookie_str: %s' % cookie_str)
        # Skip responses that did not set SNUID
        if cookie_str and 'SUID' in cookie_str and 'SNUID' in cookie_str:
            return cookie_str
def get_cookie_str(cls):
    """Generate a Sogou WeChat cookie string and return it."""
    while True:
        time.sleep(random.randint(30, 60))
        # Search for a random letter to obtain an SNUID cookie
        url = 'http://weixin.sogou.com/weixin?query=%s' % \
            random.choice('abcdefghijklmnopqrstuvwxyz')
        cookie = get(url, headers=cls.get_headers())
        headers = cookie.headers
        try:
            cookie_str = headers.get('Set-Cookie') + '; ' + \
                SougouWechat.getSUV()
        except Exception:
            cookie_str = None
        cls.logger.info('cookie_str: %s' % cookie_str)
        # Skip responses that did not set SNUID
        if cookie_str and 'SUID' in cookie_str and 'SNUID' in cookie_str:
            return cookie_str
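# A sketch of how the returned cookie string is meant to be consumed:
# attached as the Cookie header on subsequent Sogou requests. The helper
# below is an illustration, not part of the original code; get_cookie_str
# only guarantees that both SUID and SNUID are present in the string.
def headers_with_cookie(base_headers, cookie_str):
    headers = dict(base_headers)
    headers['Cookie'] = cookie_str  # carries SUID and SNUID
    return headers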
def fetch_parse():
    """Fetch the Morningstar fund-ranking page and hand it to the parser."""
    url = ('http://cn.morningstar.com/handler/fundranking.ashx?'
           'date=2016-04-08&fund=&category=mix_radical&rating=&company=&'
           'cust=&sort=Return2Year&direction=desc&tabindex=1&pageindex=1&'
           'pagesize=10000&randomid=0.043611296370827723')
    html = get(url).text
    parse_html(html)
def fetch_page(self, page_url):
    """Fetch a single article page. Appending the parameter f=json to the
    article URL returns the data directly in JSON format; parse the JSON
    and keep the fields we need.
    """
    if self.col.find(dict(nick_name=self.name)).count() > self.limit:
        # Over the per-account limit: drop the oldest stored article
        oldest_doc = list(
            self.col.find(dict(nick_name=self.name)).sort([
                ('ori_create_time', 1)
            ]).limit(1))[0]
        oldest_doc_id = oldest_doc.get('_id')
        self.col.remove({'_id': oldest_doc_id})
        self.logger.info(
            "%s: removed: %s : %s\n" %
            (self.name, oldest_doc.get('title'),
             datestr_from_stamp(oldest_doc.get('ori_create_time'),
                                '%Y-%m-%d')))
    # First resolve the Sogou redirect to the real WeChat article URL
    pre_r = get(page_url, headers=self.headers)
    wechat_url = pre_r.url.split('#')[0] + '&f=json'
    if 'mp.weixin' not in wechat_url:
        return
    r = get(wechat_url, headers=self.headers)
    self.logger.info(wechat_url)
    if self.col.find_one(dict(nick_name=self.name, url=wechat_url)):
        raise DocumentExistsException("article exists")
    if r.status_code != 200:
        return
    o = json.loads(r.text)
    if o.get('title') is None:
        # The field disappears once an article is taken down after a
        # complaint; skip those
        return
    fields = {
        'cdn_url', 'nick_name', 'title', 'content', 'desc', 'link',
        'ori_create_time'
    }
    media_fields = {'round_head_img', 'nick_name', 'signature'}
    media_dict = {k: o.get(k) for k in media_fields}
    article_dict = {k: o.get(k) for k in fields}
    if self.col.find_one(dict(nick_name=self.name, title=o['title'])):
        raise DocumentExistsException("article exists")
    too_old_days = 10
    if days_from_now(o['ori_create_time']) > too_old_days:
        # Skip articles older than ten days
        self.logger.info('%s: skipping article older than %d days, title: %s\n',
                         self.name, too_old_days, o['title'])
        raise DocumentExpireException("expire")
    if o['title'] and o['content']:
        o_date = datestr_from_stamp(o.get('ori_create_time'), '%Y-%m-%d')
        self.logger.info('%s: saving article title: %s %s\n', self.name,
                         o['title'], o_date)
        article_dict['nick_name'] = self.name
        article_dict['url'] = wechat_url
        article_dict['tag_id'] = self.tag_id
        del article_dict['content']
        self.col.update({'_id': gid()}, {'$set': article_dict}, upsert=True)
        # e.g. http://mp.weixin.qq.com/s?__biz=MjM5NjAxMDc4MA==&mid=404900944&idx=1&sn=fe2d53ce562ee51e7163a60d4c95484a#rd
        biz = extract('__biz=', '==', article_dict['link'])
        self.media_col.update({'_id': biz}, {'$set': media_dict}, upsert=True)
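# The extract helper used above is project-local and not shown. A
# plausible sketch, under the assumption that extract(start, end, s)
# returns the text between the first occurrence of `start` and the
# next `end`, which matches how the __biz id is pulled from the link:
def extract(start, end, s):
    begin = s.index(start) + len(start)
    return s[begin:s.index(end, begin)]

# Example using the link format shown in the comment above:
link = ('http://mp.weixin.qq.com/s?__biz=MjM5NjAxMDc4MA==&mid=404900944'
        '&idx=1&sn=fe2d53ce562ee51e7163a60d4c95484a#rd')
assert extract('__biz=', '==', link) == 'MjM5NjAxMDc4MA'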
def get(self, *args, **kwargs):
    return get(*args, **kwargs)  # use web_util get