def update_headers(cls):
    cls.logger.info('*********updating cookies*********')
    _, headers, _ = parse_curl_str(cls.curl_str)
    headers['Cookie'] = cls.get_cookie_str()
    if headers['Cookie'] is None:
        change_ip()
        cls.update_headers()
    else:
        cls.headers = headers
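# NOTE: parse_curl_str is used above but not defined in this section. A
# minimal sketch, assuming it parses a `curl ...` command copied from the
# browser devtools and returns (url, headers, data); the real helper may
# support more flags.
import shlex

def parse_curl_str(s):
    tokens = shlex.split(s.strip())
    url, headers, data = '', {}, None
    it = iter(tokens[1:])    # skip the leading 'curl'
    for token in it:
        if token in ('-H', '--header'):
            key, _, value = next(it).partition(':')
            headers[key.strip()] = value.strip()
        elif token in ('-d', '--data', '--data-raw'):
            data = next(it)
        elif not token.startswith('-'):
            url = token
    return url, headers, data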
def update_headers(self, changeip=True):
    if changeip:
        change_ip()
    r = get(self.base_url)
    # Merge the freshly issued Set-Cookie values into the existing cookies.
    h = cookie_dict_from_cookie_str(r.headers.get('Set-Cookie'))
    cookies_dict = cookie_dict_from_cookie_str(self.headers['Cookie'])
    cookies_dict.update(h)
    # Re-serialize: a header value must be a string, not a dict.
    self.headers['Cookie'] = '; '.join(
        '%s=%s' % (k, v) for k, v in cookies_dict.items())
    self.logger.info('headers: %s', pformat(self.headers))
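# NOTE: cookie_dict_from_cookie_str is also external to this section. A
# plausible sketch for plain 'k1=v1; k2=v2' strings; a full Set-Cookie
# header with Path/Expires attributes would need http.cookies.SimpleCookie.
def cookie_dict_from_cookie_str(cookie_str):
    cookies = {}
    for pair in (cookie_str or '').split(';'):
        key, sep, value = pair.strip().partition('=')
        if sep and key:
            cookies[key] = value
    return cookies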
def handle_response(self, url, response):
    self.logger.info('handle url: %s', url)
    if not response:
        return
    if response.status_code == 200:
        html = response.text
        html_parser = KuaidailiHtmlParser(url, html)
        ip_info_dict_yield = html_parser.parse()
        self.bulk_update_to_mongo(ip_info_dict_yield)
    elif response.status_code == 503:
        change_ip()
        self.urls.append(url)    # retry
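# NOTE: change_ip appears in every snippet here but is never defined. It is
# whatever gets the crawler a fresh exit IP; a purely hypothetical sketch
# that pulls a proxy from a local proxy-pool endpoint (the URL and the
# PROXIES dict are assumptions, not the author's implementation; it might
# equally redial an ADSL link).
import time
import requests

PROXY_POOL_URL = 'http://127.0.0.1:5010/get'    # hypothetical endpoint
PROXIES = {}    # assumed to be shared with the crawler's get() wrapper

def change_ip():
    proxy = requests.get(PROXY_POOL_URL, timeout=10).text.strip()
    PROXIES['http'] = PROXIES['https'] = 'http://' + proxy
    time.sleep(3)    # give the new route a moment before retrying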
def handle_response(self, url, response):
    """Store the parsed proxy-IP info into MongoDB.

    :param url:
    :param response: requests.models.Response
    """
    self.logger.info('handle url: %s', url)
    if not response:
        return
    if response.status_code == 200:
        html = response.text
        html_parser = XiciHtmlParser(url, html)
        ip_info_dict_yield = html_parser.parse()
        self.bulk_update_to_mongo(ip_info_dict_yield)
    elif response.status_code == 503:
        change_ip()
        self.urls.append(url)    # retry
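# NOTE: both handlers funnel parsed records into bulk_update_to_mongo, which
# is not shown. A minimal sketch with pymongo, assuming each record is keyed
# on its ip:port pair (the key fields and the collection attribute are
# guesses):
from pymongo import UpdateOne

def bulk_update_to_mongo(self, ip_info_dict_yield):
    ops = [
        UpdateOne({'ip': d['ip'], 'port': d['port']}, {'$set': d},
                  upsert=True)
        for d in ip_info_dict_yield
    ]
    if ops:
        self.collection.bulk_write(ops, ordered=False)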
def fetch_channel_json(self, channel_json_url):
    time.sleep(random.randint(30, 60))
    self.logger.info(channel_json_url)
    res = get(channel_json_url, headers=self.headers)
    # http://stackoverflow.com/questions/24027589/how-to-convert-raw-javascript-object-to-python-dictionary
    html = res.text.strip()
    o = ast.literal_eval(html)
    if not o:
        self.logger.info(pprint.pformat(html))
        self.logger.info('fetch channel_json_url: %s failed',
                         channel_json_url)
        change_ip()
        return
    nick_name = o['nick_name']
    general_msg_list = o['general_msg_list']
    article_list = ast.literal_eval(general_msg_list)['list']
    article_dict_list = []
    for article in article_list:
        app_msg_ext_info = article['app_msg_ext_info']
        comm_msg_info = article['comm_msg_info']
        ori_create_time = comm_msg_info['datetime']
        article_dict_list.append(
            self._get_articel_info(app_msg_ext_info, nick_name,
                                   ori_create_time))
        if app_msg_ext_info['is_multi']:
            for article_info in app_msg_ext_info['multi_app_msg_item_list']:
                article_dict_list.append(
                    self._get_articel_info(article_info, nick_name,
                                           ori_create_time))
    article_dict_list = self.get_remove_too_old_days_article(
        article_dict_list)
    article_dict_list = self.get_remove_mongodb_already_has_article(
        nick_name, article_dict_list)
    for article_dict in article_dict_list:
        article_dict['link'] = self.get_permanent_wechat_article_url(
            article_dict['link'])
    self.logger.info(pprint.pformat(article_dict_list))
    self.save_article_dict_list(nick_name, article_dict_list)
def fetch_channel_json(self, channel_json_url):
    time.sleep(random.randint(60, 120))
    self.logger.info(channel_json_url)
    res = get(channel_json_url, headers=self.headers)
    # http://stackoverflow.com/questions/24027589/how-to-convert-raw-javascript-object-to-python-dictionary
    html = res.text.strip()
    o = ast.literal_eval(html)
    if not o:
        self.logger.debug(pprint.pformat(html))
        self.logger.info('fetch channel_json_url: %s failed',
                         channel_json_url)
        change_ip()
        return
    nick_name = o['nick_name']
    general_msg_list = o['general_msg_list']
    article_list = ast.literal_eval(general_msg_list)['list']
    article_dict_list = []
    for article in article_list:
        app_msg_ext_info = article['app_msg_ext_info']
        comm_msg_info = article['comm_msg_info']
        ori_create_time = comm_msg_info['datetime']
        article_dict_list.append(
            self._get_articel_info(app_msg_ext_info, nick_name,
                                   ori_create_time))
        if app_msg_ext_info['is_multi']:
            for article_info in app_msg_ext_info['multi_app_msg_item_list']:
                article_dict_list.append(
                    self._get_articel_info(article_info, nick_name,
                                           ori_create_time))
    article_dict_list = self.get_remove_too_old_days_article(
        article_dict_list)
    article_dict_list = self.get_remove_mongodb_already_has_article(
        nick_name, article_dict_list)
    self.logger.info(pprint.pformat(article_dict_list))
    self.save_article_dict_list(nick_name, article_dict_list)
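# The Stack Overflow trick referenced above: the endpoint returns a
# JavaScript object literal with single quotes, which json.loads rejects but
# ast.literal_eval parses safely (and without eval's code-execution risk).
# A toy demonstration with made-up data:
import ast

raw = "{'nick_name': 'demo', 'general_msg_list': \"{'list': []}\"}"
o = ast.literal_eval(raw)    # outer object literal -> dict
inner = ast.literal_eval(o['general_msg_list'])    # nested literal string
assert inner['list'] == []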
def _wrapper(*args, **kwargs):
    index = 0
    response = None    # guard against retries == 0
    while index < retries:
        index += 1
        try:
            response = func(*args, **kwargs)
            if response and (LagouCrawler.is_block_html(response.text) or
                             LagouCrawler.is_check_html(response.text)):
                # Blocked or challenged: back off exponentially, or switch
                # IP once the backoff would exceed five minutes.
                sleep_time = sleep ** index + random.randint(1, 10)
                if sleep_time > 300:    # 5 mins
                    change_ip()
                else:
                    print('sleep for %ds' % sleep_time)
                    time.sleep(sleep_time)
                continue
            if response.status_code in (301, 302, 404, 500):
                print('status_code', response.status_code)
                break
            elif response.status_code != 200:
                print(response.status_code)
                if changeip:
                    change_ip()
                    continue
                else:
                    break
            else:
                break    # 200 and not blocked: done, stop retrying
        except Exception as e:
            traceback.print_exc()
            response = None
            if isinstance(e, Timeout):
                if sleep is not None:
                    time.sleep(sleep + random.randint(10, 15))
                continue
            elif isinstance(e, TooManyRedirects):
                break
    return response
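# NOTE: _wrapper is the inner closure of a retry decorator whose outer
# factory is not shown; retries/sleep/changeip/func are its closure
# variables. A sketch of how the pieces presumably fit together:
import functools
import requests

def retry(retries=3, sleep=2, changeip=True):
    def _retry(func):
        @functools.wraps(func)
        def _wrapper(*args, **kwargs):
            ...    # body as above
        return _wrapper
    return _retry

# Usage: wrap the HTTP getter so every call gets the retry/anti-block logic.
@retry(retries=5)
def get(url, **kwargs):
    return requests.get(url, timeout=10, **kwargs)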