import copy
import re

import requests
from requests.exceptions import HTTPError

# NOTE: the helper functions (url_request, exception_raise, re_message,
# get_result_info, info_query, info_identical, info_filter) and the
# REGEX_*/SO_HEADERS/HAOSO_URL_WITH_TIME/FORMAT_REDIRECT_SCRIPT/*_MAP/*_KEY
# constants used below are assumed to be imported from elsewhere in this
# project; they are not defined in this module.


def fetch_direct_url(cache_url, proxies=None, headers=None):
    """Fetch the real link behind a snapshot.

    :param cache_url: snapshot URL
    :param proxies: proxies, as accepted by Requests
    :param headers: request headers, as accepted by Requests
    :return: the snapshot redirect URL
    """
    response_list = url_request(cache_url, [dict()],
                                proxies=proxies, headers=headers)
    try:
        if response_list[0].status_code == requests.codes.ok:
            content = response_list[0].content.decode('utf-8', errors='ignore')
        elif response_list[0].status_code == 400:
            return None
        else:
            response_list[0].raise_for_status()
    except Exception as e:
        exception_raise(e, msg=cache_url)
    try:
        direct_url = re.search(REGEX_DIRECT_URL, content)
        if direct_url:
            if 'c.360webcache.com' in direct_url.group(1):
                return direct_url.group(1)
            else:
                # The snapshot already redirected to the source site,
                # so the page body itself is the result.
                return content
        else:
            return False
    except Exception as _:
        exception_raise(
            TypeError('fetch_direct_url() error {}'.format(cache_url)))
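# Return contract of fetch_direct_url(), summarized from the branches above
# (the example URL is hypothetical):
#   'http://c.360webcache.com/...' -> cache URL; fetch the body in a second request
#   full HTML string               -> snapshot already points at the source site
#   None                           -> snapshot endpoint answered HTTP 400
#   False                          -> no redirect URL could be extracted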
def get_response(urls, method='GET', data=None, proxies=None, headers=None):
    """Execute the page requests.

    :param urls: list of URLs, as accepted by Requests
    :param method: request method, 'GET' by default, as accepted by Requests
    :param data: payload list for 'POST' requests, as accepted by Requests
    :param proxies: proxies, as accepted by Requests
    :param headers: request headers, as accepted by Requests
    :return: list of Response objects from the Requests lib
    """
    request_list = []
    for i, url in enumerate(urls):
        try:
            resp = requests.request(method=method, url=url, proxies=proxies,
                                    data=data[i] if data else None,
                                    timeout=10, allow_redirects=False,
                                    headers=headers, verify=False)
            request_list.append(resp)
        except Exception as e:
            exception_raise(e, msg=url)
    return request_list
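# NOTE (assumption): url_request() is used throughout this module but defined
# elsewhere in the project. A minimal sketch of the behaviour its call sites
# imply -- format a URL template once per parameter dict (a plain URL plus
# [dict()] passes through unchanged), then delegate to get_response(). Named
# _url_request_sketch so it cannot shadow the real helper:
def _url_request_sketch(url_template, params_list, proxies=None, headers=None):
    urls = [url_template.format(**params) if params else url_template
            for params in params_list]
    return get_response(urls, proxies=proxies, headers=headers)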
def fetch_info_cache(items_list, proxies=None):
    """Launch the snapshot queries.

    :param items_list: item dicts that passed the sensitive-word filter
    :param proxies: proxies, as accepted by Requests
    :return: list of snapshot HTML bodies
    """
    if items_list is not False:
        if items_list == list():
            return list()
    else:
        return False
    cache_url_list = [item['cached_url'] for item in items_list]
    cache_content_list = list()
    for cached_url in cache_url_list:
        if cached_url is None:
            cache_content_list.append(None)
            continue
        # '百度知道' (Baidu Zhidao) is known to link straight to the source
        # site, hence the 'so.com' filter rule: wrap non-snapshot URLs in a
        # client-side redirect instead of fetching them.
        if 'so.com' not in cached_url:
            cache_content_list.append(
                FORMAT_REDIRECT_SCRIPT.format(cached_url, cached_url))
            continue
        direct_url = fetch_direct_url(cached_url, proxies=proxies,
                                      headers=SO_HEADERS)
        if direct_url is None:
            cache_content_list.append(None)
            continue
        # Check the False case before slicing: False is not subscriptable.
        elif direct_url is False:
            exception_raise(
                TypeError('cannot get direct_url {}'.format(cached_url)))
        # TODO: when the snapshot redirects to the source site, the second
        # request is unnecessary.
        elif direct_url[:4] != 'http':
            cache_content_list.append(direct_url)
            continue
        cache_headers = copy.deepcopy(SO_HEADERS)
        cache_headers.update(Referer=cached_url)
        cache_content = fetch_cache_content(direct_url, proxies=proxies,
                                            headers=cache_headers)
        if cache_content is None:
            cache_content_list.append(None)
            continue
        cache_content_list.append(cache_content)
    return cache_content_list
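# NOTE (assumption): FORMAT_REDIRECT_SCRIPT is defined elsewhere. Its call
# above passes cached_url twice, so it is presumably an HTML template with two
# positional placeholders that sends the reader to the cached page, along the
# lines of:
#   FORMAT_REDIRECT_SCRIPT = ('<html><head><meta http-equiv="refresh" '
#                             'content="0;url={}"></head>'
#                             '<body><a href="{}"></a></body></html>')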
def info_parse(page_dict, page):
    """Launch the page parsing.

    :param page_dict: response info dict
    :param page: page number
    :return: list of matched dicts
    """
    if page_dict is not False:
        if page_dict == dict():
            return list()
    else:
        return False
    try:
        page_parse_list = page_parse(page_dict, page)
        return page_parse_list
    except Exception as e:
        exception_raise(e, msg=page_dict)
def page_parse(page_dict, page, top_size='max'):
    """Parse the page content.

    :param page_dict: response page info dict
    :param page: page number
    :param top_size: parse only the first N items of the page; when 'max'
                     (the default), parse the whole page
    :return: list of matched dicts, or False
    """
    page_result_dict = copy.deepcopy(page_dict)
    try:
        content = page_result_dict['content'].content.decode('utf-8',
                                                             errors='ignore')
    except Exception as e:
        exception_raise(e, msg=page_result_dict)
    items_list = re_message(REGEX_PAGE_ITEM, content)
    page_map_list = list()
    if not items_list:
        # TODO: unparseable pages and the number-lookup categories
        return list()
    else:
        # Number-type queries drop the first item of the page (the
        # number-lookup widget); for company numbers, keep the widget on
        # page 1 when it shows an owner image.
        if page_result_dict['query_type'] in ID_CARD_MAP_KEY + PHONE_MAP_KEY:
            if page_result_dict['query_type'] in COMPANY_PHONE_KEY:
                if not re.search(REGEX_IMG_SCR, items_list[0]) or page != 1:
                    items_list = items_list[1:]
            else:
                items_list = items_list[1:]
        # Optionally parse only the first N items of the page.
        if top_size == 'max':
            top_size = len(items_list)
        for i, item in enumerate(items_list[:top_size]):
            title, summary, cached_url, source, post_time = \
                get_result_info(item)
            if title:
                page_map_list.append(
                    dict(title=title, summary=summary, source=source,
                         sequence_record=i, cached_url=cached_url,
                         post_time=post_time, query_platform='好搜',
                         cache_status=True,
                         query_term=page_result_dict['query_term'],
                         query_type=page_result_dict['query_type']))
    return page_map_list
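# Shape of each entry in the returned page_map_list, taken from the dict()
# construction above (field values here are illustrative only):
# {'title': '...', 'summary': '...', 'source': '...', 'sequence_record': 0,
#  'cached_url': 'http://cache.so.com/...', 'post_time': '2016-01-01',
#  'query_platform': '好搜', 'cache_status': True,
#  'query_term': '...', 'query_type': '...'}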
def fetch_cache_content(direct_url, proxies=None, headers=None):
    """Fetch the real snapshot content.

    :param direct_url: snapshot redirect URL
    :param proxies: proxies, as accepted by Requests
    :param headers: request headers, as accepted by Requests
    :return: snapshot HTML content
    """
    response_list = url_request(direct_url, [dict()],
                                proxies=proxies, headers=headers)
    try:
        if response_list[0].status_code == requests.codes.ok:
            return response_list[0].content.decode('utf-8', errors='ignore')
        elif response_list[0].status_code == 302:
            return None
        else:
            response_list[0].raise_for_status()
    except Exception as e:
        exception_raise(e, msg=direct_url)
def keyword_query(keyword_dict, page=1, proxies=None, headers=None):
    """Query a keyword.

    :param keyword_dict: query parameter dict
    :param page: page number
    :param proxies: proxies, as accepted by Requests
    :param headers: request headers, as accepted by Requests
    :return: dict carrying the Requests Response object
    """
    keyword_query_dict = copy.deepcopy(keyword_dict)
    query_type = keyword_query_dict['query_type']
    if query_type in CHECK_TIME_MAP:
        query_term_list = [
            dict(query_term=keyword_query_dict['query_term'], page=page,
                 time=CHECK_TIME_MAP[query_type])
        ]
        query_url = HAOSO_URL_WITH_TIME
    else:
        return False
    content_list = url_request(query_url, query_term_list,
                               proxies=proxies, headers=headers)
    try:
        if content_list[0].status_code == requests.codes.ok:
            if not re.search(
                    REGEX_PAGE_EXCEPTION,
                    content_list[0].content.decode('utf-8', errors='ignore')):
                keyword_query_dict.update(content=content_list[0])
                return keyword_query_dict
            else:
                print('[PAGE EXCEPTION]:{}, {}, {}'.format(
                    keyword_dict, page, proxies))
                exception_raise(HTTPError('keyword_query(): PAGE EXCEPTION'),
                                code=102)
        else:
            content_list[0].raise_for_status()
    except Exception as e:
        exception_raise(e, msg=keyword_dict)
def main(**kwargs):
    """Entry point.

    :param kwargs: input parameters: proxies, data, page...
    :return: content dicts
    """
    try:
        proxies = kwargs.get('proxies')
        data = kwargs.get('data')
        page = kwargs.get('page')
        # Receive the parameters. next(iter(...)) replaces the Python 2 only
        # data.keys()[0]: dict views are not subscriptable in Python 3.
        query_type = next(iter(data))
        info_dict = dict(query_type=query_type, query_term=data[query_type])
        # Download the page.
        page_dict = info_query(info_dict, page, proxies)
        if page_dict is False:
            exception_raise(TypeError('info_query() error'))
        # Parse the page.
        # TODO: filter the final result instead of relying on `page`.
        items_list = info_parse(page_dict, page)
        if items_list is False:
            exception_raise(TypeError('info_parse() error'))
        # Exact matching.
        identical_list = info_identical(items_list)
        if identical_list is False:
            exception_raise(TypeError('info_identical() error'))
        # Filter sensitive words.
        filter_list = info_filter(identical_list)
        if filter_list is False:
            exception_raise(TypeError('info_filter() error'))
        # return (dict(haosoukeywordinfo=filter_list))
        # Fetch the snapshot HTML.
        cache_list = fetch_info_cache(filter_list, proxies)
        if cache_list is False:
            exception_raise(TypeError('fetch_info_cache() error'))
        return (dict(haosoukeywordinfo=cache_list),
                dict(haosoukeywordinfo=filter_list))
    except Exception as e:
        if hasattr(e, 'uf_errno') and e.uf_errno:
            raise
        exception_raise(e, code=101)
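# Usage sketch (all values hypothetical): `data` maps a single query type to a
# query term; main() returns a pair of dicts -- snapshot HTML bodies and
# filtered item metadata, both keyed by 'haosoukeywordinfo'. The query type
# and proxy URL below are illustrative, not values defined by this module.
if __name__ == '__main__':
    cache_result, info_result = main(
        data={'keyword': '示例公司'},
        page=1,
        proxies={'http': 'http://127.0.0.1:8080',
                 'https': 'http://127.0.0.1:8080'},
    )
    print(len(info_result['haosoukeywordinfo']))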