Example #1
import re
import requests

def fetch_direct_url(cache_url, proxies=None, headers=None):
    """获取快照真实链接

    :param cache_url: 快照链接
    :param proxies: 代理,同 Requests 参数要求
    :param headers: 请求头,同 Requests 参数要求
    :return: 快照跳转链接
    """
    response_list = url_request(cache_url, [dict()],
                                proxies=proxies,
                                headers=headers)
    try:
        if response_list[0].status_code == requests.codes.ok:
            content = response_list[0].content.decode('utf-8', errors='ignore')
        elif response_list[0].status_code == 400:
            return None
        else:
            response_list[0].raise_for_status()
    except Exception as e:
        exception_raise(e, msg=cache_url)
    try:
        direct_url = re.search(REGEX_DIRECT_URL, content)
        if direct_url:
            if 'c.360webcache.com' in direct_url.group(1):
                return direct_url.group(1)
            else:
                return content
        else:
            return False
    except Exception as _:
        exception_raise(
            TypeError('fetch_direct_url() error {}'.format(cache_url)))
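A minimal usage sketch (the snapshot URL below is hypothetical, and url_request plus the module constants are assumed to be importable from the surrounding module):

sample_cache_url = 'https://www.so.com/link?m=example'  # hypothetical cached_url
result = fetch_direct_url(sample_cache_url, headers={'User-Agent': 'Mozilla/5.0'})
if result is False:
    print('no direct URL found in the snapshot page')
elif result is None:
    print('snapshot request returned HTTP 400')
else:
    print(result[:80])  # either the direct URL or the raw page content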
Example #2
import requests

def get_response(urls, method='GET', data=None, proxies=None, headers=None):
    """执行页面请求

    :param urls: URL 列表,同 Requests 参数要求
    :param method: 默认为 ‘GET’ 方式请求,同 Requests 参数要求
    :param data: 为 ‘POST’ 时所携带的参数,同 Requests 参数要求
    :param proxies: 代理,同 Requests 参数要求
    :param headers: 请求头,同 Requests 参数要求
    :return: Class Response from Requests lib
    """
    request_list = []
    for i, url in enumerate(urls):
        try:
            resp = requests.request(method=method,
                                    url=url,
                                    proxies=proxies,
                                    data=data[i] if data else None,
                                    timeout=10,
                                    allow_redirects=False,
                                    headers=headers,
                                    verify=False)
            request_list.append(resp)
        except Exception as e:
            exception_raise(e, msg=url)
    return request_list
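A brief usage sketch, assuming the module's exception_raise helper is available; the URL is a placeholder:

responses = get_response(['https://www.so.com/s?q=test'],
                         headers={'User-Agent': 'Mozilla/5.0'})
for resp in responses:
    # redirects are not followed (allow_redirects=False), so 3xx codes surface here
    print(resp.status_code, resp.url)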
Example #3
import copy

def fetch_info_cache(items_list, proxies=None):
    """发起快照查询

    :param items_list: 敏感词过滤后的信息字典
    :param proxies: 代理,同 Requests 参数要求
    :return: 快照 html 内容列表
    """
    if items_list is False:
        return False
    if items_list == list():
        return list()
    cache_url_list = [item['cached_url'] for item in items_list]
    cache_content_list = list()
    for cached_url in cache_url_list:
        if cached_url is None:
            cache_content_list.append(None)
            continue
        # 'Baidu Zhidao' results are known to link straight to the original site, so filter on 'so.com'
        if 'so.com' not in cached_url:
            cache_content_list.append(
                FORMAT_REDIRECT_SCRIPT.format(cached_url, cached_url))
            continue
        direct_url = fetch_direct_url(cached_url,
                                      proxies=proxies,
                                      headers=SO_HEADERS)
        if direct_url is None:
            cache_content_list.append(None)
            continue
        elif direct_url is False:
            # must come before the slice below: False is not subscriptable
            exception_raise(
                TypeError('cannot get direct_url: {}'.format(cached_url)))
        elif direct_url[:4] != 'http':
            cache_content_list.append(direct_url)
            continue
        # TODO: skip the second request when the snapshot redirects straight to the original site
        cache_headers = copy.deepcopy(SO_HEADERS)
        cache_headers.update(Referer=cached_url)
        cache_content = fetch_cache_content(direct_url,
                                            proxies=proxies,
                                            headers=cache_headers)
        if cache_content is None:
            cache_content_list.append(None)
            continue
        cache_content_list.append(cache_content)
    return cache_content_list
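For reference, FORMAT_REDIRECT_SCRIPT is a template with two positional placeholders, both filled with cached_url above. A hypothetical definition consistent with that call (the real constant is defined elsewhere in the module):

# hypothetical stand-in for the module's real template
FORMAT_REDIRECT_SCRIPT = ('<html><head><meta http-equiv="refresh" '
                          'content="0; url={}"></head>'
                          '<body><a href="{}">snapshot</a></body></html>')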
Example #4
def info_parse(page_dict, page):
    """发起页面解析

    :param page_dict: 返回信息字典
    :param page: 页数
    :return: 匹配后的字典列表
    """
    if page_dict is False:
        return False
    if page_dict == dict():
        return list()
    try:
        page_parse_list = page_parse(page_dict, page)
        return page_parse_list
    except Exception as e:
        exception_raise(e, msg=page_dict)
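The False/empty contract of info_parse can be exercised directly:

assert info_parse(False, 1) is False      # upstream failure propagates
assert info_parse(dict(), 1) == list()    # empty page dict yields an empty list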
Example #5
import copy
import re

def page_parse(page_dict, page, top_size='max'):
    """解析页面内容

    :param page_dict: 返回页面信息字典
    :param page: 页数
    :param top_size: 解析页面的前 N 个条目, 默认为前 10 条,等于“max”时,整页解析
    :return: 匹配后的字典或 False
    """
    page_result_dict = copy.deepcopy(page_dict)
    try:
        content = page_result_dict['content'].content.decode('utf-8',
                                                             errors='ignore')
    except Exception as e:
        exception_raise(e, msg=page_result_dict)
    items_list = re_message(REGEX_PAGE_ITEM, content)
    page_map_list = list()
    if not items_list:
        # TODO: handle pages that cannot be parsed, and number-type classification
        return list()
    else:
        # For number-type queries, drop the page's first item (the number-lookup widget);
        # for company numbers, keep the widget if it lists an owner
        if page_result_dict['query_type'] in ID_CARD_MAP_KEY + PHONE_MAP_KEY:
            if page_result_dict['query_type'] in COMPANY_PHONE_KEY:
                if not re.search(REGEX_IMG_SCR, items_list[0]) or page != 1:
                    items_list = items_list[1:]
            else:
                items_list = items_list[1:]
        # optionally parse only the first N items of the page
        if top_size == 'max':
            top_size = len(items_list)
        for i, item in enumerate(items_list[:top_size]):
            title, summary, cached_url, source, post_time = \
                get_result_info(item)
            if title:
                page_map_list.append(
                    dict(title=title,
                         summary=summary,
                         source=source,
                         sequence_record=i,
                         cached_url=cached_url,
                         post_time=post_time,
                         query_platform='好搜',
                         cache_status=True,
                         query_term=page_result_dict['query_term'],
                         query_type=page_result_dict['query_type']))
    return page_map_list
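A sketch that parses only the first three results of a downloaded page (page_dict as produced by keyword_query below):

top_three = page_parse(page_dict, page=1, top_size=3)
for item in top_three:
    print(item['sequence_record'], item['title'], item['cached_url'])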
Example #6
import requests

def fetch_cache_content(direct_url, proxies=None, headers=None):
    """获取快照真实内容

    :param direct_url: 快照跳转链接
    :param proxies: 代理,同 Requests 参数要求
    :param headers: 请求头,同 Requests 参数要求
    :return: 快照 html 内容
    """
    response_list = url_request(direct_url, [dict()],
                                proxies=proxies,
                                headers=headers)
    try:
        if response_list[0].status_code == requests.codes.ok:
            return response_list[0].content.decode('utf-8', errors='ignore')
        elif response_list[0].status_code == 302:
            return None
        else:
            response_list[0].raise_for_status()
    except Exception as e:
        exception_raise(e, msg=direct_url)
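Chained with fetch_direct_url, given a so.com cached_url string and assuming SO_HEADERS is defined in the module:

direct_url = fetch_direct_url(cached_url, headers=SO_HEADERS)
if isinstance(direct_url, str) and direct_url.startswith('http'):
    html = fetch_cache_content(direct_url, headers=SO_HEADERS)
    if html is None:
        print('snapshot redirected (HTTP 302)')
    else:
        print(html[:80])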
Example #7
import copy
import re
import requests
from requests.exceptions import HTTPError

def keyword_query(keyword_dict, page=1, proxies=None, headers=None):
    """查询关键词

    :param keyword_dict: 查询参数字典
    :param page: 页数
    :param proxies: 代理,同 Requests 参数要求
    :param headers: 请求头,同 Requests 参数要求
    :return: Requests Object dict
    """
    keyword_query_dict = copy.deepcopy(keyword_dict)
    query_type = keyword_query_dict['query_type']
    if query_type in CHECK_TIME_MAP:
        query_term_list = [
            dict(query_term=keyword_query_dict['query_term'],
                 page=page,
                 time=CHECK_TIME_MAP[query_type])
        ]
        query_url = HAOSO_URL_WITH_TIME
    else:
        return False
    content_list = url_request(query_url,
                               query_term_list,
                               proxies=proxies,
                               headers=headers)
    try:
        if content_list[0].status_code == requests.codes.ok:
            if not re.search(
                    REGEX_PAGE_EXCEPTION, content_list[0].content.decode(
                        'utf-8', errors='ignore')):
                keyword_query_dict.update(content=content_list[0])
                return keyword_query_dict
            else:
                print('[PAGE EXCEPTION]:{}, {}, {}'.format(
                    keyword_dict, page, proxies))
                exception_raise(HTTPError('keyword_query(): PAGE EXCEPTION'),
                                code=102)
        else:
            content_list[0].raise_for_status()
    except Exception as e:
        exception_raise(e, msg=keyword_dict)
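keyword_query expects 'query_type' and 'query_term' keys, and a query_type missing from CHECK_TIME_MAP short-circuits to False. A hypothetical call ('news' is an illustrative query_type only):

result = keyword_query(dict(query_type='news', query_term='example'),
                       page=1, headers=SO_HEADERS)
if result is False:
    print('query_type not covered by CHECK_TIME_MAP')
else:
    print(result['content'].status_code)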
Example #8
def main(**kwargs):
    """主函数

    :param kwargs: 传入参数,proxies,data, page...
    :return: 内容字典
    """
    try:
        proxies = kwargs.get('proxies')
        data = kwargs.get('data')
        page = kwargs.get('page')
        # unpack the arguments (dict views are not subscriptable in Python 3)
        query_type = next(iter(data))
        info_dict = dict(query_type=query_type,
                         query_term=data[query_type])
        # download the page
        page_dict = info_query(info_dict, page, proxies)
        if page_dict is False:
            exception_raise(TypeError('info_query() error'))
        # parse the page
        # TODO: filter the final results instead of relying on page for the check
        items_list = info_parse(page_dict, page)
        if items_list is False:
            exception_raise(TypeError('info_parse() error'))
        # exact matching
        identical_list = info_identical(items_list)
        if identical_list is False:
            exception_raise(TypeError('info_identical() error'))
        # filter sensitive words
        filter_list = info_filter(identical_list)
        if filter_list is False:
            exception_raise(TypeError('info_filter() error'))
        # return (dict(haosoukeywordinfo=filter_list))
        # fetch the HTML
        cache_list = fetch_info_cache(filter_list, proxies)
        if cache_list is False:
            exception_raise(TypeError('fetch_info_cache() error'))
        return (dict(haosoukeywordinfo=cache_list),
                dict(haosoukeywordinfo=filter_list))
    except Exception as e:
        if hasattr(e, 'uf_errno') and e.uf_errno:
            raise
        exception_raise(e, code=101)
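A hypothetical invocation of main; the single key of data selects query_type and its value is the search term ('company_phone' is illustrative only):

cache_dict, filter_dict = main(data={'company_phone': '010-12345678'},
                               page=1,
                               proxies=None)
print(len(cache_dict['haosoukeywordinfo']), 'snapshot entries')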