Esempio n. 1
0
def do(mongodb):
    """Run the ``proxy_cn_proxy_com`` job: back up, wipe and re-crawl proxies.

    Written as a generator: every asynchronous step is yielded to whatever
    drives this function (presumably a Tornado-style coroutine decorator --
    it is not visible in this block).

    Steps, in order:
      1. Copy the current ``collection_name`` documents into
         ``<collection_name>_bak`` through an ``$out`` aggregation stage.
      2. Remove every document from ``collection_name``.
      3. For each entry of ``target_page_struct_list``, fetch its start
         page, extract the proxy list and store it with ``save_to_db``.
      4. Hand the refreshed collection to ``validate.do``.

    :param mongodb: database handle indexed by collection name; also passed
        through to ``save_to_db`` and ``validate.do``.
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S"

    started_at = datetime.datetime.now().strftime(timestamp_format)
    print("Job proxy_cn_proxy_com start at %s!" % started_at)

    # Snapshot the existing documents before clearing the live collection.
    collection = mongodb[collection_name]
    backup_stage = {"$out": "%s_bak" % collection_name}
    yield collection.aggregate([backup_stage]).to_list(length=None)
    yield collection.remove({})

    for page_spec in target_page_struct_list:
        # Entry layout as used here: [0] is handed to the URL builder,
        # [1] is the page number to start from.
        first_page_no = int(page_spec[1])
        first_page_url = construct_page_url_string(page_spec[0],
                                                   first_page_no)

        # NOTE(review): the fetch result is parsed without checking it;
        # confirm crawler_page_html cannot return an empty value here.
        raw_html = yield crawler_page_html(first_page_url, True)
        parsed_page = etree.HTML(raw_html)

        proxies = convert_ip_list_format(grep_page_ip_list(parsed_page))
        yield save_to_db(mongodb, proxies)

    # Verify that the collected proxy IPs are actually usable.
    yield validate.do(mongodb, collection_name, data_source)

    finished_at = datetime.datetime.now().strftime(timestamp_format)
    print("Job proxy_cn_proxy_com done at %s!" % finished_at)
Esempio n. 2
0
def do(mongodb):
    """Run the ``proxy_kuaidaili_com`` job: back up, wipe and re-crawl proxies.

    Written as a generator: every asynchronous step is yielded to whatever
    drives this function (presumably a Tornado-style coroutine decorator --
    it is not visible in this block).

    Steps, in order:
      1. Copy the current ``collection_name`` documents into
         ``<collection_name>_bak`` through an ``$out`` aggregation stage.
      2. Remove every document from ``collection_name``.
      3. For each entry of ``target_page_struct_list``, fetch the start
         page, store its proxies, read the last page number from it and
         walk the remaining pages up to the configured cap.
      4. Hand the refreshed collection to ``validate.do``.

    A page whose fetch returns an empty result is skipped. This now also
    applies to the start page of a target: previously an empty start page
    was passed straight to ``etree.HTML``, aborting the whole job after the
    collection had already been wiped and before validation could run.

    :param mongodb: database handle indexed by collection name; also passed
        through to ``save_to_db`` and ``validate.do``.
    """
    print("Job proxy_kuaidaili_com start at %s!" %
          datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

    # Snapshot the existing documents before clearing the live collection.
    collection = mongodb[collection_name]
    yield collection.aggregate([{
        "$out": "%s_bak" % collection_name
    }]).to_list(length=None)
    yield collection.remove({})

    for target_page_base in target_page_struct_list:
        # Entry layout as used here: [0] is handed to the URL builder,
        # [1] is the first page number, [2] is the highest page to crawl.
        start_page = int(target_page_base[1])
        start_page_url = construct_page_url_string(target_page_base[0],
                                                   start_page)

        start_page_html = yield crawler_page_html(start_page_url, True)
        if not start_page_html:
            # Without the start page the last page number is unknown, so
            # this target is skipped; the pause mirrors the pacing used for
            # failed follow-up pages below.
            print("Job proxy_kuaidaili_com skipped %s: empty response" %
                  start_page_url)
            yield tornado_timmer.sleep(1)
            continue
        start_page_html = etree.HTML(start_page_html)

        ip_list = grep_page_ip_list(start_page_html)
        ip_data = convert_ip_list_format(ip_list)
        yield save_to_db(mongodb, ip_data)

        end_page = int(grep_end_page(start_page_html))

        # NOTE(review): unlike [1], the cap in [2] is compared without
        # int(); if the config ever stores it as a string, Python 2 orders
        # int below str and the cap would silently never apply -- confirm
        # the configured type.
        if end_page > target_page_base[2]:
            end_page = target_page_base[2]

        yield tornado_timmer.sleep(1)
        for page in range(start_page + 1, end_page + 1):
            page_url = construct_page_url_string(target_page_base[0], page)
            page_html = yield crawler_page_html(page_url, True)
            if not page_html:
                # Failed fetch: pause briefly and move on to the next page.
                yield tornado_timmer.sleep(1)
                continue
            page_html = etree.HTML(page_html)

            ip_list = grep_page_ip_list(page_html)
            ip_data = convert_ip_list_format(ip_list)
            yield save_to_db(mongodb, ip_data)

            # Throttle requests to avoid being blocked (alternatively, crawl
            # through a proxy to allow a higher request rate).
            yield tornado_timmer.sleep(5)

    # Verify that the collected proxy IPs are actually usable.
    yield validate.do(mongodb, collection_name, data_source)

    print("Job proxy_kuaidaili_com done at %s!" %
          datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))