def do(mongodb):
    """Run the proxy_cn_proxy_com crawl job.

    Generator-style coroutine: every ``yield`` hands an asynchronous
    operation to whatever drives this generator and resumes with its result
    (presumably Tornado's coroutine runner -- the decorator is not visible
    in this block, confirm at the definition site).

    Steps:
      1. Copy the current collection into ``<collection_name>_bak`` with an
         aggregation ``$out`` stage, then remove every document from it.
      2. For each entry of ``target_page_struct_list``, fetch its start page,
         extract the IP list from it and store the converted records.
      3. Run ``validate.do`` on the collection.

    Args:
        mongodb: Database handle, indexed by collection name and also passed
            through to ``save_to_db`` and ``validate.do``.
    """
    time_format = "%Y-%m-%d %H:%M:%S"
    # Single-argument print(...) prints the same text under the Python 2
    # print statement and the Python 3 print function.
    print("Job proxy_cn_proxy_com start at %s!"
          % datetime.datetime.now().strftime(time_format))

    collection = mongodb[collection_name]
    # Keep a backup copy of the previous run before wiping the collection.
    yield collection.aggregate(
        [{"$out": "%s_bak" % collection_name}]
    ).to_list(length=None)
    yield collection.remove({})

    for target_page_base in target_page_struct_list:
        start_page = int(target_page_base[1])
        start_page_url = construct_page_url_string(target_page_base[0], start_page)
        start_page_html = yield crawler_page_html(start_page_url, True)
        if not start_page_html:
            # Nothing usable was fetched; feeding a falsy value to the parser
            # would fail, so skip this target instead of aborting the job
            # with the collection already emptied.
            continue

        start_page_html = etree.HTML(start_page_html)
        ip_list = grep_page_ip_list(start_page_html)
        ip_data = convert_ip_list_format(ip_list)
        yield save_to_db(mongodb, ip_data)

    # Validate whether the proxy IPs are usable.
    # NOTE(review): placed after the crawl loop; the original indentation was
    # lost in this view -- confirm it was not meant to run per target.
    yield validate.do(mongodb, collection_name, data_source)

    print("Job proxy_cn_proxy_com done at %s!"
          % datetime.datetime.now().strftime(time_format))
def do(mongodb):
    """Run the proxy_kuaidaili_com crawl job.

    Generator-style coroutine: every ``yield`` hands an asynchronous
    operation to whatever drives this generator and resumes with its result
    (presumably Tornado's coroutine runner -- the decorator is not visible
    in this block, confirm at the definition site).

    Steps:
      1. Copy the current collection into ``<collection_name>_bak`` with an
         aggregation ``$out`` stage, then remove every document from it.
      2. For each entry of ``target_page_struct_list``:
         - fetch the start page, store the IPs found on it, and read the
           site's last page number from it;
         - cap that last page at the entry's third element;
         - fetch every following page up to the cap, storing the IPs of each
           and pausing between requests.
      3. Run ``validate.do`` on the collection.

    Args:
        mongodb: Database handle, indexed by collection name and also passed
            through to ``save_to_db`` and ``validate.do``.
    """
    time_format = "%Y-%m-%d %H:%M:%S"
    # Single-argument print(...) prints the same text under the Python 2
    # print statement and the Python 3 print function.
    print("Job proxy_kuaidaili_com start at %s!"
          % datetime.datetime.now().strftime(time_format))

    collection = mongodb[collection_name]
    # Keep a backup copy of the previous run before wiping the collection.
    yield collection.aggregate(
        [{"$out": "%s_bak" % collection_name}]
    ).to_list(length=None)
    yield collection.remove({})

    for target_page_base in target_page_struct_list:
        start_page = int(target_page_base[1])
        start_page_url = construct_page_url_string(target_page_base[0], start_page)
        start_page_html = yield crawler_page_html(start_page_url, True)
        if not start_page_html:
            # Same handling as a failed follow-up page below: without the
            # start page there is no IP list and no last-page number, so
            # back off briefly and move on to the next target.
            yield tornado_timmer.sleep(1)
            continue

        start_page_html = etree.HTML(start_page_html)
        ip_list = grep_page_ip_list(start_page_html)
        ip_data = convert_ip_list_format(ip_list)
        yield save_to_db(mongodb, ip_data)

        # Never crawl past the configured page limit. The limit is coerced
        # with int() just like the start page, so a string-typed entry is
        # compared numerically instead of by type.
        # NOTE(review): assumes target_page_base[2] is an int or a numeric
        # string -- confirm against where target_page_struct_list is built.
        end_page = min(int(grep_end_page(start_page_html)), int(target_page_base[2]))

        yield tornado_timmer.sleep(1)
        for page in range(start_page + 1, end_page + 1):
            page_url = construct_page_url_string(target_page_base[0], page)
            page_html = yield crawler_page_html(page_url, True)
            if not page_html:
                yield tornado_timmer.sleep(1)
                continue

            page_html = etree.HTML(page_html)
            ip_list = grep_page_ip_list(page_html)
            ip_data = convert_ip_list_format(ip_list)
            yield save_to_db(mongodb, ip_data)
            # Avoid getting blocked: throttle the request rate
            # (alternatively, use a proxy to allow a higher rate).
            yield tornado_timmer.sleep(5)

    # Validate whether the proxy IPs are usable.
    # NOTE(review): placed after the crawl loop; the original indentation was
    # lost in this view -- confirm it was not meant to run per target.
    yield validate.do(mongodb, collection_name, data_source)

    print("Job proxy_kuaidaili_com done at %s!"
          % datetime.datetime.now().strftime(time_format))