Example #1
0
def db_proxy():
    """Re-test the 'https' proxies stored in the DB and return the usable ones.

    Every stored proxy of type 'https' (ordered by timestamp) is fetched
    against ``fetch_url``. A proxy is kept when all of the following hold:

    * its historical success rate (right_times / all_times) is above 0.6,
    * the fresh request returned HTTP 200, and
    * the fraction of retries used (retry_num / RETRY_NUM) is below 0.7.

    Kept proxies are scored up with ``update_proxy_score(proxy, res=1)``;
    all others are scored with ``update_proxy_score(proxy)``.

    Returns:
        list[dict]: one ``{"proxy": <url>, "proxy_scheme": <type>}`` entry
        per proxy that passed the checks.
    """
    data = []
    proxies = Proxy_IP.select().where(Proxy_IP.type == 'https').order_by(
        Proxy_IP.timestamp)
    for proxy in proxies:
        r_times = int(proxy.right_times)
        a_times = int(proxy.all_times)
        # A record that has never been tried (all_times == 0) has no track
        # record; treat its success rate as 0 instead of dividing by zero.
        success_rate = r_times * 1.0 / a_times if a_times else 0.0
        proxyurl = proxy.type + "://" + proxy.ip_and_port
        logger.info("db proxyurl is {}".format(proxyurl))
        fetch_result = fetch(url=fetch_url, proxy=proxyurl, proxy_type='https')
        response = fetch_result['response_status_code']
        retry_num = fetch_result['retry_num']
        retry_success_rate = retry_num * 1.0 / RETRY_NUM
        # Keep proxies whose overall success rate exceeds 60% and whose
        # latest fetch succeeded using few retries (the original note says
        # "within 2 tries when the retry budget is 3" -- RETRY_NUM is defined
        # elsewhere, TODO confirm).
        if success_rate > 0.6 and response == 200 and retry_success_rate < 0.7:
            update_proxy_score(proxy, res=1)
            one_proxy_data_dic = {
                "proxy": proxyurl,
                "proxy_scheme": proxy.type
            }
            data.append(one_proxy_data_dic)
            logger.info("from db add proxyinfo:{} ".format(one_proxy_data_dic))
        # Anything that fails the checks above is scored without res=1.
        # NOTE(review): the original comment said this lowers the success
        # count and removes the record once it drops below 0 -- that logic
        # would live in update_proxy_score, which is not visible here.
        else:
            logger.info(
                "proxy success is too low, proxy info:{}, latest response_status_code:{}"
                .format(proxyurl, response))
            update_proxy_score(proxy)
    return data
Example #2
0
 def __init__(self):
     """Queue listing pages 1..99 and prepare the stored 'http' proxy query.

     Each page URL is pushed onto ``self.url_list`` in ascending page
     order; ``self.proxypool`` holds the query for all ``Proxy_IP`` rows
     whose type is 'http'.
     """
     super(XicidailiSpider, self).__init__()
     page_template = "http://www.xicidaili.com/wn/{}"
     for page_no in range(1, 100):
         self.url_list.put(page_template.format(page_no))
     self.proxypool = Proxy_IP.select().where(Proxy_IP.type == 'http')
Example #3
0
 def GET(self):
     """Return stored proxies as a JSON string, optionally filtered.

     Recognised query parameters (all optional):

     * ``country``   -- keep only proxies whose country equals this value.
     * ``type``      -- keep only proxies whose type equals this value.
     * ``anonymity`` -- keep only proxies whose anonymity level is at
       least the requested one. Unrecognised values are ignored.
     * ``number``    -- maximum number of proxies to return. Values that
       are not a positive integer are ignored.

     Returns:
         str: JSON object with ``num`` (count of returned proxies),
         ``updatetime`` (timestamp of the first row in timestamp order,
         without fractional seconds; empty string when the table is
         empty) and ``data`` (list of proxy dicts).
     """
     get_input = web.input(_method='get')
     # Absent parameters simply default to None; this replaces the bare
     # ``except: pass`` blocks that also hid unrelated errors.
     query_country = getattr(get_input, 'country', None)
     query_anonymity = getattr(get_input, 'anonymity', None)
     query_number = getattr(get_input, 'number', None)
     query_type = getattr(get_input, 'type', None)

     # Query values arrive as text, so the limit must be converted before
     # it can be compared with a list length.
     limit = None
     if query_number:
         try:
             limit = int(query_number)
         except (TypeError, ValueError):
             limit = None
         else:
             if limit <= 0:
                 limit = None

     anonymity_level = {
         "transparent": 0,
         "anonymity": 1,
         "normal_anonymity": 1,
         "high_anonymity": 2
     }
     required_level = None
     if query_anonymity:
         # An unknown level name yields None, i.e. no anonymity filtering,
         # instead of a KeyError.
         required_level = anonymity_level.get(query_anonymity)

     proxies = list(Proxy_IP.select().order_by(Proxy_IP.timestamp))
     # Guard the empty-table case, where indexing the first row would fail.
     if proxies:
         updatetime = str(proxies[0].timestamp).split('.')[0]
     else:
         updatetime = ""

     data = []
     for proxy in proxies:
         # Stop as soon as enough matches were collected.
         if limit is not None and len(data) >= limit:
             break
         if query_country and proxy.country != query_country:
             continue
         if query_type and proxy.type != query_type:
             continue
         if required_level is not None:
             # A stored anonymity value missing from the table is ranked
             # lowest (0) rather than raising.
             if anonymity_level.get(proxy.anonymity, 0) < required_level:
                 continue
         data.append({
             "ip_and_port": proxy.ip_and_port,
             "country": proxy.country,
             "type": proxy.type,
             "anonymity": proxy.anonymity,
             "round_trip_time": proxy.round_trip_time
         })
     return_dic = {"num": len(data), "updatetime": updatetime, "data": data}
     return json.dumps(return_dic)
Example #4
0
def json_proxy():
    """Test the proxies listed in the JSON file and return the working ones.

    The file at ``jsonpath`` is expected to hold a list of dicts that each
    carry a ``"proxy"`` URL. Entries whose URL ends in port 3888 are
    skipped. Every other entry is fetched against ``fetch_url``; entries
    answering with HTTP 200 are returned. As a side effect the DB is kept
    in step: proxies already stored are re-scored via
    ``update_proxy_score`` and unknown ones are inserted via
    ``save_proxy_to_db``.

    Returns:
        list: the original JSON entries whose test request returned 200.
    """
    data = []
    # The context manager closes the file even when json.load raises.
    with open(jsonpath, encoding='utf-8') as jsonfile:
        proxylist = json.load(jsonfile)
    if not proxylist:
        return data

    # Port 3888 marks a private proxy; those are not tested.
    pattern = ':3888$'
    for proxy in proxylist:
        proxyurl = proxy['proxy']
        if re.search(pattern, proxyurl):
            continue

        fetch_result = fetch(url=fetch_url,
                             proxy=proxyurl,
                             proxy_type='https')
        response = fetch_result['response_status_code']

        # Check whether this proxy is already stored in the DB.
        ip_and_port = proxyurl.split('/')[-1]
        httptype = proxyurl.split(':')[0]
        proxies = Proxy_IP.select().where(
            Proxy_IP.ip_and_port == ip_and_port,
            Proxy_IP.type == httptype).first()

        # Build the object handed to the score/save helpers.
        proxyinfo = Proxy_IP(ip_and_port=ip_and_port)
        proxyinfo.timestamp = datetime.datetime.now()

        if proxies:
            # Already in the DB: only adjust its score.
            # NOTE(review): the freshly built object, not the fetched row,
            # is passed on -- confirm update_proxy_score looks the record
            # up by ip_and_port.
            if response == 200:
                update_proxy_score(proxyinfo, res=1)
                data.append(proxy)
                logger.info(
                    "from jsonfile add proxyinfo:{} ".format(proxy))
            else:
                update_proxy_score(proxyinfo)
                logger.info(
                    "proxy response is not 200, cancel from jsonfile, proxy info:{} "
                    .format(proxy))
            continue

        # Not in the DB yet: store it with default attributes.
        # NOTE(review): type is stored as 'https' although the lookup above
        # used the URL's own scheme -- confirm this is intended.
        proxyinfo.type = 'https'
        proxyinfo.anonymity = 'high_anonymity'
        proxyinfo.round_trip_time = '1'
        proxyinfo.country = 'China'
        proxyinfo.all_times = '1'
        # NOTE(review): right_times is '1' for both successful and failed
        # requests, as in the original -- confirm a failure should not
        # start at '0'.
        proxyinfo.right_times = '1'
        save_proxy_to_db(proxyinfo)
        if response == 200:
            data.append(proxy)
            logger.info(
                "from jsonfile add proxyinfo:{} ".format(proxy))
        else:
            logger.info(
                "proxy response is not 200, cancel from jsonfile, proxy info:{} "
                .format(proxy))
    return data
Example #5
0
        response.encoding = 'utf-8'
        html = response.text
        if "豆瓣读书,新书速递,畅销书,书评,书单" in html:
            proxy.round_trip_time = fetch_result['round_trip_time']
            save_proxy_to_db(proxy)
        else:
            if self.recheck:
                delete_proxy_from_db(proxy)
            return

    def _check_one_proxy(self, proxy):
        """Dispatch *proxy* to the checker that matches its type.

        Proxies whose ``type`` is 'http' go to ``_check_one_http_proxy``;
        every other type goes to ``_check_one_https_proxy``.
        """
        is_plain_http = proxy.type == 'http'
        checker = (self._check_one_http_proxy
                   if is_plain_http else self._check_one_https_proxy)
        checker(proxy)

    def run(self):
        """Check every proxy in ``self.proxies`` through the worker pool.

        One ``_check_one_proxy`` task is spawned on ``self.pool`` per
        proxy, then the call blocks on ``self.pool.join()``.
        """
        for candidate in self.proxies:
            self.pool.spawn(self._check_one_proxy, candidate)
        self.pool.join()


if __name__ == "__main__":
    # Script entry point: re-check every proxy currently stored in the DB.
    logger.info("-------Recheck Start-------")
    check_proxy = Check_proxy()
    # NOTE(review): the visible checker code deletes a proxy that fails
    # validation when ``recheck`` is set -- confirm against the full class.
    check_proxy.recheck = True
    # Feed all stored proxies to the checker, then run the checks.
    check_proxy.proxies.extend(Proxy_IP.select())
    check_proxy.run()
    logger.info("-------Recheck Finish-------")