Exemple #1
0
def delete_proxy_from_db(proxy):
    """Remove the stored row whose ``ip_and_port`` matches ``proxy``.

    A log line is written only when ``delete_instance()`` reports exactly one
    deleted row.  If no matching row exists the call is a silent no-op.
    """
    try:
        stored = Proxy_IP.get(Proxy_IP.ip_and_port == proxy.ip_and_port)
        rows_deleted = stored.delete_instance()
        if rows_deleted == 1:
            logger.info("{} deleted from database".format(proxy))
    except DoesNotExist:
        # Nothing is stored under this address, so there is nothing to delete.
        return
Exemple #2
0
def db_proxy():
    """Re-test every stored HTTPS proxy and return the ones that still work.

    Each ``Proxy_IP`` row of type ``'https'`` (oldest ``timestamp`` first) is
    used to fetch ``fetch_url``.  A proxy is accepted when all of the
    following hold:

    * its historical success rate (``right_times / all_times``) is above 0.6,
    * the fetch just performed returned status code 200,
    * the fraction of retries used (``retry_num / RETRY_NUM``) is below 0.7.

    Accepted proxies get ``update_proxy_score(proxy, res=1)``; every other
    proxy gets ``update_proxy_score(proxy)``.

    Returns:
        list of dicts with the keys ``"proxy"`` (``"<type>://<ip:port>"``)
        and ``"proxy_scheme"``.
    """
    data = []
    proxies = Proxy_IP.select().where(Proxy_IP.type == 'https').order_by(
        Proxy_IP.timestamp)
    for proxy in proxies:
        r_times = int(proxy.right_times)
        a_times = int(proxy.all_times)
        # A row with no recorded attempts has no meaningful rate; treat it as
        # 0.0 instead of letting the division raise ZeroDivisionError.
        success_rate = r_times * 1.0 / a_times if a_times > 0 else 0.0
        ip_and_port = proxy.ip_and_port
        httptype = proxy.type
        proxyurl = httptype + "://" + ip_and_port
        logger.info("db proxyurl is {}".format(proxyurl))
        fetch_result = fetch(url=fetch_url, proxy=proxyurl, proxy_type='https')
        response = fetch_result['response_status_code']
        retry_num = fetch_result['retry_num']
        retry_success_rate = retry_num * 1.0 / RETRY_NUM
        # Keep proxies whose overall success rate exceeds 60% and whose latest
        # fetch succeeded while using less than 70% of the retry budget.
        if success_rate > 0.6 and response == 200 and retry_success_rate < 0.7:
            update_proxy_score(proxy, res=1)
            one_proxy_data_dic = {
                "proxy": proxyurl,
                "proxy_scheme": proxy.type
            }
            data.append(one_proxy_data_dic)
            logger.info("from db add proxyinfo:{} ".format(one_proxy_data_dic))
        # Any other proxy is recorded as a failure through update_proxy_score.
        else:
            logger.info(
                "proxy success is too low, proxy info:{}, latest response_status_code:{}"
                .format(proxyurl, response))
            update_proxy_score(proxy)
    return data
Exemple #3
0
def update_proxy_score(proxy, res=0):
    """Record one more attempt for ``proxy`` and adjust its success counter.

    When a row with the same ``ip_and_port`` exists, ``all_times`` is
    incremented, ``timestamp`` is refreshed and ``right_times`` is incremented
    when ``res`` is truthy or decremented otherwise.  If the resulting
    ``right_times`` is zero or negative the row is deleted; otherwise it is
    saved.

    When no such row exists, ``proxy`` itself is saved with ``all_times='1'``
    and ``right_times='0'``.
    """
    try:
        record = Proxy_IP.get(Proxy_IP.ip_and_port == proxy.ip_and_port)
        attempts = int(record.all_times)
        successes = int(record.right_times)
        record.all_times = str(attempts + 1)
        record.timestamp = datetime.datetime.now()
        # Count the outcome of this attempt: +1 on success, -1 on failure.
        successes += 1 if res else -1
        record.right_times = str(successes)
        # The success counter decides whether the row is kept or dropped.
        if successes > 0:
            if record.save() == 1:
                logger.info("{} update from database, new all_times:{}, new right_times:{}"
                            .format(proxy, record.all_times, record.right_times))
        elif record.delete_instance() == 1:
            logger.info(
                "instability proxy:{} deleted from database".format(proxy))
        else:
            logger.info("delete fail, nstability proxy:{}".format(proxy))
    except DoesNotExist:
        # First time this address is seen: store it with one attempt and no
        # successes.
        proxy.all_times = '1'
        proxy.right_times = '0'
        proxy.timestamp = datetime.datetime.now()
        rows_saved = proxy.save()
        if rows_saved == 1:
            logger.info("{} saved into database".format(proxy))
Exemple #4
0
 def __init__(self):
     """Queue listing pages 1 through 99 and load the stored HTTP proxies."""
     super(XicidailiSpider, self).__init__()
     for page in range(1, 100):
         self.url_list.put("http://www.xicidaili.com/wn/{}".format(page))
     # Stored proxies of type 'http', kept on the instance as ``proxypool``.
     self.proxypool = Proxy_IP.select().where(Proxy_IP.type == 'http')
Exemple #5
0
def save_proxy_to_db(proxy):
    """Insert ``proxy`` or refresh the existing row with the same address.

    For an existing row, ``round_trip_time``, ``anonymity`` and ``country``
    are copied from ``proxy`` and ``timestamp`` is set to the current time.
    If no row exists, ``proxy`` itself is saved.  A log line is written only
    when ``save()`` reports exactly one affected row.
    """
    try:
        existing = Proxy_IP.get(Proxy_IP.ip_and_port == proxy.ip_and_port)
        for field in ("round_trip_time", "anonymity", "country"):
            setattr(existing, field, getattr(proxy, field))
        existing.timestamp = datetime.datetime.now()
        rows_saved = existing.save()
        if rows_saved == 1:
            logger.info("{} updated into database".format(existing))
    except DoesNotExist:
        rows_saved = proxy.save()
        if rows_saved == 1:
            logger.info("{} saved into database".format(proxy))
Exemple #6
0
 def parse_ip_proxy(self, url):
     """Fetch ``url`` and add every ``ip:port`` found in its table.

     The page is decoded as GBK.  Rows are read from the ``<tbody>`` inside
     ``<div id="main">``, skipping the first row; the first two cells of each
     row are joined as ``ip:port`` and added to ``self.proxy_list``.

     The method returns without adding anything when the fetch produced no
     response or the expected table is missing.  Rows with fewer than two
     cells, or with an empty ip or port cell, are skipped.
     """
     fetch_result = fetch(url)
     response = fetch_result['response']
     if response is None:
         logger.info('response is None , url:{}'.format(url))
         return
     response.encoding = 'gbk'
     html = response.text
     soup = BeautifulSoup(html, "html5lib")
     main_div = soup.find('div', id="main")
     tbody = main_div.find('tbody') if main_div is not None else None
     if tbody is None:
         logger.info('proxy table not found , url:{}'.format(url))
         return
     # The first row is skipped, as in the original implementation.
     for tr in tbody.find_all('tr')[1:]:
         tds = tr.find_all('td')
         if len(tds) < 2:
             continue
         ip, port = tds[0].string, tds[1].string
         # ``.string`` is None when a cell does not hold a single text node.
         if ip is None or port is None:
             continue
         ip_and_port = ip + ":" + port
         self.proxy_list.add(Proxy_IP(ip_and_port=ip_and_port))
         logger.info(self.__class__.__name__ + " " + ip_and_port)
Exemple #7
0
 def GET(self):
     """Return the stored proxies as a JSON string, optionally filtered.

     Optional query parameters:

     * ``country``   - keep proxies whose ``country`` equals this value.
     * ``type``      - keep proxies whose ``type`` equals this value.
     * ``anonymity`` - keep proxies whose anonymity level is at least the
       requested one.  An unrecognised value disables the filter.
     * ``number``    - maximum number of proxies to return.  Values that are
       not positive integers are ignored.

     Returns:
         A JSON object with ``num`` (number of proxies returned),
         ``updatetime`` (timestamp of the first row ordered by ``timestamp``,
         without fractional seconds, or ``null`` when the table is empty)
         and ``data`` (list of proxy dicts).
     """
     get_input = web.input(_method='get')
     # Missing parameters become None; no bare ``except`` is needed.
     query_country = getattr(get_input, 'country', None)
     query_anonymity = getattr(get_input, 'anonymity', None)
     query_type = getattr(get_input, 'type', None)
     raw_number = getattr(get_input, 'number', None)

     # Query values arrive as text, so the limit must be converted before it
     # can be compared with ``len(data)``.
     try:
         query_number = int(raw_number) if raw_number else None
     except (TypeError, ValueError):
         query_number = None
     if query_number is not None and query_number <= 0:
         query_number = None

     proxies = list(Proxy_IP.select().order_by(Proxy_IP.timestamp))
     # An empty table has no first row to take the timestamp from.
     updatetime = str(proxies[0].timestamp).split('.')[0] if proxies else None

     anonymity_level = {
         "transparent": 0,
         "anonymity": 1,
         "normal_anonymity": 1,
         "high_anonymity": 2
     }
     required_level = anonymity_level.get(query_anonymity)

     data = []
     for proxy in proxies:
         if query_country and proxy.country != query_country:
             continue
         if query_type and proxy.type != query_type:
             continue
         if required_level is not None:
             # NOTE(review): proxies with an unrecognised or missing anonymity
             # value are ranked as level 0 here - confirm this is intended.
             if anonymity_level.get(proxy.anonymity, 0) < required_level:
                 continue
         data.append({
             "ip_and_port": proxy.ip_and_port,
             "country": proxy.country,
             "type": proxy.type,
             "anonymity": proxy.anonymity,
             "round_trip_time": proxy.round_trip_time
         })
         # Stop as soon as the requested number of proxies is collected.
         if query_number is not None and len(data) >= query_number:
             break
     return_dic = {"num": len(data), "updatetime": updatetime, "data": data}
     return json.dumps(return_dic)
Exemple #8
0
 def parse_ip_proxy(self, url):
     """Fetch ``url`` through a random pooled proxy and collect HTTPS proxies.

     One entry of ``self.proxypool`` is picked at random and used for the
     request.  Rows of ``<table id="ip_list">`` are parsed, skipping the
     first row: cells 1 and 2 form ``ip:port``, cell 4 carries the anonymity
     label and cell 5 the protocol.  Only rows whose protocol is ``HTTPS``
     are added to ``self.proxy_list``; every parsed row is logged.

     The method returns early when the response is falsy or the table is
     missing.  Rows with fewer than six cells, or with an empty ip or port
     cell, are skipped.
     """
     outbound_proxy = random.choice(self.proxypool)
     fetch_result = fetch(url, outbound_proxy)
     response = fetch_result['response']
     if not response:
         logger.info('response is None , url:{}, proxy:{}'.format(
             url, outbound_proxy))
         return
     response.encoding = 'utf-8'
     response_status_code = response.status_code
     print('response is status_code:{}, url:{}, proxy:{}'.format(
         response_status_code, url, outbound_proxy))
     html = response.text
     soup = BeautifulSoup(html, "html5lib")
     table = soup.find('table', id="ip_list")
     tbody = table.find('tbody') if table is not None else None
     if tbody is None:
         logger.info('ip_list table not found , url:{}, proxy:{}'.format(
             url, outbound_proxy))
         return
     # The first row is skipped, as in the original implementation.
     for tr in tbody.find_all('tr')[1:]:
         tds = tr.find_all('td')
         if len(tds) < 6:
             continue
         ip, port = tds[1].string, tds[2].string
         # ``.string`` is None when a cell does not hold a single text node.
         if ip is None or port is None:
             continue
         ip_and_port = ip + ":" + port
         # A separate name keeps the outbound proxy above from being shadowed.
         found = Proxy_IP(ip_and_port=ip_and_port)
         anonymity_label = tds[4].string
         if anonymity_label == '高匿':
             found.anonymity = 'high_anonymity'
         elif anonymity_label == '透明':
             found.anonymity = 'transparent'
         found.country = 'China'
         if tds[5].string == 'HTTPS':
             found.type = 'https'
             self.proxy_list.add(found)
         # ``format`` keeps the log line intact even when ``anonymity`` was
         # not set to a string above.
         logger.info("{} {} {}".format(self.__class__.__name__, ip_and_port,
                                       found.anonymity))
 def parse_ip_proxy(self, response):
     """Add every address matched by ``IP_PROXY_REGEX`` in the response body.

     The first capture group of each match is used as ``ip_and_port``; each
     one is added to ``self.proxy_list`` and logged with the class name.
     """
     spider_name = self.__class__.__name__
     for match in re.findall(IP_PROXY_REGEX, response.text):
         address = match[0]
         self.proxy_list.add(Proxy_IP(ip_and_port=address))
         logger.info(spider_name + " " + address)
Exemple #10
0
    response = None
    response_status_code = None
    retry_num = start = end = 0
    for i in range(RETRY_NUM):
        try:
            if proxy is not None:
                kwargs["proxies"] = {proxy_type: str(proxy)}
            start = time.time()
            response = requests.get(url, **kwargs)
            end = time.time()
            if response:
                response_status_code = response.status_code
            break
        except Exception as e:
            time.sleep(1)
            retry_num += 1
            continue
    return {
        "response": response,
        "retry_num": retry_num,
        "round_trip_time": round((end - start), 2),
        "response_status_code": response_status_code
    }


if __name__ == "__main__":
    # Manual smoke test: fetch the anonymity-check page through one
    # hard-coded proxy and print the raw result dict.
    sample_proxy = Proxy_IP(ip_and_port="194.246.105.52:53281")
    outcome = fetch("http://www.xxorg.com/tools/checkproxy/", sample_proxy)
    print("fetch_result", outcome)
Exemple #11
0
def json_proxy():
    """Test the proxies listed in the JSON file and return those that work.

    The file at ``jsonpath`` is expected to hold a list of dicts with a
    ``"proxy"`` URL.  Entries whose URL ends in ``:3888`` are skipped.  Every
    other proxy is used to fetch ``fetch_url`` and then recorded:

    * already stored (same ``ip_and_port`` and scheme): scored through
      ``update_proxy_score``, with ``res=1`` when the status code was 200;
    * not stored yet: saved through ``save_proxy_to_db`` as a Chinese,
      high-anonymity HTTPS proxy.

    Returns:
        list of the original JSON entries whose fetch returned status 200.
    """
    data = []
    # ``with`` closes the file even when json.load raises.
    with open(jsonpath, encoding='utf-8') as jsonfile:
        proxylist = json.load(jsonfile)
    if not proxylist:
        return data

    for proxy in proxylist:
        proxyurl = proxy['proxy']
        # Port 3888 marks a private proxy; those are not tested.
        if re.search(r':3888$', proxyurl):
            continue

        fetch_result = fetch(url=fetch_url,
                             proxy=proxyurl,
                             proxy_type='https')
        succeeded = fetch_result['response_status_code'] == 200

        # Look the proxy up in the DB by address and scheme.
        ip_and_port = proxyurl.split('/')[-1]
        httptype = proxyurl.split(':')[0]
        stored = Proxy_IP.select().where(
            Proxy_IP.ip_and_port == ip_and_port,
            Proxy_IP.type == httptype).first()

        # Build the object handed to the scoring / saving helpers.
        proxyinfo = Proxy_IP(ip_and_port=ip_and_port)
        proxyinfo.timestamp = datetime.datetime.now()

        if stored:
            # Already in the DB: only the score changes.
            update_proxy_score(proxyinfo, res=1 if succeeded else 0)
        else:
            # Not in the DB yet: store it with default metadata.
            proxyinfo.type = 'https'
            proxyinfo.anonymity = 'high_anonymity'
            proxyinfo.round_trip_time = '1'
            proxyinfo.country = 'China'
            proxyinfo.all_times = '1'
            # NOTE(review): right_times is '1' even when the fetch failed,
            # exactly as before; confirm whether a failed first attempt
            # should start at '0' instead.
            proxyinfo.right_times = '1'
            save_proxy_to_db(proxyinfo)

        if succeeded:
            data.append(proxy)
            logger.info("from jsonfile add proxyinfo:{} ".format(proxy))
        else:
            logger.info(
                "proxy response is not 200, cancel from jsonfile, proxy info:{} "
                .format(proxy))
    return data
Exemple #12
0
        response.encoding = 'utf-8'
        html = response.text
        if "豆瓣读书,新书速递,畅销书,书评,书单" in html:
            proxy.round_trip_time = fetch_result['round_trip_time']
            save_proxy_to_db(proxy)
        else:
            if self.recheck:
                delete_proxy_from_db(proxy)
            return

    def _check_one_proxy(self, proxy):
        """Route ``proxy`` to the HTTP checker or, for any other type, the HTTPS one."""
        checker = (self._check_one_http_proxy
                   if proxy.type == 'http' else self._check_one_https_proxy)
        checker(proxy)

    def run(self, ):
        """Spawn one check per entry of ``self.proxies`` on the pool, then join it."""
        candidates = self.proxies
        pool = self.pool
        for candidate in candidates:
            pool.spawn(self._check_one_proxy, candidate)
        pool.join()


if __name__ == "__main__":
    # Script entry point: re-check every stored proxy with ``recheck``
    # switched on.
    logger.info("-------Recheck Start-------")
    rechecker = Check_proxy()
    rechecker.recheck = True
    rechecker.proxies.extend(Proxy_IP.select())
    rechecker.run()
    logger.info("-------Recheck Finish-------")