Beispiel #1
0
 def _check_one_http_proxy(self, proxy):
     """Check one HTTP proxy: classify its anonymity, then resolve its country.

     Fetches an anonymity-checker page through *proxy*; when the page's
     result matches a known label, queries ip-api.com (also through the
     proxy) for the country and saves the proxy to the DB.  Any fetch
     failure during a recheck deletes the proxy from the DB.
     """
     check_anonymity_url = "http://www.xxorg.com/tools/checkproxy/"
     fetch_result = fetch(check_anonymity_url, proxy)
     response = fetch_result['response']
     if response is None:
         # Proxy could not even reach the checker page.
         if self.recheck:
             delete_proxy_from_db(proxy)
         return
     response.encoding = 'utf-8'
     html = response.text
     result = BeautifulSoup(html, "html5lib").find("div", id="result")
     # Checker-page labels (Chinese) -> canonical anonymity names.
     anonymities = {
         "透明": "transparent",
         "普通匿名": "normal_anonymity",
         "高匿名": "high_anonymity"
     }
     # Iterate key/value pairs directly instead of keys + dict re-lookup.
     for label, level in anonymities.items():
         if label in str(result):
             proxy.anonymity = level
             check_address_url = "http://ip-api.com/json/"
             fetch_result = fetch(check_address_url, proxy)
             response = fetch_result['response']
             if response is None:
                 if self.recheck:
                     delete_proxy_from_db(proxy)
                 return
             try:
                 proxy.country = response.json()['country']
                 proxy.round_trip_time = fetch_result['round_trip_time']
                 save_proxy_to_db(proxy)
             except JSONDecodeError:
                 # ip-api returned a non-JSON body -> drop the proxy.
                 delete_proxy_from_db(proxy)
                 return
             break
Beispiel #2
0
    def _check_one_https_proxy(self, proxy):
        """Check one HTTPS proxy against two reference sites.

        The proxy passes only when the douban fetch returns a body
        containing the expected marker AND cn.investing.com answers with
        HTTP 200; on failure during a recheck its score is zeroed.
        """
        testURL = "https://book.douban.com/"
        fetch_result = fetch(url=testURL, proxy=proxy, proxy_type='https')
        response = fetch_result['response']

        spiderURL = "https://cn.investing.com/"
        spider_fetch_result = fetch(url=spiderURL, proxy=proxy, proxy_type='https')
        spider_response_status = spider_fetch_result['response_status_code']
        if spider_response_status == 200:
            print("proxy cn.investing.com", proxy, spider_response_status)
        # No else branch needed: any non-200 value already fails the check
        # below (the previous code redundantly overwrote it with None here).

        if response is None or spider_response_status != 200:
            logger.info('response is None , proxy:{}'.format(proxy))
            if self.recheck:
                update_proxy_score(proxy, res=0)
            return
        response.encoding = 'utf-8'
        html = response.text
        if "豆瓣读书,新书速递,畅销书,书评,书单" in html:
            # Marker found: the proxy really reached douban, reward it.
            proxy.round_trip_time = fetch_result['round_trip_time']
            update_proxy_score(proxy)
            # save_proxy_to_db(proxy)
        else:
            if self.recheck:
                update_proxy_score(proxy, res=0)
            return
Beispiel #3
0
 def parse_ip_proxy(self, url):
     """Download *url* via a random proxy from the pool and collect the
     HTTPS-capable rows of the page's ``ip_list`` table."""
     picked = random.choice(self.proxypool)
     result = fetch(url, picked)
     resp = result['response']
     if not resp:
         logger.info('response is None , url:{}, proxy:{}'.format(
             url, picked))
         return
     resp.encoding = 'utf-8'
     print('response is status_code:{}, url:{}, proxy:{}'.format(
         resp.status_code, url, picked))
     soup = BeautifulSoup(resp.text, "html5lib")
     rows = soup.find('table', id="ip_list").find('tbody').find_all('tr')[1:]
     for row in rows:
         cells = row.find_all('td')
         ip_and_port = cells[1].string + ":" + cells[2].string
         entry = Proxy_IP(ip_and_port=ip_and_port)
         if cells[4].string == '透明':
             entry.anonymity = 'transparent'
         elif cells[4].string == '高匿':
             entry.anonymity = 'high_anonymity'
         entry.country = 'China'
         # Only rows advertising HTTPS support are kept in the pool.
         if cells[5].string == 'HTTPS':
             entry.type = 'https'
             self.proxy_list.add(entry)
         # NOTE(review): entry.anonymity may be unset for other labels;
         # this log line then relies on the model's default -- confirm.
         logger.info(self.__class__.__name__ + " " + ip_and_port + " " +
                     entry.anonymity)
Beispiel #4
0
def db_proxy():
    """Re-validate the HTTPS proxies stored in the DB.

    Each stored proxy is tried against ``fetch_url``.  Proxies with a good
    historical success rate that answer HTTP 200 without exhausting the
    retry budget are returned; the rest get their score decreased.

    Returns:
        list[dict]: usable proxies as ``{"proxy": url, "proxy_scheme": type}``.
    """
    data = []
    proxies = Proxy_IP.select().where(Proxy_IP.type == 'https').order_by(
        Proxy_IP.timestamp)
    for proxy in proxies:
        r_times = int(proxy.right_times)
        a_times = int(proxy.all_times)
        # Guard against a zero attempt counter (fresh/corrupt rows) instead
        # of raising ZeroDivisionError; treat them as 0% success.
        success_rate = r_times * 1.0 / a_times if a_times else 0.0
        ip_and_port = proxy.ip_and_port
        httptype = proxy.type
        proxyurl = httptype + "://" + ip_and_port
        logger.info("db proxyurl is {}".format(proxyurl))
        fetch_result = fetch(url=fetch_url, proxy=proxyurl, proxy_type='https')
        status_code = fetch_result['response_status_code']
        retry_num = fetch_result['retry_num']
        retry_success_rate = retry_num * 1.0 / RETRY_NUM
        # Keep proxies with >60% overall success whose latest check returned
        # 200 within roughly two of the allowed retries.
        if success_rate > 0.6 and status_code == 200 and retry_success_rate < 0.7:
            update_proxy_score(proxy, res=1)
            one_proxy_data_dic = {
                "proxy": proxyurl,
                "proxy_scheme": proxy.type
            }
            data.append(one_proxy_data_dic)
            logger.info("from db add proxyinfo:{} ".format(one_proxy_data_dic))
        else:
            # Failed check: decrease the score (per the original note, records
            # whose success count drops below 0 are removed by the updater).
            logger.info(
                "proxy success is too low, proxy info:{}, latest response_status_code:{}"
                .format(proxyurl, status_code))
            update_proxy_score(proxy)
    return data
Beispiel #5
0
 def parse_ip_proxy(self, url):
     """Fetch *url* (without a proxy) and add every ip:port row of the
     page's main table to the proxy list."""
     fetch_result = fetch(url)
     response = fetch_result['response']
     if response is None:
         # Guard consistent with the other parsers: without it the next
         # line crashes with AttributeError whenever the fetch failed.
         logger.info('response is None , url:{}'.format(url))
         return
     response.encoding = 'gbk'
     html = response.text
     soup = BeautifulSoup(html, "html5lib")
     trs = soup.find('div', id="main").find('tbody').find_all('tr')[1:]
     for tr in trs:
         tds = tr.find_all('td')
         ip_and_port = tds[0].string + ":" + tds[1].string
         self.proxy_list.add(Proxy_IP(ip_and_port=ip_and_port))
         logger.info(self.__class__.__name__ + " " + ip_and_port)
Beispiel #6
0
 def _check_one_https_proxy(self, proxy):
     """Probe *proxy* against douban; save it when the page marker is
     found, otherwise delete it from the DB during a recheck."""
     testURL = "https://book.douban.com/"
     result = fetch(url=testURL, proxy=proxy, proxy_type='https')
     resp = result['response']
     if resp is None:
         logger.info('response is None , proxy:{}'.format(proxy))
         if self.recheck:
             delete_proxy_from_db(proxy)
         return
     resp.encoding = 'utf-8'
     # Marker absent -> the proxy served something other than douban.
     if "豆瓣读书,新书速递,畅销书,书评,书单" not in resp.text:
         if self.recheck:
             delete_proxy_from_db(proxy)
         return
     proxy.round_trip_time = result['round_trip_time']
     save_proxy_to_db(proxy)
 def run(self):
     """Entry point: fetch the start page and hand a truthy response to
     the parser (a falsy/error response is silently skipped)."""
     result = fetch(self.start_url)
     resp = result['response']
     if resp:
         self.parse_ip_proxy(resp)
Beispiel #8
0
def json_proxy():
    """Re-validate the proxies listed in the JSON file at ``jsonpath``.

    Every public proxy (port != 3888) is tried against ``fetch_url``; its
    DB record is created or re-scored accordingly.  Entries that answered
    HTTP 200 are returned for the caller's pool.

    Returns:
        list[dict]: the JSON entries that passed the check.
    """
    data = []
    # Context manager guarantees the handle is closed even if json.load raises.
    with open(jsonpath, encoding='utf-8') as jsonfile:
        proxylist = json.load(jsonfile)
    if proxylist:
        for proxy in proxylist:
            proxyurl = proxy['proxy']
            # Port 3888 marks private proxies -- skip those.
            pattern = ':3888$'
            if not re.search(pattern, proxyurl):
                fetch_result = fetch(url=fetch_url,
                                     proxy=proxyurl,
                                     proxy_type='https')
                response = fetch_result['response_status_code']
                # Look up an existing DB record for this proxy.
                ip_and_port = proxyurl.split('/')[-1]
                httptype = proxyurl.split(':')[0]
                proxies = Proxy_IP.select().where(
                    Proxy_IP.ip_and_port == ip_and_port,
                    Proxy_IP.type == httptype).first()
                # Object used for scoring/saving below.
                proxyinfo = Proxy_IP(ip_and_port=ip_and_port)
                proxyinfo.ip_and_port = ip_and_port
                proxyinfo.timestamp = datetime.datetime.now()

                if proxies:
                    # Record already in the DB: only adjust its score.
                    if response == 200:
                        update_proxy_score(proxyinfo, res=1)
                        data.append(proxy)
                        logger.info(
                            "from jsonfile add proxyinfo:{} ".format(proxy))
                    else:
                        update_proxy_score(proxyinfo)
                        logger.info(
                            "proxy response is not 200, cancel from jsonfile, proxy info:{} "
                            .format(proxy))
                else:
                    # No record yet: create one with initial counters.
                    proxyinfo.type = 'https'
                    proxyinfo.anonymity = 'high_anonymity'
                    proxyinfo.round_trip_time = '1'
                    proxyinfo.country = 'China'
                    proxyinfo.all_times = '1'
                    proxyinfo.timestamp = datetime.datetime.now()
                    if response == 200:
                        proxyinfo.right_times = '1'
                        save_proxy_to_db(proxyinfo)
                        data.append(proxy)
                        logger.info(
                            "from jsonfile add proxyinfo:{} ".format(proxy))
                    else:
                        # BUG FIX: the failed branch previously also recorded
                        # right_times='1', giving a dead proxy a 100% success
                        # rate in db_proxy's check; record zero successes.
                        proxyinfo.right_times = '0'
                        save_proxy_to_db(proxyinfo)
                        logger.info(
                            "proxy response is not 200, cancel from jsonfile, proxy info:{} "
                            .format(proxy))
    return data