Esempio n. 1
0
 def parse_ip_proxy(self, url):
     """Scrape the proxy table at ``url`` and collect its HTTPS proxies.

     The page is fetched through a proxy picked at random from
     ``self.proxypool``. Each data row of the ``table#ip_list`` element is
     turned into a ``Proxy_IP``; rows whose type column reads ``HTTPS`` are
     added to ``self.proxy_list``.

     Args:
         url: Address of the listing page to scrape.

     Returns:
         None. Results are accumulated in ``self.proxy_list``. The method
         returns early when the fetch yields a falsy response or when the
         page does not contain the expected table.
     """
     outbound_proxy = random.choice(self.proxypool)
     fetch_result = fetch(url, outbound_proxy)
     response = fetch_result['response']
     if not response:
         logger.info('response is None , url:{}, proxy:{}'.format(
             url, outbound_proxy))
         return
     response.encoding = 'utf-8'
     print('response is status_code:{}, url:{}, proxy:{}'.format(
         response.status_code, url, outbound_proxy))
     soup = BeautifulSoup(response.text, "html5lib")

     # find() returns None when the element is absent (e.g. an error or
     # anti-bot page), so check before chaining further lookups.
     table = soup.find('table', id="ip_list")
     tbody = table.find('tbody') if table is not None else None
     if tbody is None:
         logger.info('ip_list table not found, url:{}, proxy:{}'.format(
             url, outbound_proxy))
         return

     # The first row is skipped, as in the original slice [1:]
     # (presumably the header row -- confirm against the page layout).
     for tr in tbody.find_all('tr')[1:]:
         tds = tr.find_all('td')
         if len(tds) < 6:
             # Columns 1, 2, 4 and 5 are read below; skip malformed rows.
             continue
         ip, port = tds[1].string, tds[2].string
         if ip is None or port is None:
             # .string is None when the cell is not a single text node.
             continue
         ip_and_port = ip + ":" + port
         candidate = Proxy_IP(ip_and_port=ip_and_port)
         anonymity_label = tds[4].string
         if anonymity_label == '高匿':
             candidate.anonymity = 'high_anonymity'
         elif anonymity_label == '透明':
             candidate.anonymity = 'transparent'
         candidate.country = 'China'
         if tds[5].string == 'HTTPS':
             candidate.type = 'https'
             self.proxy_list.add(candidate)
         # format() keeps the log line from raising if anonymity was not
         # set by either branch above.
         logger.info("{} {} {}".format(
             self.__class__.__name__, ip_and_port, candidate.anonymity))
Esempio n. 2
0
def json_proxy():
    """Check the proxies listed in the JSON file and record the results.

    Loads the list stored at ``jsonpath``; every entry is expected to have
    a ``'proxy'`` key holding the proxy URL. URLs ending in ``:3888`` are
    skipped. Each remaining proxy is used to fetch ``fetch_url``; then:

    * if a ``Proxy_IP`` row with the same address and scheme already
      exists, its score is updated via ``update_proxy_score``;
    * otherwise a new ``Proxy_IP`` record is stored via
      ``save_proxy_to_db``.

    Returns:
        list: The JSON entries whose check returned status code 200.
    """
    data = []
    # Context manager closes the file even if json.load raises.
    with open(jsonpath, encoding='utf-8') as jsonfile:
        proxylist = json.load(jsonfile)
    if not proxylist:
        return data

    # Proxies on port 3888 are private ones and are not checked.
    private_port = re.compile(':3888$')

    for proxy in proxylist:
        proxyurl = proxy['proxy']
        if private_port.search(proxyurl):
            continue

        fetch_result = fetch(url=fetch_url,
                             proxy=proxyurl,
                             proxy_type='https')
        succeeded = fetch_result['response_status_code'] == 200

        # Look up whether this proxy is already stored in the DB.
        ip_and_port = proxyurl.split('/')[-1]
        httptype = proxyurl.split(':')[0]
        existing = Proxy_IP.select().where(
            Proxy_IP.ip_and_port == ip_and_port,
            Proxy_IP.type == httptype).first()

        # Build the record describing this check.
        proxyinfo = Proxy_IP(ip_and_port=ip_and_port)
        # NOTE(review): likely redundant with the constructor argument;
        # kept in case Proxy_IP does not apply it -- confirm.
        proxyinfo.ip_and_port = ip_and_port
        proxyinfo.timestamp = datetime.datetime.now()

        if existing:
            # Already in the DB: only the score is updated.
            if succeeded:
                update_proxy_score(proxyinfo, res=1)
            else:
                update_proxy_score(proxyinfo)
        else:
            # Not in the DB yet: store a fresh record for this first check.
            proxyinfo.type = 'https'
            proxyinfo.anonymity = 'high_anonymity'
            proxyinfo.round_trip_time = '1'
            proxyinfo.country = 'China'
            proxyinfo.all_times = '1'
            # A failed first check must not be counted as a success
            # (the original stored '1' in both branches).
            proxyinfo.right_times = '1' if succeeded else '0'
            save_proxy_to_db(proxyinfo)

        if succeeded:
            data.append(proxy)
            logger.info(
                "from jsonfile add proxyinfo:{} ".format(proxy))
        else:
            logger.info(
                "proxy response is not 200, cancel from jsonfile, proxy info:{} "
                .format(proxy))
    return data