def parse_ip_proxy(self, url):
    """Scrape one proxy-listing page and add its entries to ``self.proxy_list``.

    The page is fetched through a proxy picked at random from
    ``self.proxypool``. Every row of the ``ip_list`` table except the first
    is turned into a ``Proxy_IP`` built from the row's cells: cell 1 and
    cell 2 form ``ip:port``, cell 4 is the anonymity label and cell 5 the
    protocol.

    Args:
        url: Address of the listing page to fetch and parse.

    Returns:
        None. Parsed proxies are added to ``self.proxy_list``. The function
        returns early, after logging, when the fetch yields a falsy response
        or the page has no ``ip_list`` table.
    """
    # NOTE(review): the original source had lost its indentation. This
    # version assumes every parsed row is added to self.proxy_list (not only
    # HTTPS rows) -- confirm against the upstream history.
    upstream_proxy = random.choice(self.proxypool)
    fetch_result = fetch(url, upstream_proxy)
    response = fetch_result['response']
    if not response:
        logger.info('response is None , url:{}, proxy:{}'.format(
            url, upstream_proxy))
        return

    response.encoding = 'utf-8'
    logger.info('response is status_code:{}, url:{}, proxy:{}'.format(
        response.status_code, url, upstream_proxy))

    soup = BeautifulSoup(response.text, "html5lib")
    # find() returns None when nothing matches; bail out instead of letting
    # the chained lookup raise AttributeError on an unexpected page.
    table = soup.find('table', id="ip_list")
    tbody = table.find('tbody') if table is not None else None
    if tbody is None:
        logger.info('ip_list table not found , url:{}, proxy:{}'.format(
            url, upstream_proxy))
        return

    # The first row is skipped (presumably the table header).
    for tr in tbody.find_all('tr')[1:]:
        tds = tr.find_all('td')
        if len(tds) < 6:
            # Not a complete data row; cells 1, 2, 4 and 5 are required.
            continue
        ip, port = tds[1].string, tds[2].string
        if ip is None or port is None:
            # .string is None when a cell does not hold exactly one text
            # node, so there is nothing usable to build the address from.
            continue

        ip_and_port = ip + ":" + port
        proxy_ip = Proxy_IP(ip_and_port=ip_and_port)

        anonymity_label = tds[4].string
        if anonymity_label == '高匿':  # site label for "high anonymity"
            proxy_ip.anonymity = 'high_anonymity'
        elif anonymity_label == '透明':  # site label for "transparent"
            proxy_ip.anonymity = 'transparent'
        proxy_ip.country = 'China'

        if tds[5].string == 'HTTPS':
            proxy_ip.type = 'https'

        self.proxy_list.add(proxy_ip)
        # format() instead of "+" so an anonymity value that is not a string
        # (label matched neither branch above) cannot raise TypeError here.
        logger.info("{} {} {}".format(
            self.__class__.__name__, ip_and_port, proxy_ip.anonymity))
def json_proxy():
    """Probe the proxies listed in the JSON file and return the working ones.

    Loads the list stored at ``jsonpath`` and, for every entry whose
    ``'proxy'`` URL does not end in port 3888, fetches ``fetch_url`` through
    that proxy. Entries already present in the ``Proxy_IP`` table are passed
    to ``update_proxy_score``; unknown entries are stored with
    ``save_proxy_to_db``.

    Returns:
        list: The entries from the JSON file whose probe returned status
        200, in file order. Empty when the file holds no entries.
    """
    data = []
    # The context manager closes the file even if json.load raises.
    with open(jsonpath, encoding='utf-8') as jsonfile:
        proxylist = json.load(jsonfile)
    if not proxylist:
        return data

    # Proxies on port 3888 are private proxies and are skipped entirely.
    # Compiled once here rather than on every loop iteration.
    private_proxy = re.compile(':3888$')

    for proxy in proxylist:
        proxyurl = proxy['proxy']
        if private_proxy.search(proxyurl):
            continue

        fetch_result = fetch(url=fetch_url, proxy=proxyurl, proxy_type='https')
        is_ok = fetch_result['response_status_code'] == 200

        # Check whether this proxy is already in the DB, keyed by the last
        # "/"-separated part of the URL and the text before the first ":".
        ip_and_port = proxyurl.split('/')[-1]
        httptype = proxyurl.split(':')[0]
        known = Proxy_IP.select().where(
            Proxy_IP.ip_and_port == ip_and_port,
            Proxy_IP.type == httptype).first()

        # Build the record handed to the scoring / saving helpers.
        proxyinfo = Proxy_IP(ip_and_port=ip_and_port)
        proxyinfo.timestamp = datetime.datetime.now()

        if known:
            # Proxy is already in the DB: only its score is updated.
            if is_ok:
                update_proxy_score(proxyinfo, res=1)
            else:
                update_proxy_score(proxyinfo)
        else:
            # Proxy is not in the DB yet: store it with initial statistics.
            proxyinfo.type = 'https'
            proxyinfo.anonymity = 'high_anonymity'
            proxyinfo.round_trip_time = '1'
            proxyinfo.country = 'China'
            proxyinfo.all_times = '1'
            # NOTE(review): right_times is '1' even when the probe failed,
            # exactly as in the original; this looks like a copy-paste slip
            # ('0' may have been intended) -- confirm before changing.
            proxyinfo.right_times = '1'
            save_proxy_to_db(proxyinfo)

        if is_ok:
            data.append(proxy)
            logger.info(
                "from jsonfile add proxyinfo:{} ".format(proxy))
        else:
            logger.info(
                "proxy response is not 200, cancel from jsonfile, proxy info:{} "
                .format(proxy))
    return data