def delete_proxy_from_db(proxy):
    try:
        saved_proxy = Proxy_IP.get(Proxy_IP.ip_and_port == proxy.ip_and_port)
        if saved_proxy.delete_instance() == 1:
            logger.info("{} deleted from database".format(proxy))
    except DoesNotExist:
        pass
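# The helpers in this file operate on a Proxy_IP peewee model defined
# elsewhere in the project. Below is a minimal sketch of the fields these
# functions assume; the field types and the SQLite backend are assumptions
# inferred from usage (note the counters are stored as strings and cast
# with int() by the callers):
import datetime

from peewee import CharField, DateTimeField, Model, SqliteDatabase

db = SqliteDatabase('proxy.db')  # hypothetical database file


class Proxy_IP(Model):
    ip_and_port = CharField(unique=True)          # "host:port"
    type = CharField(default='http')              # 'http' or 'https'
    anonymity = CharField(default='transparent')  # see anonymity_level in GET()
    country = CharField(null=True)
    round_trip_time = CharField(null=True)
    all_times = CharField(default='1')            # total checks
    right_times = CharField(default='0')          # successful checks
    timestamp = DateTimeField(default=datetime.datetime.now)

    class Meta:
        database = db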
def db_proxy():
    data = []
    proxies = Proxy_IP.select().where(Proxy_IP.type == 'https').order_by(
        Proxy_IP.timestamp)
    for proxy in proxies:
        r_times = int(proxy.right_times)
        a_times = int(proxy.all_times)
        success_rate = r_times * 1.0 / a_times
        ip_and_port = proxy.ip_and_port
        httptype = proxy.type
        proxyurl = httptype + "://" + ip_and_port
        logger.info("db proxyurl is {}".format(proxyurl))
        fetch_result = fetch(url=fetch_url, proxy=proxyurl, proxy_type='https')
        response = fetch_result['response_status_code']
        retry_num = fetch_result['retry_num']
        retry_success_rate = retry_num * 1.0 / RETRY_NUM
        # Keep proxies whose overall success rate exceeds 60% and that
        # succeeded within two attempts in the latest check: with
        # RETRY_NUM = 3, retry_num <= 2 gives retry_success_rate <= 0.67,
        # which passes the < 0.7 test, while retry_num = 3 (all failed)
        # gives 1.0 and fails it.
        if success_rate > 0.6 and response == 200 and retry_success_rate < 0.7:
            update_proxy_score(proxy, res=1)
            one_proxy_data_dic = {
                "proxy": proxyurl,
                "proxy_scheme": proxy.type
            }
            data.append(one_proxy_data_dic)
            logger.info("from db add proxyinfo:{} ".format(one_proxy_data_dic))
        else:
            # Failing proxies get their success count decremented in the DB;
            # update_proxy_score() deletes the record once the count drops
            # to zero or below.
            logger.info(
                "proxy success rate is too low, proxy info:{}, latest response_status_code:{}"
                .format(proxyurl, response))
            # delete_proxy_from_db(proxy)
            update_proxy_score(proxy)
    return data
def update_proxy_score(proxy, res=0):
    try:
        saved_proxy = Proxy_IP.get(Proxy_IP.ip_and_port == proxy.ip_and_port)
        all_times = int(saved_proxy.all_times)
        right_times = int(saved_proxy.right_times)
        saved_proxy.all_times = str(all_times + 1)
        saved_proxy.timestamp = datetime.datetime.now()
        # Adjust the success count according to the latest check result.
        if res:
            saved_proxy.right_times = str(right_times + 1)
        else:
            saved_proxy.right_times = str(right_times - 1)
        # Decide what to do with the proxy based on its success count.
        if int(saved_proxy.right_times) <= 0:
            # Delete the record once the success count drops to zero.
            if saved_proxy.delete_instance() == 1:
                logger.info(
                    "unstable proxy:{} deleted from database".format(proxy))
            else:
                logger.info("delete fail, unstable proxy:{}".format(proxy))
        else:
            if saved_proxy.save() == 1:
                logger.info(
                    "{} updated in database, new all_times:{}, new right_times:{}"
                    .format(proxy, saved_proxy.all_times, saved_proxy.right_times))
    except DoesNotExist:
        proxy.all_times = '1'
        proxy.right_times = '0'
        proxy.timestamp = datetime.datetime.now()
        if proxy.save() == 1:
            logger.info("{} saved into database".format(proxy))
def __init__(self):
    super(XicidailiSpider, self).__init__()
    urls = [
        "http://www.xicidaili.com/wn/{}".format(k) for k in range(1, 100)
    ]
    for url in urls:
        self.url_list.put(url)
    self.proxypool = Proxy_IP.select().where(Proxy_IP.type == 'http')
def save_proxy_to_db(proxy):
    try:
        # Update the existing record if this proxy is already in the DB.
        saved_proxy = Proxy_IP.get(Proxy_IP.ip_and_port == proxy.ip_and_port)
        saved_proxy.round_trip_time = proxy.round_trip_time
        saved_proxy.anonymity = proxy.anonymity
        saved_proxy.country = proxy.country
        saved_proxy.timestamp = datetime.datetime.now()
        if saved_proxy.save() == 1:
            logger.info("{} updated in database".format(saved_proxy))
    except DoesNotExist:
        if proxy.save() == 1:
            logger.info("{} saved into database".format(proxy))
def parse_ip_proxy(self, url):
    fetch_result = fetch(url)
    response = fetch_result['response']
    # This source serves GBK-encoded pages.
    response.encoding = 'gbk'
    html = response.text
    soup = BeautifulSoup(html, "html5lib")
    trs = soup.find('div', id="main").find('tbody').find_all('tr')[1:]
    for tr in trs:
        tds = tr.find_all('td')
        ip_and_port = tds[0].string + ":" + tds[1].string
        self.proxy_list.add(Proxy_IP(ip_and_port=ip_and_port))
        logger.info(self.__class__.__name__ + " " + ip_and_port)
def GET(self):
    get_input = web.input(_method='get')
    # web.input() returns a web.Storage (a dict subclass), so missing
    # parameters can be read with .get() instead of bare try/except blocks.
    query_country = get_input.get('country')
    query_anonymity = get_input.get('anonymity')
    query_number = get_input.get('number')
    query_type = get_input.get('type')

    proxies = Proxy_IP.select().order_by(Proxy_IP.timestamp)
    updatetime = str(proxies[0].timestamp).split('.')[0]
    data = []
    anonymity_level = {
        "transparent": 0,
        "anonymity": 1,
        "normal_anonymity": 1,
        "high_anonymity": 2
    }
    for proxy in proxies:
        if query_country and proxy.country != query_country:
            continue
        if query_type and proxy.type != query_type:
            continue
        if query_anonymity:
            # Only return proxies at least as anonymous as requested.
            if anonymity_level[proxy.anonymity] < anonymity_level[query_anonymity]:
                continue
        one_proxy_data_dic = {
            "ip_and_port": proxy.ip_and_port,
            "country": proxy.country,
            "type": proxy.type,
            "anonymity": proxy.anonymity,
            "round_trip_time": proxy.round_trip_time
        }
        data.append(one_proxy_data_dic)
    if query_number:
        # Query-string values arrive as str; cast before comparing/slicing.
        query_number = int(query_number)
        if query_number < len(data):
            data = data[0:query_number]
    return_dic = {"num": len(data), "updatetime": updatetime, "data": data}
    return json.dumps(return_dic)
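# A hedged usage example for the endpoint above. It assumes the web.py app
# is served on 127.0.0.1:8080 at the site root (host, port and path are
# assumptions; adjust them to the actual deployment):
import json

import requests

resp = requests.get("http://127.0.0.1:8080/",
                    params={"type": "https",
                            "anonymity": "high_anonymity",
                            "number": "5"})
result = json.loads(resp.text)
print(result["num"], result["updatetime"])
for item in result["data"]:
    print(item["ip_and_port"], item["round_trip_time"])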
def parse_ip_proxy(self, url):
    proxy = random.choice(self.proxypool)
    fetch_result = fetch(url, proxy)
    response = fetch_result['response']
    if not response:
        logger.info('response is None, url:{}, proxy:{}'.format(url, proxy))
        return
    response.encoding = 'utf-8'
    response_status_code = response.status_code
    logger.info('response status_code:{}, url:{}, proxy:{}'.format(
        response_status_code, url, proxy))
    html = response.text
    soup = BeautifulSoup(html, "html5lib")
    trs = soup.find('table', id="ip_list").find('tbody').find_all('tr')[1:]
    for tr in trs:
        tds = tr.find_all('td')
        ip_and_port = tds[1].string + ":" + tds[2].string
        # proxy = Proxy_IP(ip_and_port=ip_and_port, type='https')
        proxy = Proxy_IP(ip_and_port=ip_and_port)
        # Anonymity labels on the source page: '高匿' means high anonymity,
        # '透明' means transparent.
        if tds[4].string == '高匿':
            proxy.anonymity = 'high_anonymity'
        elif tds[4].string == '透明':
            proxy.anonymity = 'transparent'
        proxy.country = 'China'
        httptype = tds[5].string
        if httptype == 'HTTPS':
            proxy.type = 'https'
        self.proxy_list.add(proxy)
        logger.info(self.__class__.__name__ + " " + ip_and_port + " " +
                    proxy.anonymity)
def parse_ip_proxy(self, response):
    html = response.text
    for proxy in re.findall(IP_PROXY_REGEX, html):
        self.proxy_list.add(Proxy_IP(ip_and_port=proxy[0]))
        logger.info(self.__class__.__name__ + " " + proxy[0])
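# IP_PROXY_REGEX is defined elsewhere in the project. Because the code above
# indexes proxy[0], re.findall() must be returning tuples, which means the
# pattern contains more than one capture group with the "ip:port" text in
# the first. A hypothetical pattern of the right shape (an assumption, not
# the project's actual regex):
IP_PROXY_REGEX = r'((?:\d{1,3}\.){3}\d{1,3}:\d{1,5})(\s*)'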
def fetch(url, proxy=None, proxy_type='http', **kwargs):
    # Signature reconstructed from the call sites in this project:
    # fetch(url), fetch(url, proxy) and
    # fetch(url=..., proxy=..., proxy_type='https').
    # RETRY_NUM is a module-level constant (3, per the comment in db_proxy()).
    response = None
    response_status_code = None
    retry_num = start = end = 0
    for i in range(RETRY_NUM):
        try:
            if proxy is not None:
                kwargs["proxies"] = {proxy_type: str(proxy)}
            start = time.time()
            response = requests.get(url, **kwargs)
            end = time.time()
            # requests.Response is falsy for 4xx/5xx status codes, so
            # non-2xx responses are retried as failures.
            if response:
                response_status_code = response.status_code
                break
        except Exception:
            time.sleep(1)
            retry_num += 1
            continue
    return {
        "response": response,
        "retry_num": retry_num,
        "round_trip_time": round((end - start), 2),
        "response_status_code": response_status_code
    }


if __name__ == "__main__":
    check_anonymity_url = "http://www.xxorg.com/tools/checkproxy/"
    fetch_result = fetch(check_anonymity_url,
                         Proxy_IP(ip_and_port="194.246.105.52:53281"))
    print("fetch_result", fetch_result)
def json_proxy():
    data = []
    with open(jsonpath, encoding='utf-8') as jsonfile:
        proxylist = json.load(jsonfile)
    if proxylist:
        for proxy in proxylist:
            proxyurl = proxy['proxy']
            # Proxies on port 3888 are private and are skipped here.
            pattern = ':3888$'
            if not re.search(pattern, proxyurl):
                # if proxyurl != "http://192.168.88.176:3888":
                fetch_result = fetch(url=fetch_url, proxy=proxyurl,
                                     proxy_type='https')
                response = fetch_result['response_status_code']
                # Check whether this proxy IP is already in the DB.
                ip_and_port = proxyurl.split('/')[-1]
                httptype = proxyurl.split(':')[0]
                proxies = Proxy_IP.select().where(
                    Proxy_IP.ip_and_port == ip_and_port,
                    Proxy_IP.type == httptype).first()
                # Build the Proxy_IP object.
                proxyinfo = Proxy_IP(ip_and_port=ip_and_port)
                proxyinfo.timestamp = datetime.datetime.now()
                if proxies:
                    # The IP is already in the DB: just update its score.
                    if response == 200:
                        update_proxy_score(proxyinfo, res=1)
                        data.append(proxy)
                        logger.info(
                            "from jsonfile add proxyinfo:{} ".format(proxy))
                    else:
                        update_proxy_score(proxyinfo)
                        logger.info(
                            "proxy response is not 200, cancel from jsonfile, proxy info:{} "
                            .format(proxy))
                else:
                    # The IP is not in the DB yet: create a new record.
                    proxyinfo.type = 'https'
                    proxyinfo.anonymity = 'high_anonymity'
                    proxyinfo.round_trip_time = '1'
                    proxyinfo.country = 'China'
                    proxyinfo.all_times = '1'
                    if response == 200:
                        proxyinfo.right_times = '1'
                        save_proxy_to_db(proxyinfo)
                        data.append(proxy)
                        logger.info(
                            "from jsonfile add proxyinfo:{} ".format(proxy))
                    else:
                        # A failed check must not count as a success.
                        proxyinfo.right_times = '0'
                        save_proxy_to_db(proxyinfo)
                        logger.info(
                            "proxy response is not 200, cancel from jsonfile, proxy info:{} "
                            .format(proxy))
    return data
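# The file at jsonpath is expected to hold a list of proxy dicts. The shape
# below is inferred from proxy['proxy'] above and from the dicts that
# db_proxy() emits; treat it as a sketch, not a guaranteed schema:
#
# [
#     {"proxy": "https://1.2.3.4:8080", "proxy_scheme": "https"},
#     {"proxy": "https://5.6.7.8:3128", "proxy_scheme": "https"}
# ]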
        # (tail of Check_proxy._check_one_https_proxy; the fetch() call that
        # produces `response` and `fetch_result` precedes this excerpt)
        response.encoding = 'utf-8'
        html = response.text
        # The check fetches the Douban Books page; seeing its keyword string
        # means the proxy returned the genuine page rather than an error or
        # hijacked response.
        if "豆瓣读书,新书速递,畅销书,书评,书单" in html:
            proxy.round_trip_time = fetch_result['round_trip_time']
            save_proxy_to_db(proxy)
        else:
            if self.recheck:
                delete_proxy_from_db(proxy)
            return

    def _check_one_proxy(self, proxy):
        if proxy.type == 'http':
            self._check_one_http_proxy(proxy)
        else:
            self._check_one_https_proxy(proxy)

    def run(self):
        # Check all proxies concurrently on the gevent pool.
        for proxy in self.proxies:
            self.pool.spawn(self._check_one_proxy, proxy)
        self.pool.join()


if __name__ == "__main__":
    logger.info("-------Recheck Start-------")
    check_proxy = Check_proxy()
    check_proxy.recheck = True
    proxies = Proxy_IP.select()
    check_proxy.proxies.extend(proxies)
    check_proxy.run()
    logger.info("-------Recheck Finish-------")