def get_html_proxies(self, page_html):
    """Parse a listing page and lazily yield one ``Proxy`` per matched row.

    The page is parsed with ``etree.HTML`` (presumably ``lxml.etree`` --
    confirm against the file's imports). Row nodes are selected with
    ``self.group_xpath``; for each row the ``'ip'``, ``'port'`` and
    ``'area'`` expressions in ``self.detail_xpath`` are evaluated relative
    to that row and reduced to a single value by ``self.get_list_first``.

    :param page_html: HTML source of the page to parse.
    :return: generator yielding ``Proxy(ip, port, area=area)`` objects.
    """
    document = etree.HTML(page_html)
    for row in document.xpath(self.group_xpath):
        # Evaluate the three detail expressions in a fixed order:
        # ip first, then port, then area.
        fields = {
            key: self.get_list_first(row.xpath(self.detail_xpath[key]))
            for key in ('ip', 'port', 'area')
        }
        yield Proxy(fields['ip'], fields['port'], area=fields['area'])
def find_all(self):
    """Lazily yield every proxy stored in ``self.proxies``.

    ``self.proxies`` is presumably a pymongo collection -- confirm against
    the class constructor. Each document returned by ``find()`` has its
    ``'_id'`` key removed, and the remaining key/value pairs are passed as
    keyword arguments to ``Proxy``.

    :return: generator of ``Proxy`` objects, one per stored document.
    """
    for document in self.proxies.find():
        # '_id' is dropped before the remaining keys are handed to Proxy.
        del document['_id']
        yield Proxy(**document)
def limit_find(self, conditions=None, count=0):
    """Query proxies matching ``conditions``, best candidates first.

    Results are sorted by ``score`` descending, then by ``speed``
    ascending, so the highest-quality proxies come first.

    :param conditions: filter document passed to ``self.proxies.find``.
        ``None`` (the default) means an empty filter, i.e. match everything.
    :param count: forwarded as ``limit`` to ``find``. NOTE(review): per the
        pymongo documentation a limit of 0 means "no limit" -- confirm
        against the pymongo version in use.
    :return: list of ``Proxy`` objects built from the matching documents.
    """
    # Build a fresh dict per call instead of using a mutable ``{}`` default,
    # which would be one object shared by every call of this method.
    query = {} if conditions is None else conditions

    cursor = self.proxies.find(query, limit=count).sort([
        ('score', pymongo.DESCENDING),
        ('speed', pymongo.ASCENDING),
    ])

    proxy_list = []
    for item in cursor:
        # '_id' is removed before the remaining keys are handed to Proxy.
        item.pop('_id')
        proxy_list.append(Proxy(**item))
    return proxy_list
if res.ok: #响应速度 speed = round(cost_time, 2) #转换为字典 res_dict = json.loads(res.text) #获取请求来源ip origin_ip = res_dict['origin'] #获取响应请求头中'Proxy-Connection',若有,说明是匿名代理 proxy_connection = res_dict['headers'].get('Proxy-Conntion', None) if "," in origin_ip: #如果响应内容中的源ip中有‘,’分割的两个ip的话及时透明代理ip nick_type = 2 #透明 elif proxy_connection: #'Proxy-Connection'存在说明是匿名ip nick_type = 1 #匿名 else: nick_type = 0 #高匿 return True, nick_type, speed else: return False, nick_type, speed except Exception as e: #logger.exception(e) return False, nick_type, speed if __name__ == '__main__': proxy = Proxy('180.104.62.199', '9000') result = check_proxy(proxy) print(result)
把指定域名添加到指定IP的disable_domain列表中,没有才添加 ''' count = self.proxies.count_documents({ '_id': ip, 'disable_domains': domain }) if count == 0: self.proxies.update_one({'_id': ip}, {'$push': { 'disable_domains': domain }}) if __name__ == '__main__': mongo = MongoPool() #插入测试 proxy = Proxy('202.104.113.32', '53281') mongo.insert(proxy) #更新测试 #proxy = Proxy('202.104.113.32','8888') #mongo.update(proxy) #删除测试 #proxy = Proxy('202.104.113.32','8888') #mongo.delete(proxy) #查询所有测试 #for proxy in mongo.find_all(): #print(proxy)
if res.status_code == 200: # 响应速度 speed = round(cost_time, 2) # 转换为字典 res_dict = json.loads(res.text) # 获取请求来源ip origin_ip = res_dict['origin'] # 获取响应请求头中'Proxy-Connection',若有,说明是匿名代理 proxy_connection = res_dict['headers'].get('Proxy-Conntion', None) if "," in origin_ip: # 如果响应内容中的源ip中有‘,’分割的两个ip的话及时透明代理ip nick_type = 2 # 透明 elif proxy_connection: # 'Proxy-Connection'存在说明是匿名ip nick_type = 1 # 匿名 else: nick_type = 0 # 高匿 return True, nick_type, speed else: return False, nick_type, speed except Exception as e: # logger.exception(e) return False, nick_type, speed if __name__ == '__main__': proxy = Proxy('60.13.42.94', '9999') result = check_proxy(proxy) print(result)