def find_all(self):
    """Yield every proxy stored in the collection as a Proxy object."""
    for document in self.proxies.find():
        # Mongo's internal '_id' key is not a Proxy constructor argument.
        document.pop('_id')
        yield Proxy(**document)
def find(self, conditions=None, count=0):
    """Query proxy IPs matching a condition.

    :param conditions: dict-style MongoDB filter; None means match all
    :param count: how many documents to fetch; 0 fetches everything
    :return: list of Proxy, sorted by score descending then speed ascending
    """
    query = {} if conditions is None else conditions
    cursor = self.proxies.find(query, limit=count).sort([
        ("score", pymongo.DESCENDING),
        ("speed", pymongo.ASCENDING),
    ])
    proxies = []
    for doc in cursor:
        proxies.append(
            Proxy(
                doc['ip'], doc['port'],
                score=doc['score'],
                protocol=doc['protocol'],
                nick_type=doc['nick_type'],
                speed=doc['speed'],
                disable_domains=doc['disable_domains'],
            )
        )
    return proxies
def find_all(self):
    """Iterate over every proxy-IP document in the collection."""
    cursor = self.proxies.find()
    for record in cursor:
        # Strip the Mongo-only '_id' key before unpacking into Proxy.
        del record['_id']
        yield Proxy(**record)
def find_all(self):
    """Generator over all stored proxies, one Proxy object at a time."""
    for doc in self.proxies.find():
        doc.pop('_id')  # '_id' is database metadata, not a Proxy field
        yield Proxy(**doc)
def find_all(self):
    """Yield every stored proxy as a Proxy object."""
    cursor = self.proxies.find()
    for entry in cursor:
        # Remove the '_id' key so the remaining keys map onto Proxy's
        # constructor parameters.
        entry.pop('_id')
        yield Proxy(**entry)
def find_all(self):
    """Query the database and yield each proxy document as a Proxy."""
    for row in self.proxies.find():
        # Drop the Mongo-internal '_id' field first.
        del row['_id']
        yield Proxy(**row)
def find_all(self):
    """Yield all proxies; documents carry '_id' but Proxy does not."""
    cursor = self.proxies.find()
    for document in cursor:
        document.pop('_id')
        yield Proxy(**document)
def find_all(self):
    """Query every proxy IP in the collection and yield Proxy objects."""
    for doc in self.proxies.find():
        # The '_id' key must go before **-unpacking into Proxy.
        doc.pop('_id')
        yield Proxy(**doc)
def find_all(self):
    """Yield every proxy record from the database as a Proxy object."""
    cursor = self.proxies.find()
    for record in cursor:
        del record['_id']  # discard the Mongo-only `_id` key
        yield Proxy(**record)
def find_all(self):
    """Return every stored proxy as a list of Proxy objects."""
    proxies = []
    for doc in self.proxies.find():
        # '_id' is database metadata and not accepted by Proxy.
        del doc['_id']
        proxies.append(Proxy(**doc))
    return proxies
def get_proxies_from_page(self, page):
    """Parse the HTML page and yield a Proxy per matched row."""
    root = etree.HTML(page)
    for row in root.xpath(self.group_xpath):
        ip = self.get_first_from_list(row.xpath(self.detail_xpath['ip']))
        port = self.get_first_from_list(row.xpath(self.detail_xpath['port']))
        area = self.get_first_from_list(row.xpath(self.detail_xpath['area']))
        yield Proxy(ip, port, area)
def find_all(self):
    """Look up all proxy IPs and yield them as Proxy objects."""
    cursor = self.proxies.find()
    for document in cursor:
        # Remove the redundant '_id' field so the document's remaining
        # keys can be unpacked straight into the Proxy constructor.
        document.pop('_id')
        yield Proxy(**document)
def find_all(self):
    """Yield every proxy IP stored in the collection.

    :return: generator of Proxy objects
    """
    for doc in self.proxies.find():
        doc.pop('_id')
        yield Proxy(**doc)
def find(self, conditions=None, count=0):
    """Query proxies matching *conditions*.

    :param conditions: optional MongoDB filter dict (default: match all)
    :param count: maximum documents to return; 0 means no limit
    :return: list of Proxy, ordered by score desc, then speed asc
    """
    # Fix: the original used a mutable default argument (conditions={}),
    # which is shared across calls; use None and create a fresh dict.
    if conditions is None:
        conditions = {}
    cursor = self.proxies.find(conditions, limit=count).sort([
        ('score', pymongo.DESCENDING),
        ('speed', pymongo.ASCENDING),
    ])
    proxy_list = []
    for item in cursor:
        # '_id' is Mongo metadata, not a Proxy constructor argument.
        item.pop('_id')
        proxy_list.append(Proxy(**item))
    return proxy_list
def parse_proxies_from_page(self, page):
    """Parse the page, extract proxy details, and yield Proxy objects."""
    root = etree.HTML(page)
    rows = root.xpath(self.group_xpath)
    for row in rows:
        ip = self.get_first_from_page(row.xpath(self.detail_xpath['ip']))
        port = self.get_first_from_page(row.xpath(self.detail_xpath['port']))
        area = self.get_first_from_page(row.xpath(self.detail_xpath['area']))
        yield Proxy(ip, port, area=area)
def get_proxies_from_page(self, page):
    """Parse the page HTML and yield one Proxy per matched row.

    A field whose XPath matches nothing yields None, mirroring the
    original conditional expressions.

    :param page: HTML text of a listing page
    """
    html = etree.HTML(page)
    for tr in html.xpath(self.group_xpath):
        # Fix: the original evaluated every XPath expression twice per
        # row (once for the value, once for the length test); evaluate
        # each one exactly once.
        fields = {}
        for key in ('ip', 'port', 'area'):
            matches = tr.xpath(self.detail_xpath[key])
            fields[key] = matches[0] if matches else None
        yield Proxy(ip=fields['ip'], port=fields['port'],
                    area=fields['area'])
def find_all(self):
    """Yield every proxy IP document in the database as a Proxy."""
    for record in self.proxies.find():
        # Drop the '_id' key/value pair before unpacking.
        record.pop('_id')
        yield Proxy(**record)
def find_all(self):
    """Query all stored IP proxies.

    :return: generator of Proxy objects
    """
    cursor = self.proxies.find()
    for doc in cursor:
        doc.pop('_id')  # not part of the Proxy constructor
        yield Proxy(**doc)
def get_proxies_from_page(self, page):
    """Parse the page, extract proxy data, and yield Proxy objects."""
    root = etree.HTML(page)
    # Tags that each contain one proxy's details.
    rows = root.xpath(self.group_xpath)
    for row in rows:
        # NOTE(review): helper name is spelled `get_frist_from_list`
        # elsewhere in this project; kept as-is to match its definition.
        ip = self.get_frist_from_list(row.xpath(self.detail_xpath['ip']))
        port = self.get_frist_from_list(row.xpath(self.detail_xpath['port']))
        area = self.get_frist_from_list(row.xpath(self.detail_xpath['area']))
        yield Proxy(ip, port, area=area)
def limit_find(self, conditions=None, count=0):
    """Query proxies by condition with an optional result limit.

    Results are sorted score-descending then speed-ascending so the
    best proxies come first.

    :param conditions: optional MongoDB filter dict (default: match all)
    :param count: maximum documents to return; 0 means no limit
    :return: list of Proxy
    """
    # Fix: replace the mutable default argument (conditions={}) —
    # a dict default is shared across every call of the method.
    if conditions is None:
        conditions = {}
    cursor = self.proxies.find(conditions, limit=count).sort([
        ('score', pymongo.DESCENDING),
        ('speed', pymongo.ASCENDING),
    ])
    proxy_list = []
    for item in cursor:
        item.pop('_id')  # Mongo metadata; not a Proxy field
        proxy_list.append(Proxy(**item))
    return proxy_list
def get_proxies_from_page(self, page):
    """Parse the page and yield the proxies found in it."""
    root = etree.HTML(page)
    # Each matched tag holds one proxy's IP/port/area details.
    for row in root.xpath(self.group_xpath):
        ip = self.get_first_from_list(row.xpath(self.detail_xpath['ip']))
        port = self.get_first_from_list(row.xpath(self.detail_xpath['port']))
        area = self.get_first_from_list(row.xpath(self.detail_xpath['area']))
        yield Proxy(ip, port, area=area)
def get_data(self, data):
    """POST *data* to self.url and yield Proxy objects parsed from the
    HTML fragment embedded in the JSON response.

    :param data: form payload for the POST request
    """
    response = requests.post(self.url, data=data,
                             headers=get_request_header())
    content = response.content.decode()
    # Fix: the original bound the parsed payload to `dict`, shadowing
    # the builtin type; use a descriptive name instead.
    payload = json.loads(content)
    html = etree.HTML(payload['ret_data']['html'])
    for tr in html.xpath(self.group_xpath):
        ip = tr.xpath(self.detail_xpath['ip'])[0]
        port = tr.xpath(self.detail_xpath['port'])[0]
        area = tr.xpath(self.detail_xpath['area'])[0]
        yield Proxy(ip, port, area=area)
def get_proxies_from_page(self, page):
    """Parse the page and yield Proxy objects; some rows lack an area."""
    root = etree.HTML(page)
    # Tags that each contain one proxy's information.
    for row in root.xpath(self.group_xpath):
        ip = row.xpath(self.detail_xpath["ip"])[0]
        port = row.xpath(self.detail_xpath["port"])[0]
        area_matches = row.xpath(self.detail_xpath["area"])
        # Some rows have no area cell; fall back to an empty string.
        area = area_matches[0] if area_matches else ""
        yield Proxy(ip, port, area=area)
def find(self, conditions=None, count=0):
    """Query the proxy pool by condition.

    :param conditions: optional key/value filter dict (default: match all)
    :param count: how many proxies to return; 0 means no limit
    :return: list of Proxy, sorted by score desc then speed asc
    """
    # Fix: avoid a mutable default argument — a dict default object is
    # created once and shared by every call.
    if conditions is None:
        conditions = {}
    # Sort the pool: score descending, speed ascending.
    cursor = self.proxies.find(conditions, limit=count).sort(
        [('score', pymongo.DESCENDING), ('speed', pymongo.ASCENDING)])
    proxy_list = []
    for item in cursor:
        item.pop('_id')  # Mongo metadata; not a Proxy field
        proxy_list.append(Proxy(**item))
    return proxy_list
def get_proxies_from_page(self, page):
    """Parse the page; each row stores "ip:port" in a single cell."""
    root = etree.HTML(page)
    for row in root.xpath(self.group_xpath):
        endpoint = str(row.xpath(self.detail_xpath['ip_and_port'])[0])
        parts = endpoint.split(':')
        ip, port = parts[0], parts[1]
        # The area cell is sometimes missing entirely.
        try:
            area = row.xpath(self.detail_xpath['area'])[0]
        except Exception:
            area = None
        yield Proxy(ip, port, area=area)
def get_proxies_from_page(self, page):
    """Parse the page, extract proxy data, and yield Proxy objects.

    :param page: HTML text of a listing page
    """
    element = etree.HTML(page)
    # Fix: removed the leftover debug `print(element.text)` and the
    # commented-out `print(trs, "666")` — debug output does not belong
    # in a production parser.
    trs = element.xpath(self.group_xpath)
    # Walk the rows, pulling out each proxy's details.
    for tr in trs:
        ip = self.get_first_from_list(tr.xpath(self.detail_xpath['ip']))
        port = self.get_first_from_list(tr.xpath(self.detail_xpath['port']))
        area = self.get_first_from_list(tr.xpath(self.detail_xpath['area']))
        yield Proxy(ip, port, area=area)
def find(self, conditions=None, count=0):
    """Query proxies matching a condition.

    :param conditions: optional query-filter dict (default: match all)
    :param count: maximum number of IPs to return; 0 means no limit
    :return: list of Proxy satisfying the filter
    """
    # Fix: replace the mutable default argument (conditions={}) with
    # None — a dict default is shared across calls.
    if conditions is None:
        conditions = {}
    cursor = self.proxies.find(conditions, limit=count).sort(
        [('score', pymongo.DESCENDING), ('speed', pymongo.ASCENDING)])
    # Collect the matching proxies.
    proxy_list = []
    for item in cursor:
        item.pop('_id')  # Mongo metadata; not a Proxy field
        proxy_list.append(Proxy(**item))
    return proxy_list
def get_proxies_from_page(self, page):
    """Parse the page, extract proxy data, and yield Proxy objects.

    :param page: HTML text of a listing page
    """
    element = etree.HTML(page)
    # Fix: removed two leftover debug `print(...)` calls — debug output
    # does not belong in a production parser.
    trs = element.xpath(self.group_xpath)
    for tr in trs:
        # xpath() returns a list; take the first match for each field.
        ip = tr.xpath(self.detail_xpath['ip'])[0]
        port = tr.xpath(self.detail_xpath['port'])[0]
        area = tr.xpath(self.detail_xpath['area'])[0]
        yield Proxy(ip, port, area=area)
def find(self, conditions=None, count=0):
    """Query proxies by the given condition.

    :param conditions: optional query-filter dict (default: match all)
    :param count: number of results to return; 0 means no limit
    :return: list of Proxy, sorted by score desc then idle asc
    """
    # Fix: replace the mutable default argument (conditions={}) —
    # a dict default object is shared across every call.
    if conditions is None:
        conditions = {}
    cursor = self.collection.find(conditions, limit=count).sort([
        ('score', pymongo.DESCENDING),
        ('idle', pymongo.ASCENDING),
    ])
    proxy_list = []
    for item in cursor:
        item.pop('_id')  # Mongo metadata; not a Proxy field
        proxy_list.append(Proxy(**item))
    return proxy_list
def get_proxies_from_page(self, page):
    """Parse the page and yield Proxy objects; the area cell may be
    missing on some rows.

    :param page: HTML text of a listing page
    """
    element = etree.HTML(page)
    # Tags that each hold one proxy's details.
    trs = element.xpath(self.group_xpath)
    for tr in trs:
        ip = tr.xpath(self.detail_xpath['ip'])[0].strip()
        port = tr.xpath(self.detail_xpath['port'])[0].strip()
        # Fix: the original caught every Exception here; only an empty
        # XPath result (IndexError on [0]) is the expected failure, so
        # catch just that and let real errors propagate.
        try:
            area = tr.xpath(self.detail_xpath['area'])[0].strip()
        except IndexError:
            area = None
        yield Proxy(ip, port, area=area)