def get_proxies(self):
    """Scrape the kuaidaili (快代理) listing page and return a list of Proxy models.

    Returns:
        list: Proxy objects for every HTTP/HTTPS row successfully parsed.
    """
    # Load logging configuration first.
    get_log_config()
    proxy_model_list = []
    print('正在爬取快代理......')
    response = super(KuaidailiSpider, self).get_proxies()
    # FIX: raw strings so \s is a regex escape, not a deprecated/invalid
    # Python string escape (SyntaxWarning on modern interpreters).
    pattern = re.compile(
        r'<tr>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>('
        r'.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?</tr>',
        re.S
    )
    infos = re.findall(pattern, response.text)
    for item in infos:
        try:
            ip = item[0]
            port = item[1]
            anonymity = item[2]     # anonymity level
            http_type = item[3]     # 'HTTP' or 'HTTPS' (site uses uppercase)
            area = item[4]
            speed = item[5]
            print(ip + " | " + port + " | " + anonymity + " | " + http_type
                  + " | " + area + " | " + speed)
            # Only keep plain HTTP/HTTPS proxies.
            if http_type == 'HTTP' or http_type == 'HTTPS':
                proxy = Proxy()
                proxy.set_ip(ip)
                proxy.set_port(port)
                proxy.set_http_type(http_type.lower())
                proxy.set_anonymity(anonymity)
                proxy.set_area(area)
                proxy.set_speed(speed)
                proxy.set_agent(self.agent)
                proxy.set_survival_time("")
                proxy_model_list.append(proxy)
        except Exception as e:
            # Best-effort scraping: a malformed row is logged and skipped.
            logging.debug(e)
    logging.debug(f"抓取 {self.agent} 网站共计 {len(proxy_model_list)} 个代理")
    return proxy_model_list
def get_proxies(self):
    """Scrape the Data5u (无忧代理) listing page into a list of Proxy models."""
    # Load logging configuration.
    get_log_config()
    proxy_model_list = []
    print('正在爬取无忧代理……')
    response = super(Data5uSpider, self).get_proxies()
    selector = etree.HTML(response.text)
    rows = selector.xpath('//ul[@class="l2"]')
    for i, row in enumerate(rows):
        # NOTE(review): these XPaths are absolute (start with //), so every
        # lookup re-scans the whole document and takes the i-th match rather
        # than querying inside `row` — kept as-is to preserve behavior.
        try:
            ip = row.xpath('//ul[@class="l2"]/span[1]/li/text()')[i]
            port = row.xpath('//ul[@class="l2"]/span[2]/li/text()')[i]
            anonymity = row.xpath('//ul[@class="l2"]/span[3]/li/a/text()')[i]
            http_type = row.xpath('//ul[@class="l2"]/span[4]/li/a/text()')[i]
            # Area = province + city concatenated.
            area = row.xpath('//ul[@class="l2"]/span[6]/li/a[1]/text()')[i]
            area = area + row.xpath('//ul[@class="l2"]/span[6]/li/a[2]/text()')[i]
            speed = row.xpath('//ul[@class="l2"]/span[8]/li/text()')[i]
            print(ip + " | " + port + " | " + anonymity + " | " + http_type
                  + " | " + area + " | " + speed + " | ")
            # This site reports the type in lowercase.
            if http_type == 'http' or http_type == 'https':
                proxy = Proxy()
                proxy.set_ip(ip)
                proxy.set_port(port)
                proxy.set_http_type(http_type)
                proxy.set_anonymity(anonymity)
                proxy.set_area(area)
                proxy.set_speed(speed)
                proxy.set_agent(self.agent)
                proxy.set_survival_time("")
                proxy_model_list.append(proxy)
        except Exception as e:
            # A malformed row is logged and skipped.
            logging.debug(e)
    logging.debug("抓取 " + self.agent + " 网站共计 " + str(len(proxy_model_list)) + " 个代理")
    return proxy_model_list
def get_proxies(self):
    """Scrape the kuaidaili (快代理) listing page and return a list of Proxy models.

    Returns:
        list: Proxy objects for every HTTP/HTTPS row successfully parsed.
    """
    # Load logging configuration first.
    get_log_config()
    proxy_model_list = []
    print('正在爬取快代理……')
    response = super(KuaidailiSpider, self).get_proxies()
    # FIX: raw strings so \s is a regex escape, not a deprecated/invalid
    # Python string escape (SyntaxWarning on modern interpreters).
    pattern = re.compile(
        r'<tr>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>('
        r'.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?</tr>',
        re.S)
    infos = re.findall(pattern, response.text)
    for item in infos:
        try:
            ip = item[0]            # ip
            port = item[1]          # port
            anonymity = item[2]     # anonymity level
            http_type = item[3]     # 'HTTP' or 'HTTPS' (site uses uppercase)
            area = item[4]          # region
            speed = item[5]         # speed
            print(ip + " | " + port + " | " + anonymity + " | " + http_type
                  + " | " + area + " | " + speed)
            # Only keep plain HTTP/HTTPS proxies; type is stored lowercased.
            if http_type == 'HTTP' or http_type == 'HTTPS':
                proxy = Proxy()
                proxy.set_ip(ip)
                proxy.set_port(port)
                proxy.set_http_type(http_type.lower())
                proxy.set_anonymity(anonymity)
                proxy.set_area(area)
                proxy.set_speed(speed)
                proxy.set_agent(self.agent)
                proxy.set_survival_time("")
                proxy_model_list.append(proxy)
        except Exception as e:
            # Best-effort scraping: a malformed row is logged and skipped.
            logging.debug(e)
    logging.debug("抓取 " + self.agent + " 网站共计 " + str(len(proxy_model_list)) + " 个代理")
    return proxy_model_list
def case_10():
    """Smoke test: insert one proxy, select a random one, bump its fail counter."""
    dao = ProxyDBManager()
    dao.create_proxy_table()
    proxy = Proxy()
    ip = '125.115.141.6'
    port = 8118
    http_type = 'HTTPS'
    anonymity = '高匿'
    area = '浙江宁波'
    speed = '0.148秒'
    agent = 'agent'
    survival_time = '4小时'
    proxy.set_ip(ip)
    proxy.set_port(port)
    # FIX: every other call site in this file uses set_http_type();
    # set_type() looked like a typo — confirm Proxy has no such method.
    proxy.set_http_type(http_type)
    proxy.set_anonymity(anonymity)
    # Handle a missing area.
    if area is None:
        proxy.set_area('')
    else:
        proxy.set_area(area)
    proxy.set_speed(speed)
    proxy.set_agent(agent)
    proxy.set_survival_time(survival_time)
    dao.insert_proxy_table(proxy)
    proxy_address = dao.select_random_proxy()
    print(proxy_address)
    # Strip the scheme prefix to recover "ip:port".
    if 'http://' in proxy_address:
        proxy_address = proxy_address.replace('http://', '')
    else:
        proxy_address = proxy_address.replace('https://', '')
    old_ip = proxy_address.split(':')[0]
    print('old IP : ', old_ip)
    # NOTE(review): "faild" is a typo in the DAO API itself; kept to match it.
    dao.plus_proxy_faild_time(old_ip)
def get_proxies(self):
    """Scrape the xici (西刺) proxy table and return a list of Proxy models."""
    get_log_config()  # load logging configuration
    collected = []
    print('正在爬取西刺代理……')
    response = super(XiciSpider, self).get_proxies()
    page = etree.HTML(response.text)
    for row in page.xpath('//tr[@class="odd"]'):
        try:
            ip = row.xpath('./td[2]/text()')[0]
            port = row.xpath('./td[3]/text()')[0]
            anonymity = row.xpath('./td[5]/text()')[0]
            http_type = row.xpath('./td[6]/text()')[0]
            area = row.xpath('./td[4]/a/text()')[0]
            speed = row.xpath('./td[7]/div/@title')[0]
            survival_time = row.xpath('./td[9]/text()')[0]
            print(" | ".join((ip, port, anonymity, http_type, area, speed,
                              survival_time)))
            proxy = Proxy()
            proxy.set_ip(ip)
            proxy.set_port(port)
            proxy.set_http_type(http_type)
            proxy.set_anonymity(anonymity)
            # Guard against a missing area value.
            proxy.set_area('' if area is None else area)
            proxy.set_speed(speed)
            proxy.set_agent(self.agent)
            proxy.set_survival_time(survival_time)
            collected.append(proxy)
        except Exception as e:
            # A malformed row is logged and skipped.
            logging.debug(e)
    logging.debug("抓取 " + self.agent + " 网站共计 " + str(len(collected)) + " 个代理")
    return collected
def get_proxies(self):
    """Scrape the xici (西刺) proxy table and return a list of Proxy models."""
    # Load logging configuration.
    get_log_config()
    proxy_model_list = []
    print('正在爬取西刺代理......')
    response = super(XiciSpider, self).get_proxies()
    selector = etree.HTML(response.text)
    infos = selector.xpath('//tr[@class="odd"]')
    for i, info in enumerate(infos):
        try:
            ip = info.xpath('./td[2]/text()')[0]            # ip
            port = info.xpath('./td[3]/text()')[0]          # port
            anonymity = info.xpath('./td[5]/text()')[0]     # anonymity level
            http_type = info.xpath('./td[6]/text()')[0]     # type
            area = info.xpath('./td[4]/a/text()')[0]        # region
            speed = info.xpath('./td[7]/div/@title')[0]     # speed
            survival_time = info.xpath('./td[9]/text()')[0]  # uptime
            print(ip + " | " + port + " | " + anonymity + " | " + http_type
                  + " | " + area + " | " + speed + " | " + survival_time)
            proxy = Proxy()
            proxy.set_ip(ip)
            proxy.set_port(port)
            proxy.set_http_type(http_type)
            proxy.set_anonymity(anonymity)
            # Guard against a missing area value.
            if area is None:
                proxy.set_area("")
            else:
                proxy.set_area(area)
            proxy.set_speed(speed)
            proxy.set_agent(self.agent)
            proxy.set_survival_time(survival_time)
            proxy_model_list.append(proxy)
            # FIX: removed a leftover per-row debug print of the running
            # list length that spammed stdout once per scraped row.
        except Exception as e:
            # A malformed row is logged and skipped.
            logging.debug(e)
    logging.debug(f"抓取 {self.agent} 网站共计 {len(proxy_model_list)} 个代理")
    return proxy_model_list
def get_proxies(self):
    """Scrape the ip181 proxy table and return a list of Proxy models."""
    # Load logging configuration.
    get_log_config()
    proxy_model_list = []
    print('正在爬取ip181……')
    response = super(Ip181Spider, self).get_proxies()
    # This site serves gb2312-encoded pages.
    response.encoding = 'gb2312'
    selector = etree.HTML(response.text)
    infos = selector.xpath('//div[@class="col-md-12"]/table/tbody/tr')
    for i, info in enumerate(infos):
        try:
            ip = info.xpath('./td[1]/text()')[0]         # ip
            port = info.xpath('./td[2]/text()')[0]       # port
            anonymity = info.xpath('./td[3]/text()')[0]  # anonymity level
            http_type = info.xpath('./td[4]/text()')[0]  # type
            speed = info.xpath('./td[5]/text()')[0]      # speed
            area = info.xpath('./td[6]/text()')[0]       # region
            # FIX: enumerate starts at 0, so the header row is index 0.
            # The old check `i == 1` skipped a real data row and let the
            # header through. (If the header uses <th>, the xpath above
            # already IndexErrors it into the except branch — confirm on
            # a live page.)
            if i == 0:
                # Skip the table header row.
                pass
            else:
                proxy = Proxy()
                proxy.set_ip(ip)
                proxy.set_port(port)
                # Normalize the combined type marker to plain http.
                if http_type == 'HTTP,HTTPS':
                    proxy.set_http_type('http')
                else:
                    proxy.set_http_type(http_type.lower())
                proxy.set_anonymity(anonymity)
                proxy.set_area(area)
                proxy.set_speed(speed)
                proxy.set_agent(self.agent)
                proxy.set_survival_time("")
                proxy_model_list.append(proxy)
        except Exception as e:
            # A malformed row is logged and skipped.
            logging.debug(e)
    logging.debug("抓取 " + self.agent + " 网站共计 " + str(len(proxy_model_list)) + " 个代理")
    return proxy_model_list