def get_proxies(self):
    """Scrape proxies from the data5u site (无忧代理).

    Returns:
        list[Proxy]: one Proxy per row whose type is 'http' or 'https';
        rows that fail to parse are logged at DEBUG level and skipped.
    """
    # Load logging configuration
    get_log_config()
    proxy_model_list = []
    print('正在爬取无忧代理……')
    response = super(Data5uSpider, self).get_proxies()
    selector = etree.HTML(response.text)
    infos = selector.xpath('//ul[@class="l2"]')
    for info in infos:
        try:
            # Query each field relative to the current row instead of
            # re-running a document-wide XPath and indexing by row
            # position [i]: same data, O(rows) instead of O(rows^2),
            # no cross-row mis-alignment when a row lacks a field, and
            # consistent with the sibling spiders in this project.
            ip = info.xpath('./span[1]/li/text()')[0]           # ip
            port = info.xpath('./span[2]/li/text()')[0]         # port
            anonymity = info.xpath('./span[3]/li/a/text()')[0]  # anonymity
            http_type = info.xpath('./span[4]/li/a/text()')[0]  # type
            # Area = province + city (two <a> elements in span[6]);
            # a row missing the city link raises IndexError and is
            # skipped by the except below, as before.
            area = info.xpath('./span[6]/li/a[1]/text()')[0]
            area = area + info.xpath('./span[6]/li/a[2]/text()')[0]
            speed = info.xpath('./span[8]/li/text()')[0]        # speed
            print(ip + " | " + port + " | " + anonymity + " | " +
                  http_type + " | " + area + " | " + speed + " | ")
            if http_type == 'http' or http_type == 'https':
                proxy = Proxy()
                proxy.set_ip(ip)
                proxy.set_port(port)
                proxy.set_http_type(http_type)
                proxy.set_anonymity(anonymity)
                proxy.set_area(area)
                proxy.set_speed(speed)
                proxy.set_agent(self.agent)
                proxy.set_survival_time("")
                proxy_model_list.append(proxy)
            else:
                pass
        except Exception as e:
            logging.debug(e)
    logging.debug("抓取 " + self.agent + " 网站共计 " +
                  str(len(proxy_model_list)) + " 个代理")
    return proxy_model_list
def get_proxies(self):
    """Scrape proxies from kuaidaili (快代理).

    Returns:
        list[Proxy]: one Proxy per table row whose type is 'HTTP' or
        'HTTPS' (stored lower-cased); unparsable rows are logged at
        DEBUG level and skipped.
    """
    # Load logging configuration
    get_log_config()
    proxy_model_list = []
    print('正在爬取快代理……')
    response = super(KuaidailiSpider, self).get_proxies()
    # Raw strings: the original used plain literals, so "\s" was an
    # invalid escape sequence (a SyntaxWarning on modern Python).
    # The compiled pattern is character-for-character identical.
    pattern = re.compile(
        r'<tr>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?'
        r'<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?'
        r'<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?'
        r'<td.*?>(.*?)</td>\s.*?</tr>',
        re.S)
    infos = re.findall(pattern, response.text)
    for item in infos:
        try:
            ip = item[0]         # ip
            port = item[1]       # port
            anonymity = item[2]  # anonymity level
            http_type = item[3]  # type (HTTP / HTTPS)
            area = item[4]       # area
            speed = item[5]      # speed
            print(ip + " | " + port + " | " + anonymity + " | " +
                  http_type + " | " + area + " | " + speed)
            if http_type == 'HTTP' or http_type == 'HTTPS':
                proxy = Proxy()
                proxy.set_ip(ip)
                proxy.set_port(port)
                proxy.set_http_type(http_type.lower())
                proxy.set_anonymity(anonymity)
                proxy.set_area(area)
                proxy.set_speed(speed)
                proxy.set_agent(self.agent)
                proxy.set_survival_time("")
                proxy_model_list.append(proxy)
        except Exception as e:
            logging.debug(e)
    logging.debug("抓取 " + self.agent + " 网站共计 " +
                  str(len(proxy_model_list)) + " 个代理")
    return proxy_model_list
def get_proxies(self):
    """Scrape proxies from the xici site (西刺代理).

    Returns:
        list[Proxy]: one Proxy per '//tr[@class="odd"]' row; rows that
        fail to parse are logged at DEBUG level and skipped.
    """
    # Load logging configuration
    get_log_config()
    proxy_model_list = []
    print('正在爬取西刺代理……')
    response = super(XiciSpider, self).get_proxies()
    selector = etree.HTML(response.text)
    infos = selector.xpath('//tr[@class="odd"]')
    for i, info in enumerate(infos):
        try:
            ip = info.xpath('./td[2]/text()')[0]                # ip
            port = info.xpath('./td[3]/text()')[0]              # port
            anonymity = info.xpath('./td[5]/text()')[0]         # anonymity
            http_type = info.xpath('./td[6]/text()')[0]         # type
            # Bug fix: the original did `xpath(...)[0]` and then checked
            # `area is None` — that branch was dead, since a missing <a>
            # raised IndexError first and silently dropped the whole row.
            # Default to '' so rows without an area are still collected.
            area_nodes = info.xpath('./td[4]/a/text()')          # area
            area = area_nodes[0] if area_nodes else ''
            speed = info.xpath('./td[7]/div/@title')[0]          # speed
            survival_time = info.xpath('./td[9]/text()')[0]      # lifetime
            print(ip + " | " + port + " | " + anonymity + " | " +
                  http_type + " | " + area + " | " + speed + " | " +
                  survival_time)
            proxy = Proxy()
            proxy.set_ip(ip)
            proxy.set_port(port)
            proxy.set_http_type(http_type)
            proxy.set_anonymity(anonymity)
            proxy.set_area(area)
            proxy.set_speed(speed)
            proxy.set_agent(self.agent)
            proxy.set_survival_time(survival_time)
            proxy_model_list.append(proxy)
        except Exception as e:
            logging.debug(e)
    logging.debug("抓取 " + self.agent + " 网站共计 " +
                  str(len(proxy_model_list)) + " 个代理")
    return proxy_model_list
def __init__(self):
    """Open a MySQL connection to the local 'doctor' database and
    create a cursor for subsequent CRUD statements."""
    # Load logging configuration
    get_log_config()
    # NOTE(review): credentials are hardcoded in source (the password is
    # in plain text and the user looks like a redaction placeholder) —
    # move them to configuration / environment variables.
    self.connect = pymysql.connect(
        host='127.0.0.1',   # database host
        port=3306,          # database port
        db='doctor',        # database name
        user='******',      # database user
        passwd='liu998wei', # database password
        charset='utf8',     # charset — avoids mojibake with Chinese text
        use_unicode=True)
    # Cursor used to execute insert/delete/select/update statements
    self.cursor = self.connect.cursor()
def get_proxies(self):
    """Scrape proxies from the xici site (西刺代理).

    Returns:
        list[Proxy]: one Proxy per '//tr[@class="odd"]' row; rows that
        fail to parse are logged at DEBUG level and skipped.
    """
    get_log_config()
    proxy_model_list = []
    print('正在爬取西刺代理......')
    response = super(XiciSpider, self).get_proxies()
    selector = etree.HTML(response.text)
    infos = selector.xpath('//tr[@class="odd"]')
    for i, info in enumerate(infos):
        try:
            ip = info.xpath('./td[2]/text()')[0]
            port = info.xpath('./td[3]/text()')[0]
            anonymity = info.xpath('./td[5]/text()')[0]
            http_type = info.xpath('./td[6]/text()')[0]
            # Bug fix: the original did `xpath(...)[0]` then checked
            # `area is None` — that branch was dead code, because a
            # missing <a> raised IndexError first and dropped the row.
            # Default to "" so area-less rows are still collected.
            area_nodes = info.xpath('./td[4]/a/text()')
            area = area_nodes[0] if area_nodes else ""
            speed = info.xpath('./td[7]/div/@title')[0]
            survival_time = info.xpath('./td[9]/text()')[0]
            print(ip + " | " + port + " | " + anonymity + " | " +
                  http_type + " | " + area + " | " + speed + " | " +
                  survival_time)
            proxy = Proxy()
            proxy.set_ip(ip)
            proxy.set_port(port)
            proxy.set_http_type(http_type)
            proxy.set_anonymity(anonymity)
            proxy.set_area(area)
            proxy.set_speed(speed)
            proxy.set_agent(self.agent)
            proxy.set_survival_time(survival_time)
            proxy_model_list.append(proxy)
            # Per-row progress output, kept from the original
            print(len(proxy_model_list))
        except Exception as e:
            logging.debug(e)
    logging.debug(f"抓取 {self.agent} 网站共计 {len(proxy_model_list)} 个代理")
    return proxy_model_list
def get_proxies(self):
    """Scrape proxies from ip181.

    Returns a list of Proxy objects built from the site's table rows;
    rows that fail to parse are logged at DEBUG level and skipped.
    """
    # Load logging configuration
    get_log_config()
    proxy_model_list = []
    print('正在爬取ip181……')
    response = super(Ip181Spider, self).get_proxies()
    # This site is encoded as gb2312
    response.encoding = 'gb2312'
    selector = etree.HTML(response.text)
    infos = selector.xpath('//div[@class="col-md-12"]/table/tbody/tr')
    for i, info in enumerate(infos):
        try:
            ip = info.xpath('./td[1]/text()')[0]         # ip
            port = info.xpath('./td[2]/text()')[0]       # port
            anonymity = info.xpath('./td[3]/text()')[0]  # anonymity
            http_type = info.xpath('./td[4]/text()')[0]  # type
            speed = info.xpath('./td[5]/text()')[0]      # speed
            area = info.xpath('./td[6]/text()')[0]       # area
            # print(ip + " | " + port + " | " + anonymity + " | " + http_type + " | " + speed + " | " + area)
            # NOTE(review): intent is to skip the table's header row, but
            # enumerate() starts at 0, so `i == 1` skips the SECOND row —
            # looks like an off-by-one; verify against the live page
            # before changing.
            if i == 1:  # skip the header row
                pass
            else:
                proxy = Proxy()
                proxy.set_ip(ip)
                proxy.set_port(port)
                # Combined 'HTTP,HTTPS' entries are stored as plain 'http'
                if http_type == 'HTTP,HTTPS':
                    proxy.set_http_type('http')
                else:
                    proxy.set_http_type(http_type.lower())
                proxy.set_anonymity(anonymity)
                proxy.set_area(area)
                proxy.set_speed(speed)
                proxy.set_agent(self.agent)
                proxy.set_survival_time("")
                proxy_model_list.append(proxy)
        except Exception as e:
            logging.debug(e)
    logging.debug("抓取 " + self.agent + " 网站共计 " + str(len(proxy_model_list)) + " 个代理")
    return proxy_model_list
def __init__(self):
    """Open the MySQL connection (settings from `config`, port 3307)
    and create the cursor used for proxy-table statements."""
    get_log_config()
    self.__proxy_table = 'proxy'
    self.conn = pymysql.connect(host=config.MYSQL_HOST,
                                port=3307,
                                db=config.MYSQL_DBNAME,
                                user=config.MYSQL_USER,
                                passwd=config.MYSQL_PASSWORD,
                                charset='utf8',
                                use_unicode=False)
    # Bug fix: the original created the cursor inside `with self.conn:`.
    # pymysql's connection context manager ends the transaction scope on
    # exit (and closes the connection on pymysql >= 1.0), so self.cursor
    # was left attached to an already-finished connection. The cursor
    # must simply be created on the live connection.
    self.cursor = self.conn.cursor()
def get_proxies(self):
    """Scrape proxies from kuaidaili (快代理).

    Returns:
        list[Proxy]: one Proxy per table row whose type is 'HTTP' or
        'HTTPS' (stored lower-cased); unparsable rows are logged at
        DEBUG level and skipped.
    """
    get_log_config()
    proxy_model_list = []
    print('正在爬取快代理......')
    response = super(KuaidailiSpider, self).get_proxies()
    # Raw strings: the original used plain literals, so "\s" was an
    # invalid escape sequence (a SyntaxWarning on modern Python).
    # The compiled pattern is character-for-character identical.
    pattern = re.compile(
        r'<tr>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?'
        r'<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?'
        r'<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?'
        r'<td.*?>(.*?)</td>\s.*?</tr>',
        re.S
    )
    infos = re.findall(pattern, response.text)
    for item in infos:
        try:
            ip = item[0]         # ip
            port = item[1]       # port
            anonymity = item[2]  # anonymity level
            http_type = item[3]  # type (HTTP / HTTPS)
            area = item[4]       # area
            speed = item[5]      # speed
            print(ip + " | " + port + " | " + anonymity + " | " +
                  http_type + " | " + area + " | " + speed)
            if http_type == 'HTTP' or http_type == 'HTTPS':
                proxy = Proxy()
                proxy.set_ip(ip)
                proxy.set_port(port)
                proxy.set_http_type(http_type.lower())
                proxy.set_anonymity(anonymity)
                proxy.set_area(area)
                proxy.set_speed(speed)
                proxy.set_agent(self.agent)
                proxy.set_survival_time("")
                proxy_model_list.append(proxy)
        except Exception as e:
            logging.debug(e)
    logging.debug(f"抓取 {self.agent} 网站共计 {len(proxy_model_list)} 个代理")
    return proxy_model_list
def __init__(self):
    """Open the MySQL connection (settings from `config`, default port)
    and create the cursor used for proxy-table statements."""
    # Load logging configuration
    get_log_config()
    self.__proxy_table = 'proxy'
    self.conn = pymysql.connect(
        host=config.MYSQL_HOST,
        db=config.MYSQL_DBNAME,
        user=config.MYSQL_USER,
        passwd=config.MYSQL_PASSWORD,
        charset='utf8',  # charset must be set, or Chinese text may garble
        use_unicode=False)
    # Bug fix: the original created the cursor inside `with self.conn:`.
    # pymysql's connection context manager ends the transaction scope on
    # exit (and closes the connection on pymysql >= 1.0), so self.cursor
    # was left attached to an already-finished connection. The cursor
    # must simply be created on the live connection.
    self.cursor = self.conn.cursor()
停止爬虫和代理池 """

    def stop(self):
        # Stop the spider loop…
        self.isRunning = False
        # …and release resources: shut down the proxy-pool worker too
        get_proxy_pool_worker().stop_work()

    """ 启动爬虫和代理池 """

    def start(self):
        # Start the proxy pool first, then the spiders
        self.start_proxy_pool()
        self.start_spider()


if __name__ == '__main__':
    # Load logging configuration
    get_log_config()
    manager = SpiderManager()
    """ Demospider() 是 Scrapy 项目 spider 目录下的爬虫脚本名字 这里需要更换成 你项目的 爬虫名 """
    # NOTE(review): spider_list is built but never passed to the manager —
    # as written it is dead code; confirm whether SpiderManager is meant
    # to receive it (e.g. via its constructor or start()).
    spider_list = [
        PengpaiSpider(),
    ]
    manager.start()