from urllib.parse import parse_qs, urlparse

from scrapy.selector import Selector


def parse(self, response):
    """Spider callback: render the item page with Selenium, then scrape it."""
    self.driver.get(response.url)
    # self.driver.implicitly_wait(30)
    selector = Selector(text=self.driver.page_source)
    for sel in selector.xpath('//*[@id="J_Counter"]'):
        item = TbItem()
        # item['item_name'] = sel.xpath('//*[@id="J_Title"]/h3/text()').extract_first()
        # The replace() calls swap ASCII commas and newlines for full-width
        # equivalents so each value stays in one CSV field; extract_first()
        # gets a default so a missing node cannot raise AttributeError.
        item['item_name'] = sel.xpath(
            '//*[@id="J_Title"]/h3/@data-title').extract_first(
                default='').replace(",", ",").replace("\n", "。")
        # The item id comes from the ?id= query parameter of the page URL
        item['item_id'] = parse_qs(
            urlparse(response.url).query,
            keep_blank_values=True)['id'][0].replace(",", ",").replace("\n", "。")
        item['comments'] = sel.xpath(
            '//*[@id="J_RateCounter"]/text()').extract_first(
                default='').replace(",", ",").replace("\n", "。")
        item['trade'] = sel.xpath(
            '//*[@id="J_SellCounter"]/text()').extract_first(
                default='').replace(",", ",").replace("\n", "。")
        item['price'] = sel.xpath(
            '//*[@id="J_StrPrice"]/em[2]/text()').extract_first(
                default='').replace(",", ",").replace("\n", "。")
        yield item
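
# A minimal sketch of the scaffolding parse() assumes: the TbItem fields are
# taken from the assignments above, while the spider name, start URL, and
# driver setup are illustrative assumptions, not the original project's code.
import scrapy
from selenium import webdriver


class TbItem(scrapy.Item):
    item_name = scrapy.Field()
    item_id = scrapy.Field()
    comments = scrapy.Field()
    trade = scrapy.Field()
    price = scrapy.Field()


class TbSpider(scrapy.Spider):
    name = 'tb_item'  # assumed spider name
    start_urls = ['https://item.taobao.com/item.htm?id=1']  # placeholder id

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Selenium renders the JavaScript that fills in the J_Counter block
        self.driver = webdriver.Chrome()

    def closed(self, reason):
        self.driver.quit()  # release the browser when the spider stops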
import pymysql
import requests
from scrapy.selector import Selector
from tqdm import tqdm


def crawl_ips():
    """Crawl the xicidaili free-proxy list and store ip/port/speed/type."""
    headers = {
        "user-agent": "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:15.0) "
                      "Gecko/20100101 Firefox/15.0.1"
    }
    for i in range(1, 2):  # pages to crawl; only page 1 of xicidaili here
        response = requests.get("http://www.xicidaili.com/nn/{0}".format(i),
                                headers=headers)
        selector = Selector(text=response.text)
        all_trs = selector.xpath('//table[@id="ip_list"]//tr[position()>1]')
        ip_list = []
        for tr in all_trs:
            # Strip the trailing '秒' (seconds) unit from the speed tooltip
            speed = tr.xpath(
                "./td[@class='country'][3]//@title").extract()[0].split('秒')[0]
            ip = tr.xpath("./td[2]/text()").extract()[0]
            port = tr.xpath("./td[3]/text()").extract()[0]
            # Renamed from `type` to avoid shadowing the builtin
            proxy_type = tr.xpath("./td[6]/text()").extract()[0]
            ip_list.append((ip, port, speed, proxy_type))
            # Skip per-IP validity checks here; validate in bulk after
            # storing the data (better for throughput).
            # if judge2_ip(ip, port):
            #     ip_list.append((ip, port, speed, proxy_type))
            # else:
            #     pass

        # Option 1: write ip:port pairs to a txt file, opened in write ('w') mode
        with open('ips.txt', 'w', encoding='utf-8') as f:
            # tqdm wraps the list to show a progress bar
            for ip_info in tqdm(ip_list, desc='writing txt file', leave=True):
                f.write(ip_info[0] + ':' + ip_info[1] + '\n')

        # Option 2: store ip/port in the proxy_ip table of the proxy_pool database
        conn = pymysql.connect(host='localhost', port=3306, user='******',
                               passwd='123wangchao', charset='utf8',
                               db='proxy_pool')
        cursor = conn.cursor()
        # Clear the existing table first
        cursor.execute('truncate table proxy_ip')
        conn.commit()
        for ip_info in tqdm(ip_list, desc='writing to MySQL', leave=True):
            # Parameterized insert; the original string formatting invited
            # quoting bugs
            insert_sql = ('insert into proxy_ip(ip, port, speed, proxy_type) '
                          'values (%s, %s, %s, %s)')
            cursor.execute(insert_sql, ip_info)
            conn.commit()
    print('********************** data collection complete **********************')
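
# A minimal sketch of the deferred judge2_ip() check referenced in the
# comments above; the probe URL and timeout are assumptions.
import requests


def judge2_ip(ip, port, timeout=5):
    """Return True if the proxy answers a plain HTTP request in time."""
    proxy = 'http://{0}:{1}'.format(ip, port)
    try:
        resp = requests.get('http://httpbin.org/ip',
                            proxies={'http': proxy, 'https': proxy},
                            timeout=timeout)
        return resp.status_code == 200
    except requests.RequestException:
        return False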
def _parse_description(self):
    """
    Parse the full description from the bug detail page

    :returns: bug description string
    """
    # NOTE: Using the combination of text/type makes for better testing
    # (response.text is the decoded body; Selector's text= expects str,
    # not the raw bytes of response.body)
    selector = scrapy.selector.Selector(
        text=self.response.text, type='html')

    # Gather every text node under the issue-description <pre>; selecting
    # text() nodes keeps the words but drops the HTML tags themselves
    xpath = ('//div[contains(@class, "issuedescription")]/'
             'pre/descendant-or-self::*/text()')
    desc = ''.join(selector.xpath(xpath).extract())

    # Trim the leading/trailing newlines left over from the markup
    return desc.strip('\n')
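
# A quick usage sketch: _parse_description() reads self.response, so a test
# can hang a synthetic HtmlResponse off a bare holder object. The sample
# HTML, URL, and holder are illustrative assumptions.
import types

from scrapy.http import HtmlResponse

holder = types.SimpleNamespace(response=HtmlResponse(
    url='http://example.com/bug/1',
    body='<div class="issuedescription"><pre>\nIt <b>crashes</b>\n</pre></div>',
    encoding='utf-8'))
print(_parse_description(holder))  # -> 'It crashes'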
def parse_animal_search_criteria(self, response):
    selector = scrapy.selector.Selector(response=response)
    criteria_xpath = ('//select[@id="cphSearchArea_ctrlAnimal_'
                      'ctrlAnimalSearch_ddlCriteria"]/option')
    options = selector.xpath(criteria_xpath)
    # Map each dropdown option's label to its integer value attribute
    return {option.xpath('text()').extract_first():
            int(option.xpath('@value').extract_first())
            for option in options}
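
# Hypothetical usage with a synthetic response; the <select> contents are
# illustrative assumptions about the search form.
from scrapy.http import HtmlResponse

body = ('<select id="cphSearchArea_ctrlAnimal_ctrlAnimalSearch_ddlCriteria">'
        '<option value="1">Species</option>'
        '<option value="2">Breed</option>'
        '</select>')
response = HtmlResponse(url='http://example.com/search',
                        body=body, encoding='utf-8')
print(parse_animal_search_criteria(None, response))
# -> {'Species': 1, 'Breed': 2}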