def parse(self, response): sel = Selector(response) #获取当前列表页中所有在售新标的url跳转链接(a节点) product_urls = sel.xpath('//div[@class="op"]/a').extract() for product_url in product_urls: soup = BeautifulSoup(product_url, 'lxml') if soup.a(text=True)[0].encode("utf-8") == "立即投资": print soup.a['href'] newUrl = 'http://www.anxinjr.com{}'.format(soup.a['href']) print "newUrl = ", newUrl yield scrapy.Request(newUrl, callback = self.parse_axjr, meta={'producturl': newUrl},encoding = 'utf-8')
def parse(self, response): sel = Selector(response) #获取当前列表页中所有在售新标的url跳转链接 product_urls = sel.xpath('//div[@class="invest_a"]/a').extract() for product_url in product_urls: print product_url soup = BeautifulSoup(product_url, 'lxml') #print soup.a(text=True)[0].encode("utf-8") if soup.a(text=True)[0].encode("utf-8") == "立即投资": #print soup.a['href'] newUrl = 'http://www.haifax.cn' + soup.a['href'] #print "newUrl = ", newUrl yield scrapy.Request(newUrl, callback = self.parse_id_fy, encoding = 'utf-8')
def test7():
    """Demonstrate equivalent BeautifulSoup traversal and search calls.

    Loads a local HTML page and prints: all <a> tags, the text nodes
    "新闻" under the first <a>, the document title, a nested div, and the
    direct children of <body>.
    """
    # Use a context manager so the handle is always closed; the original
    # leaked the file object and also shadowed the builtin name 'file'.
    with open('templates/richinfo.html', 'rb') as fh:
        html = fh.read()
    bs = BeautifulSoup(html, "html.parser")
    # bs("a") is shorthand for bs.find_all("a") — the two are equivalent.
    t_list = bs("a")
    for item in t_list:
        print(item)
    # bs.a(text="新闻") is shorthand for bs.a.find_all(text="新闻").
    t_list = bs.a(text="新闻")
    for item in t_list:
        print(item)
    print(bs.html.head.title)
    print(bs.body.div.div.div.div)
    # .children yields only the direct children of <body>, not descendants.
    t_list = bs.body.children
    for item in t_list:
        print(item)