Example #1
0
 def parse(self, response):
     sel = Selector(response)
     #获取当前列表页中所有在售新标的url跳转链接(a节点)
     product_urls  = sel.xpath('//div[@class="op"]/a').extract()
     for product_url in product_urls:
         soup = BeautifulSoup(product_url, 'lxml')
         if soup.a(text=True)[0].encode("utf-8") == "立即投资":
             print soup.a['href']
             newUrl = 'http://www.anxinjr.com{}'.format(soup.a['href'])
             print "newUrl = ", newUrl
             yield scrapy.Request(newUrl, callback = self.parse_axjr,
                                  meta={'producturl': newUrl},encoding = 'utf-8')
Example #2
0
File: haijinsuo.py Project: ifzz/py
 def parse(self, response):
     sel = Selector(response)
     #获取当前列表页中所有在售新标的url跳转链接
     product_urls  = sel.xpath('//div[@class="invest_a"]/a').extract()
     for product_url in product_urls:
         print product_url
         soup = BeautifulSoup(product_url, 'lxml')
         #print soup.a(text=True)[0].encode("utf-8")
         if soup.a(text=True)[0].encode("utf-8") == "立即投资":
             #print soup.a['href']
             newUrl = 'http://www.haifax.cn' + soup.a['href']
             #print "newUrl = ", newUrl
             yield scrapy.Request(newUrl, callback = self.parse_id_fy, encoding = 'utf-8')
Example #3
0
def test7():
    """Demonstrate BeautifulSoup traversal on a local HTML template.

    Shows that ``bs("a")`` is shorthand for ``bs.find_all("a")`` and that
    ``bs.a(text=...)`` is shorthand for ``bs.a.find_all(text=...)``, then
    walks a nested tag path and the direct children of <body>.
    """
    # FIX: the original opened the file without ever closing it (resource
    # leak) and shadowed the name `file`; `with` guarantees the handle is
    # closed. Read as bytes so BeautifulSoup can sniff the encoding itself.
    with open('templates/richinfo.html', 'rb') as fp:
        html = fp.read()
    bs = BeautifulSoup(html, "html.parser")
    # print(bs.prettify())  # pretty-printed, indented form

    # bs("a") == bs.find_all("a"): every <a> tag in the document.
    t_list = bs("a")
    for item in t_list:
        print(item)

    # bs.a(text="新闻") == bs.a.find_all(text="新闻"): text nodes equal to
    # "新闻" inside the first <a> tag.
    t_list = bs.a(text="新闻")
    for item in t_list:
        print(item)

    print(bs.html.head.title)
    print(bs.body.div.div.div.div)
    # .children iterates only the direct children of <body>.
    t_list = bs.body.children
    for item in t_list:
        print(item)