Ejemplo n.º 1
0
 def parse(self, response):
     """Follow the first link of every ``<h4>`` heading on the page.

     For each ``<h4>`` found in ``response.body`` a ``WechatprojectItem`` is
     filled with the heading text (``title``) and the ``href`` of the
     heading's first ``<a>`` (``link``). The item is attached to a
     ``Request`` for that link under ``meta["item"]`` with ``self.parse2``
     as callback, which is how data from this page is carried along to the
     second-level page.

     Yielding the item directly would scrape only the current page; a
     request without ``meta`` would scrape only the second-level page.

     Headings that contain no ``<a>``, or whose ``<a>`` has no ``href``,
     are skipped so that one malformed heading does not abort the page.
     """
     soup = BeautifulSoup(response.body)
     for heading in soup.findAll("h4"):
         anchor = heading.find("a")  # None when the heading has no link
         link = anchor.get("href") if anchor is not None else None
         if not link:
             # Nothing to follow; a Request cannot be built without a URL.
             continue
         item = WechatprojectItem()
         item["title"] = heading.text  # `title = Field()` is declared in item.py
         item["link"] = link  # `link = Field()` is declared in item.py
         yield Request(url=link, meta={"item": item}, callback=self.parse2)
Ejemplo n.º 2
0
    def parse(self, response):
        """Follow every ``div.txt-box > h4 > a`` link found in the response.

        For each matching anchor a ``WechatprojectItem`` is created whose
        ``title`` is the list of the anchor's direct text nodes and whose
        ``link`` is the list of its ``href`` values (both exactly as returned
        by ``extract()``). A ``Request`` for the first ``href`` is yielded
        with the item under ``meta["item"]`` and ``self.parse2`` as callback,
        so data from this page travels along to the second-level page.

        Yielding the item directly would scrape only the current page; a
        request without ``meta`` would scrape only the second-level page.

        Anchors without an ``href`` are skipped instead of raising
        ``IndexError`` and dropping the remaining anchors.
        """
        anchors = Selector(response).xpath('//div[@class="txt-box"]/h4/a')
        for anchor in anchors:
            hrefs = anchor.xpath("@href").extract()
            if not hrefs:
                # No URL to follow for this anchor.
                continue
            item = WechatprojectItem()
            # `title = Field()` and `link = Field()` are declared in item.py.
            item["title"] = anchor.xpath("text()").extract()
            item["link"] = hrefs
            yield Request(url=hrefs[0], meta={"item": item}, callback=self.parse2)
 def parse(self, response):
     """Follow the ``href`` of the ``sogou_vr_11002301_box_0`` element.

     The raw response body is printed for debugging. The ``@href`` values of
     the element(s) with ``id="sogou_vr_11002301_box_0"`` are concatenated
     into a single string, stored as ``link`` on a ``WechatprojectItem``,
     and a ``Request`` for that URL is yielded with the item under
     ``meta["item"]`` and ``self.parse2`` as callback, so data from this
     page travels along to the second-level page.

     When no ``href`` is found nothing is yielded; a message is printed
     instead of building a request with an empty URL.
     """
     # Single-argument print() parses under both Python 2 and Python 3.
     print(response.body)
     box = Selector(response).xpath('//div[@id="sogou_vr_11002301_box_0"]')
     next_url = "".join(box.xpath("@href").extract())
     if not next_url:
         print("no link found in %s" % response.url)
         return
     item = WechatprojectItem()
     item["link"] = next_url  # `link = Field()` is declared in item.py
     yield Request(url=next_url, meta={"item": item}, callback=self.parse2)
Ejemplo n.º 4
0
 def parse(self, response):
     """Follow the first link inside the ``sogou_vr_11002301_box_0`` element.

     The response URL and body are printed for debugging. The first ``href``
     matched by the XPath below is stored as ``link`` on a
     ``WechatprojectItem`` and a ``Request`` for it is yielded with the item
     under ``meta["item"]`` and ``self.parse2`` as callback, so data from
     this page travels along to the second-level page.

     When the XPath matches nothing, a message is printed and nothing is
     yielded, instead of failing with ``IndexError``.
     """
     # Single-argument print() parses under both Python 2 and Python 3.
     print("url:  %s" % response.url)
     print("parse:  %s" % response.body)
     matches = Selector(response).xpath(
         '//*[@id="sogou_vr_11002301_box_0"]/div/div[2]/p[1]/a/@href'
     ).extract()
     if not matches:
         print("next_url not found in %s" % response.url)
         return
     next_url = matches[0]
     print("next_url:  %s" % next_url)
     item = WechatprojectItem()
     item["link"] = next_url  # `link = Field()` is declared in item.py
     yield Request(url=next_url, meta={"item": item}, callback=self.parse2)
Ejemplo n.º 5
0
 def parse(self, response):
     """Follow every ``div.txt-box > h3 > a`` link found in the response.

     For each matching anchor a ``WechatprojectItem`` is created with
     ``title`` set to the concatenation of all text nodes below the anchor
     and ``link`` set to its first ``href``. Both values are printed for
     debugging, then a ``Request`` for the link is yielded with the item
     under ``meta["item"]`` and ``self.parse2`` as callback, so data from
     this page travels along to the second-level page.

     Anchors without an ``href`` are skipped instead of raising
     ``IndexError`` and dropping the remaining anchors.
     """
     print('shuai: start parse00 is: ' + response.url)
     anchors = Selector(response).xpath('//div[@class="txt-box"]/h3/a')
     for anchor in anchors:
         hrefs = anchor.xpath("@href").extract()
         if not hrefs:
             # No URL to follow for this anchor.
             continue
         item = WechatprojectItem()
         # `title = Field()` and `link = Field()` are declared in item.py.
         item['title'] = "".join(anchor.xpath("descendant::text()").extract())
         item["link"] = hrefs[0]
         print('start open wechat title is: ' + item['title'])
         print('start open wechat link is: ' + item['link'])
         # dont_filter=True asks the scheduler not to drop this request as a
         # duplicate (see the Scrapy Request documentation).
         yield Request(url=item["link"],
                       meta={"item": item},
                       callback=self.parse2,
                       dont_filter=True)