def parse(self, response):
    """Scrape title/link pairs from every <h4> on the listing page and
    follow each link to its detail page.

    For each <h4> tag, build a WechatprojectItem carrying the heading text
    and the anchor's href, then yield a Request for the linked page with
    the partially-filled item passed along in meta so self.parse2 can
    merge listing-page and detail-page data into one item.
    """
    # print response.body
    # Name the parser explicitly: bare BeautifulSoup(markup) picks a
    # platform-dependent parser and warns under bs4.
    soup = BeautifulSoup(response.body, "html.parser")
    for tag in soup.findAll("h4"):
        anchor = tag.find("a")
        if anchor is None:
            # An <h4> with no <a> has nothing to follow; skip it rather
            # than crash on None.get("href").
            continue
        item = WechatprojectItem()
        item["title"] = tag.text            # title = Field() in items.py
        item["link"] = anchor.get("href")   # link = Field() in items.py
        # yield item                                             # listing page only
        # yield Request(url=item["link"], callback=self.parse2)  # detail page only
        # Listing + detail combined: carry the item through meta.
        yield Request(url=item["link"], meta={"item": item},
                      callback=self.parse2)
def parse(self, response):
    """Extract title/link pairs from the listing page's h4 anchors and
    follow each link, handing the item to self.parse2 via meta.
    """
    # print response.body
    selector = Selector(response)
    anchors = selector.xpath('//div[@class="txt-box"]/h4/a')
    for anchor in anchors:
        item = WechatprojectItem()
        item["title"] = anchor.xpath("text()").extract()  # title = Field() in items.py
        item["link"] = anchor.xpath("@href").extract()    # link = Field() in items.py
        # yield item                                           # listing page only
        detail_url = item["link"][0]
        # yield Request(url=detail_url, callback=self.parse2)  # detail page only
        # Listing + detail combined: carry the item through meta.
        yield Request(url=detail_url, meta={"item": item}, callback=self.parse2)
def parse(self, response): print response.body sel = Selector(response) site = sel.xpath('//div[@id="sogou_vr_11002301_box_0"]') #soup=BeautifulSoup(response.body) #tag=soup.find("div",attrs={"id":"sogou_vr_11002301_box_0"}) #for site in sites: item = WechatprojectItem() #username = site.xpath('div[@class="txt-box"]/h3/em/text()').extract() # 其中在item.py中定义了title = Field() #item["username"]="".join(username) link = site.xpath("@href").extract() # 其中在item.py中定义了link = Field() item["link"] = "".join(link) # 其中在item.py中定义了link = Field() next_url = item["link"] yield Request(url=next_url, meta={"item":item}, callback=self.parse2) ## 抓取当前页数和二级页面数据
def parse(self, response): print "url: ", response.url print "parse: ", response.body sel = Selector(response) next_url = sel.xpath( '//*[@id="sogou_vr_11002301_box_0"]/div/div[2]/p[1]/a/@href' ).extract()[0] print "next_url: ", next_url item = WechatprojectItem() #username = site.xpath('div[@class="txt-box"]/h3/em/text()').extract() # 其中在item.py中定义了title = Field() #item["username"]="".join(username) # link = site.xpath("@href").extract() # 其中在item.py中定义了link = Field() item["link"] = next_url # 其中在item.py中定义了link = Field() # next_url = item["link"] print "next_url: ", next_url yield Request(url=next_url, meta={"item": item}, callback=self.parse2) ## 抓取当前页数和二级页面数据
def parse(self, response):
    """Walk every result anchor on the page, log title and link, and
    follow each link to self.parse2 with the item carried in meta.
    """
    print('shuai: start parse00 is: ' + response.url)
    # print response.body
    selector = Selector(response)
    for anchor in selector.xpath('//div[@class="txt-box"]/h3/a'):
        item = WechatprojectItem()
        # descendant::text() gathers text from nested tags (e.g. <em>) too.
        title_parts = anchor.xpath("descendant::text()").extract()
        item['title'] = "".join(title_parts)              # title = Field() in items.py
        item["link"] = anchor.xpath("@href").extract()[0]  # link = Field() in items.py
        print('start open wechat title is: ' + item['title'])
        print('start open wechat link is: ' + item['link'])
        # dont_filter: the same link may recur across pages; fetch it anyway.
        yield Request(url=item["link"], meta={"item": item},
                      callback=self.parse2, dont_filter=True)