def parse_dir_contents(self, response):
    for sel in response.xpath('//ul/li'):
        item = DmozItem()
        item['title'] = sel.xpath('a/text()').extract()
        item['link'] = sel.xpath('a/@href').extract()
        item['desc'] = sel.xpath('text()').extract()
        yield item
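# Every snippet on this page assumes a DmozItem declared in the project's
# items.py. A minimal sketch of that class (an assumption -- the field names
# vary between snippets: 'desc', 'description', 'descr'):
import scrapy

class DmozItem(scrapy.Item):
    title = scrapy.Field()
    link = scrapy.Field()
    desc = scrapy.Field()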
def parse(self, response):
    for sel in response.xpath('//ul[@class="directory-url"]/li'):
        item = DmozItem()
        item['title'] = sel.xpath('a/text()').extract()
        item['link'] = sel.xpath('a/@href').extract()
        item['desc'] = sel.xpath('text()').extract()  # result is a list
        yield item
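# .extract() always returns a list of strings, even for a single match;
# .extract_first() (or .get() on newer Scrapy) returns the first match or
# None. For example, given <a href="/x">Home</a>:
#   sel.xpath('a/text()').extract()        # ['Home']
#   sel.xpath('a/text()').extract_first()  # 'Home'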
def parse(self, response):
    # HtmlXPathSelector and .select() are the legacy Scrapy selector API;
    # on current Scrapy the same loop is written against response.xpath()
    # directly, as in the first snippet above.
    sel = HtmlXPathSelector(response)
    sites = sel.select('//ul/li')
    items = []
    for site in sites:
        item = DmozItem()
        item['title'] = site.select('a/text()').extract()
        item['link'] = site.select('a/@href').extract()
        item['desc'] = site.select('text()').extract()
        items.append(item)
    return items
def parse_list(self, response):
    print(response.body)
    hxs = HtmlXPathSelector(response)
    urlList = hxs.select('//div[@class="pack_pic"]//a/@href').extract()
    for url in urlList:
        item = DmozItem()
        item['_id'] = ObjectId()  # MongoDB ObjectId (from bson/pymongo)
        url = 'http://findicons.com%s' % url  # relative href -> absolute URL
        meta = {'item': item}
        yield Request(url, callback=self.parse_items, meta=meta)
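# A hypothetical completion (not in the source): the parse_items callback the
# request above chains to would pull the half-built item back out of
# response.meta and finish it on the detail page.
def parse_items(self, response):
    item = response.meta['item']  # the item handed over via Request.meta
    item['link'] = response.url
    yield item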
def parse(self, response):
    for i in response.xpath('//ul/li'):
        item = DmozItem()
        item['title'] = i.xpath('a/text()').extract()
        item['link'] = i.xpath('a/@href').extract()
        item['desc'] = i.xpath('text()').extract()
        yield item
        # title = i.xpath('a/text()').extract()
        # link = i.xpath('a/@href').extract()
        # desc = i.xpath('text()').extract()
        # print(title, link, desc)
    # also dump the raw page body to a local file named after the URL's
    # second-to-last path segment (the content is HTML despite the .json name)
    filename = response.url.split('/')[-2] + '.json'
    with open(filename, 'wb') as f:
        f.write(response.body)
def parse(self, response):
    # response.url is the page's URL; split("/")[-2] would take its
    # second-to-last path segment as a filename:
    # filename = response.url.split("/")[-2]
    # with open(filename, "wb") as f:
    #     f.write(response.body)
    ulli = response.xpath("//ul/li")
    for sel in ulli:
        item = DmozItem()
        item["title"] = sel.xpath("a/text()").extract()
        item["link"] = sel.xpath("a/@href").extract()
        item['description'] = sel.xpath('text()').extract()
        yield item
def parse_item(self, response):
    if response.xpath('//*[@id="sites-section"]'):
        for site_item in response.xpath("//*[@id='site-list-content']/div[@class]"):
            l = ItemLoader(item=DmozItem(), response=response)
            l.add_value('url',
                        site_item.xpath('./div[@class="title-and-desc"]/a/@href').extract_first())
            l.add_value('title',
                        site_item.xpath('./div[@class="title-and-desc"]/a/div/text()').extract())
            # the trailing space in "site-descr " is kept from the source
            l.add_value('descr',
                        site_item.xpath('./div[@class="title-and-desc"]/div[@class="site-descr "]/text()').extract())
            yield l.load_item()
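# ItemLoader.add_value accumulates everything into lists. A common companion
# (an assumption, not shown in the source) is a loader subclass with a
# TakeFirst output processor so each field loads as a single string:
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst

class DmozLoader(ItemLoader):
    default_output_processor = TakeFirst()

# usage: l = DmozLoader(item=DmozItem(), response=response)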
def parse(self, response):
    selector = HtmlXPathSelector(response)  # create the selector
    table = selector.select('//*[starts-with(@id, "pid")]')  # every post ("floor") on the page
    for each in table:  # process each post in turn
        item = DmozItem()  # instantiate an Item
        item['title'] = selector.select('//*[@id="thread_subject"]/text()').extract()[0]
        item['author'] = each.select(
            'tr[1]/td[@class="pls"]/div[@class="pls favatar"]/div[@class="pi"]/div[@class="authi"]/a/text()'
        ).extract()[0]
        item['post_time'] = each.select(
            'tr[1]/td[@class="plc"]/div[@class="pi"]'
        ).re(r'[0-9]+-[0-9]+-[0-9]+ [0-9]+:[0-9]+:[0-9]+')[0].decode("unicode_escape")
        content_list = each.select('.//td[@class="t_f"]').select('string(.)').extract()
        content = "".join(content_list)  # join the list into one string
        item['url'] = response.url  # the page URL comes straight off the response
        # strip newlines and spaces out of the content
        item['content'] = content.replace('\r\n', '').replace(' ', '').replace('\n', '')
        print "I am a test point"  # Python 2 print statement
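# Hedged completion (not in the source): a Scrapy parse() only emits what it
# yields or returns, so the loop above would normally end with
#         yield item
# after the test print.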
def parse(self, response):
    # an explicit parser avoids bs4's "no parser specified" warning
    soup = BeautifulSoup(response.body, 'html.parser')
    items = []
    length = len(soup.select('div .title-and-desc'))
    for i in range(length):
        item = DmozItem()
        # .string gives the node's own text content
        item['title'] = soup.select('div .site-title')[i].string
        # 'div .title-and-desc > a': <a> tags that are direct children of a
        # .title-and-desc element; .get('href') reads the link attribute
        item['link'] = soup.select('div .title-and-desc > a')[i].get('href')
        # .get_text() gives all text inside the node
        item['desc'] = soup.select('div .site-descr')[i].get_text()
        items.append(item)
    return items
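# An equivalent loop (a sketch, same selectors as above) that zips the three
# result lists instead of indexing into them by position:
from bs4 import BeautifulSoup

def parse(self, response):
    soup = BeautifulSoup(response.body, 'html.parser')
    items = []
    for title, link, descr in zip(soup.select('div .site-title'),
                                  soup.select('div .title-and-desc > a'),
                                  soup.select('div .site-descr')):
        item = DmozItem()
        item['title'] = title.string
        item['link'] = link.get('href')
        item['desc'] = descr.get_text()
        items.append(item)
    return items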