Code Example #1
	def parse_dir_contents(self, response):
		for sel in response.xpath('//ul/li'):
			item = DmozItem()
			item['title'] = sel.xpath('a/text()').extract()
			item['link'] = sel.xpath('a/@href').extract()
			item['desc'] = sel.xpath('text()').extract()
			yield item
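All of these snippets assume a DmozItem declared in the project's items.py. A minimal definition matching the fields used above would look like the sketch below; each project's actual class may differ (some examples later use other field names such as description, descr, url, author).

    import scrapy

    class DmozItem(scrapy.Item):
        # fields filled in by the parse callbacks
        title = scrapy.Field()
        link = scrapy.Field()
        desc = scrapy.Field()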
Code Example #2
	def parse(self, response):
		for sel in response.xpath('//ul[@class="directory-url"]/li'):
			item = DmozItem()
			item['title'] = sel.xpath('a/text()').extract()
			item['link'] = sel.xpath('a/@href').extract()
			item['desc'] = sel.xpath('text()').extract() # result is a list
			yield item
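As the inline comment notes, .extract() always returns a list, even when only one node matches. Recent Scrapy versions also expose .get() / .getall() on selectors, so the loop body above could be written as in this sketch:

    item['title'] = sel.xpath('a/text()').get()    # first match, or None
    item['link'] = sel.xpath('a/@href').get()
    item['desc'] = sel.xpath('text()').getall()    # the full list, like extract()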
Code Example #3
    def parse(self, response):
        sel = HtmlXPathSelector(response)
        sites = sel.select('//ul/li')
        items = []
        for site in sites:
            item = DmozItem()
            item['title'] = site.select('a/text()').extract()
            item['link'] = site.select('a/@href').extract()
            item['desc'] = site.select('text()').extract()
            items.append(item)

        return items
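Example #3 is written against the legacy pre-1.0 Scrapy API: HtmlXPathSelector and its .select() method, plus returning a collected list. In current Scrapy the response exposes .xpath() directly and callbacks usually yield items one by one; an equivalent sketch:

    def parse(self, response):
        for site in response.xpath('//ul/li'):
            item = DmozItem()
            item['title'] = site.xpath('a/text()').extract()
            item['link'] = site.xpath('a/@href').extract()
            item['desc'] = site.xpath('text()').extract()
            yield item  # no need to build the items list in memory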
Code Example #4
    def parse_list(self, response):
        print(response.body)  # debug: dump the raw page
        hxs = HtmlXPathSelector(response)
        urlList = hxs.select('//div[@class="pack_pic"]//a/@href').extract()

        for url in urlList:
            item = DmozItem()
            item['_id'] = ObjectId()  # MongoDB ObjectId (from bson)

            # build an absolute URL and hand the item to the next callback
            url = 'http://findicons.com%s' % url
            meta = {'item': item}
            yield Request(url, callback=self.parse_items, meta=meta)
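Example #4 hands the half-filled item to a second callback through meta; parse_items itself is not shown in the source. A hypothetical counterpart that picks the item back up could look like this (the body of parse_items and the icon_url field are assumptions, not part of the original project). Note also that response.urljoin(url) is the usual way to absolutize a relative href instead of string interpolation.

    def parse_items(self, response):
        item = response.meta['item']     # the item created in parse_list
        item['icon_url'] = response.url  # hypothetical field, for illustration only
        yield item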
Code Example #5
File: dmoz_spider.py  Project: foodish/python_project
	def parse(self, response):
		for i in response.xpath('//ul/li'):
			item = DmozItem()
			item['title'] = i.xpath('a/text()').extract()
			item['link'] = i.xpath('a/@href').extract()
			item['desc'] = i.xpath('text()').extract()
			yield item
			# title = i.xpath('a/text()').extract()
			# link = i.xpath('a/@href').extract()
			# desc = i.xpath('text()').extract()
			# print(title, link, desc)

		filename = response.url.split('/')[-2] + '.json'
		with open(filename, 'wb') as f:
			f.write(response.body)  # note: writes the raw HTML body, despite the .json name
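Note that Example #5 writes the raw HTML body into a file that merely has a .json name; the items themselves are only yielded. If the goal is a JSON file of the scraped items, Scrapy's feed exports handle it with no file I/O in the spider (assuming the spider is named dmoz):

    scrapy crawl dmoz -o items.json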
Code Example #6
    def parse(self, response):
        # response.url.split("/")[-2] is the last path segment of the URL
        # (inspect the page structure with the browser dev tools, F12)
        # filename = response.url.split("/")[-2]
        # with open(filename, "wb") as f:
        #     f.write(response.body)

        ulli = response.xpath("//ul/li")
        for sel in ulli:
            # populate one DmozItem per <li>
            item = DmozItem()
            item["title"] = sel.xpath("a/text()").extract()
            item["link"] = sel.xpath("a/@href").extract()
            item['description'] = sel.xpath('text()').extract()
            yield item
Code Example #7
    def parse_item(self, response):
        if response.xpath('//*[@id="sites-section"]'):
            for site_item in response.xpath(
                    "//*[@id='site-list-content']/div[@class]"):
                l = ItemLoader(item=DmozItem(), response=response)
                l.add_value(
                    'url',
                    site_item.xpath(
                        './div[@class="title-and-desc"]/a/@href').extract_first())
                l.add_value(
                    'title',
                    site_item.xpath(
                        './div[@class="title-and-desc"]/a/div/text()').extract())
                # the trailing space in "site-descr " matches the class attribute in the page source
                l.add_value(
                    'descr',
                    site_item.xpath(
                        './div[@class="title-and-desc"]/div[@class="site-descr "]/text()'
                    ).extract())
                yield l.load_item()
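Example #7 extracts values first and then feeds them to the ItemLoader with add_value. An ItemLoader can also be constructed around the per-site selector and run the XPath itself via add_xpath, which keeps the callback shorter; a sketch under the same DmozItem/ItemLoader imports:

    def parse_item(self, response):
        for site_item in response.xpath("//*[@id='site-list-content']/div[@class]"):
            l = ItemLoader(item=DmozItem(), selector=site_item)  # XPaths below are relative to site_item
            l.add_xpath('url', './div[@class="title-and-desc"]/a/@href')
            l.add_xpath('title', './div[@class="title-and-desc"]/a/div/text()')
            l.add_xpath('descr', './div[@class="title-and-desc"]/div[@class="site-descr "]/text()')
            yield l.load_item()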
Code Example #8
    def parse(self, response):
        selector = HtmlXPathSelector(response)  # build a selector for the page

        table = selector.select('//*[starts-with(@id, "pid")]')  # all post "floors"
        for each in table:  # handle each floor in turn
            item = DmozItem()  # instantiate an Item
            item['title'] = selector.select(
                '//*[@id="thread_subject"]/text()').extract()[0]
            item['author'] = \
                each.select('tr[1]/td[@class="pls"]/div[@class="pls favatar"]/div[@class="pi"]/div[@class="authi"]/a/text()').extract()[0]
            item['post_time'] = \
                each.select('tr[1]/td[@class="plc"]/div[@class="pi"]').re(r'[0-9]+-[0-9]+-[0-9]+ [0-9]+:[0-9]+:[0-9]+')[0]
            content_list = each.select('.//td[@class="t_f"]').select(
                'string(.)').extract()
            content = "".join(content_list)  # join the list into one string
            item['url'] = response.url  # the URL of the page being parsed
            # strip newlines and spaces from the content
            item['content'] = content.replace('\r\n', '').replace(' ',
                                                                  '').replace(
                                                                      '\n', '')
            print("I am a test point")  # debug marker
            yield item  # the original snippet never emitted the item; yield it so pipelines see it
Code Example #9
    def parse(self, response):

        soup = BeautifulSoup(response.body, 'html.parser')  # explicit parser avoids the bs4 warning

        items = []

        length = len(soup.select('div .title-and-desc'))

        for i in range(length):
            item = DmozItem()

            item['title'] = soup.select('div .site-title')[i].string
            # .string returns the node's text content
            item['link'] = soup.select('div .title-and-desc > a')[i].get(
                'href')
            # direct <a> child of the div with class title-and-desc;
            # .get('href') reads the link target
            item['desc'] = soup.select('div .site-descr')[i].get_text()
            # .get_text() returns all text inside the node
            items.append(item)

        return items
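Example #9 walks three parallel CSS selections by a shared index. When the selections are guaranteed to line up, zipping them is less error-prone than indexing; a sketch of the same loop (still assuming the DmozItem fields above):

    titles = soup.select('div .site-title')
    links = soup.select('div .title-and-desc > a')
    descs = soup.select('div .site-descr')
    for t, a, d in zip(titles, links, descs):
        item = DmozItem()
        item['title'] = t.string       # text content of the title node
        item['link'] = a.get('href')   # link target of the direct <a> child
        item['desc'] = d.get_text()    # all text inside the description node
        items.append(item)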