コード例 #1
0
    def parse_item(self, response):
        """Build a MeinvItem for one gallery page and synthesize the URLs
        of the remaining pages/images from the first page's URL pattern.

        :param response: gallery detail-page response
        :return: fully populated MeinvItem
        """
        item = MeinvItem()
        sel = Selector(response)
        item['title'] = sel.xpath(
            '//div[@class="content"]/h5/text()').extract()[0]
        item['img'] = sel.xpath(
            '//div[@class="content-pic"]/a/img/@src').extract()[0]
        # Strip the leading 5-character label from the metadata line.
        item['origin_time'] = sel.xpath(
            '//div[@class="content-msg"]/text()').extract()[0][5:]
        item['url'] = response.url
        item['url_arrs'] = [item['url']]
        item['img_arrs'] = [item['img']]
        item['catalogue'] = self.catalogue(item['url'])
        item['create_time'] = datetime.now()
        item['is_show'] = True
        # Page counter text is the page count wrapped in one character on
        # each side, e.g. "(5)".
        count = sel.xpath(
            '//div[@class="content-page"]/span[@class="page-ch"]/text()'
        ).extract()[0]
        count = int(count[1:-1])
        # Follow-up pages use the pattern <base>_<n>.html / <img-base><n>.jpg.
        # NOTE(review): range(2, count) omits the last page number; confirm
        # whether range(2, count + 1) was intended.
        for num in range(2, count):
            item['url_arrs'].append(
                response.url[:-5] + '_' + str(num) + '.html')
            item['img_arrs'].append(item['img'][:-5] + str(num) + '.jpg')
        return item
コード例 #2
0
 def parse(self, response):
     """Yield one MeinvItem per column block on the listing page, pairing
     the column heading with the lazy-loaded image URLs it contains."""
     for block in response.xpath('//div[@class="list_cont list_cont2 w1180"]'):
         heading = block.xpath('.//h2/text()').get()
         image_urls = block.xpath(
             './/ul[@class="clearfix"]/li/a/img/@data-original').getall()
         yield MeinvItem(headline=heading, urls=image_urls)
コード例 #3
0
 def load_item(self, d):
     """Build a MeinvItem from one list-entry selector.

     :param d: selector over a single list entry (an <a> wrapping an <img>)
     :return: MeinvItem with title, absolute url, normalized img url,
              catalogue and creation timestamp
     """
     item = MeinvItem()
     item['title'] = d.css('a::attr(title)').extract_first()
     # hrefs on the mobile site are relative; prepend the host.
     item['url'] = 'http://m.92mntu.com' + d.css('a::attr(href)').extract_first()
     # Drop the "www." prefix so the image host matches the CDN form.
     item['img'] = d.css('img::attr(src)').extract_first().replace('www.', '')
     item['img_arrs'] = []
     item['url_arrs'] = []
     item['catalogue'] = self.catalogue(item['url'])
     item['create_time'] = time.time()
     # Python-2 `print x` statement is a SyntaxError under Python 3;
     # use the print() function (debug trace).
     print(item['title'])
     return item
コード例 #4
0
    def parse(self, response):
        """Emit a MeinvItem for the current picture page, then follow the
        "next" link (when present) back into this callback."""
        page_item = MeinvItem()
        page_item['title'] = response.xpath(
            "/html/body/div[2]/div[2]/h1/text()").extract_first()
        page_item['imgurl'] = response.xpath(
            "//*[@id='picBody']/p/a[1]/img/@src").extract_first()
        yield page_item

        links = response.xpath("//*[@id='nl']/a/@href").extract()
        if links:
            yield scrapy.Request(
                "https://www.27270.com/ent/meinvtupian/2019/" + links[0],
                callback=self.parse)
コード例 #5
0
ファイル: img.py プロジェクト: VerwirrtBear/meinv
    def parse(self, response):
        """Yield this page's content images as one MeinvItem, then follow
        pagination back into this callback.

        Fixes vs. original: the local ``img`` list was never populated yet
        appended to ``a.txt`` on every call (always "[]") — removed; the
        ``extract()[1]`` index raised IndexError before the useless
        ``is not None`` check could run — now guarded; dead ``pass``
        removed.
        """
        item = MeinvItem()
        # A gallery page holds several content images; keep them all.
        item['imgurl'] = response.css('img.content_img::attr(src)').extract()
        yield item

        # The second a.a1 link is "next page"; the last page may expose
        # fewer than two links, so guard the index instead of crashing.
        pager_links = response.css('a.a1::attr(href)').extract()
        if len(pager_links) > 1:
            yield scrapy.Request(response.urljoin(pager_links[1]),
                                 callback=self.parse)
コード例 #6
0
    def parse_album(self, response):
        """Extract one image url/title from an album page, yield it as an
        item, schedule the image download, and follow album pagination.

        The album's title/url/tag arrive through ``response.meta`` from
        ``parse``.
        """
        item = MeinvItem()
        img_url = response.xpath(
            '//div[@class="articleBody"]/p/a/img/@src')[0].extract()
        img_title = response.xpath(
            '//div[@class="articleBody"]/p/a/img/@alt')[0].extract()
        item['image_title'] = img_title
        item['image_url'] = img_url
        item['album_title'] = response.meta['album_title']
        item['album_url'] = response.meta['album_url']
        item['tag'] = response.meta['tag']
        yield item
        # BUG FIX: 'img_title' previously carried the album title
        # (copy-paste); pass the image's own title to SaveImage.
        yield Request(img_url,
                      callback=self.SaveImage,
                      meta={
                          'album_title': item['album_title'],
                          'img_title': item['image_title'],
                      })

        # A pager <ul> exists only on multi-page albums; its last <li>
        # holds either the next-page href or a "#" placeholder on the
        # final page.
        status = response.xpath('//div[@class="pages"]/ul').extract()
        next_page = response.xpath(
            '//div[@class="pages"]/ul/li[last()]/a/@href').extract()
        if status:
            if next_page:
                if '#' not in next_page[0]:
                    next_url = response.urljoin(next_page[0])
                    yield Request(next_url,
                                  callback=self.parse_album,
                                  meta={
                                      'album_title': item['album_title'],
                                      'album_url': item['album_url'],
                                      'tag': item['tag']
                                  })
                else:
                    print('*************最后一页了 别翻了 ***********')
            else:
                print('******* 找不到下一页  %s图集只有一张图片*********' %
                      response.meta['album_url'])
コード例 #7
0
 def parse_item(self, response):
     """Scrape title/category/time/page-view/image links from a detail
     page and request the first image.

     NOTE(review): ``item`` is populated but never yielded nor passed via
     meta, so its fields are discarded — confirm whether ``self.a`` was
     meant to receive it.
     """
     item = MeinvItem()
     # Title
     item['title'] = response.xpath(
         '//h2[@class="main-title"]/text()').extract_first('')
     # Category
     item['classify'] = response.xpath(
         '//div[@class="main-meta"]/span[1]/a/text()').extract_first('')
     # Publish time (strip the "published on" label)
     item['time'] = "".join(
         response.xpath('//div[@class="main-meta"]/span[2]/text()').
         extract_first('')).replace('发布于', '')
     # Page views
     item['page_view'] = response.xpath(
         '//div[@class="main-meta"]/span[3]/text()').extract_first('')
     # Image links
     item['image_link'] = response.xpath(
         '//div[@class="main-image"]/p/a/img/@src').extract()
     # extract() yields [] when no image matches, so the original
     # unconditional [0] index could raise IndexError — guard it.
     if item['image_link']:
         yield scrapy.Request(item['image_link'][0], callback=self.a)
コード例 #8
0
    def parse_item(self, response):
        """Accumulate one gallery's image pages into a single MeinvItem.

        The first visit initialises the item; follow-up pages of the same
        gallery arrive with the partially built item in ``response.meta``
        and append to its ``img_arrs``. The finished item is yielded once
        the "next" link leaves the gallery.
        """
        sel = Selector(response)

        # dict.has_key() was removed in Python 3; use the `in` operator.
        # Also only allocate a fresh MeinvItem on the first visit (the
        # original built one unconditionally and discarded it on the
        # meta path).
        if 'item' not in response.meta:
            item = MeinvItem()
            item['url'] = response.url
            item['title'] = sel.xpath(
                '//h2[@class="main-title"]/text()').extract_first()
            item['img'] = sel.xpath(
                '//div[@class="main-image"]/p/a/img/@src').extract_first()
            item['img_arrs'] = []
            item['catalogue'] = sel.xpath(
                '//a[@rel="category tag"]/text()').extract_first()
        else:
            item = response.meta['item']

        item['img_arrs'].append({
            'img_title':
            sel.xpath('//h2[@class="main-title"]/text()').extract_first(),
            'img_url':
            sel.xpath(
                '//div[@class="main-image"]/p/a/img/@src').extract_first()
        })
        item['create_time'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                            time.localtime())
        item['origin_time'] = sel.xpath(
            '//span[contains(text(), "-")]').extract_first()

        # The "next" link stays inside the gallery while it contains the
        # gallery's base url; guard against extract_first() returning None
        # (the original .find() call would raise AttributeError).
        next_page = sel.xpath(
            '//div[@class="main-image"]/p/a/@href').extract_first()
        if next_page and item['url'] in next_page:
            yield Request(next_page,
                          meta={'item': item},
                          callback=self.parse_item,
                          dont_filter=False,
                          errback=self.errback_httpbin)
        else:
            yield item
コード例 #9
0
    def parse(self, response):
        """Collect album (title, url) pairs from a listing page, schedule
        each album for ``parse_album``, and follow listing pagination.
        """
        items = MeinvItem()
        # The tag is a fixed path segment of the listing URL — it is
        # loop-invariant, and hoisting it also fixes a NameError at the
        # final print when the page holds no albums.
        tag = response.url.split('/')[3]
        items['tag'] = tag
        a_tag_list = response.xpath('//div[@class="listBox"]/ul/li/a')
        for a_tag in a_tag_list:
            items['album_url'] = a_tag.xpath('@href')[0].extract()
            items['album_title'] = a_tag.xpath('@title')[0].extract()
            self.album_count += 1
            yield Request(items['album_url'],
                          callback=self.parse_album,
                          meta={
                              'album_url': items['album_url'],
                              'album_title': items['album_title'],
                              'tag': items['tag']
                          })

        next_page_status = response.xpath(
            '//div[@class="pages"]/ul/li[last()-1]/a/text()')[0].extract()
        if "下一页" in next_page_status:
            next_page = response.xpath(
                '//div[@class="pages"]/ul/li[last()-1]/a/@href')[0].extract()
            yield scrapy.Request(response.urljoin(next_page),
                                 callback=self.parse)
            # NOTE(review): time.sleep blocks the whole Twisted reactor;
            # prefer DOWNLOAD_DELAY. Kept for behavioural parity.
            time.sleep(1)
        items['album_count'] = self.album_count
        # NOTE(review): this yields a bare int, which Scrapy rejects as
        # spider output — confirm whether `yield items` was intended.
        yield items['album_count']

        print('*******************%s共有  %s  套 ' % (tag, items['album_count']))