Example #1
0
    def parse_review(self, response):
        """Parse a review listing page and yield one item per review.

        Pages outside the target category are skipped. Page-level fields
        are loaded once into a master item, which is copied into each
        per-review item so every review inherits the shared fields.
        """
        sel = Selector(response)

        if not self._is_right_category(sel):
            self.log('Skip URL: %s' % response.url, level=log.INFO)
            return

        self.log('Parse URL: %s' % response.url, level=log.INFO)

        loader = ItemLoader(item=YelpReview(), selector=sel)
        loader.add_value('crawl_date', '%s' % datetime.utcnow())
        loader.add_value('page_url', response.url)

        # Loop over all the fields we need to extract.
        # FIX: dict.items() instead of the Python-2-only iteritems(),
        # so the spider runs on both Python 2 and Python 3.
        for field, selector in self._item_selectors.items():
            loader.add_xpath(field, selector)

        master_review = loader.load_item()
        review_selectors = sel.xpath('//div[contains(@class, "review")][@itemprop="review"]')

        for rev_sel in review_selectors:
            # Seed each review loader with a copy of the page-level item.
            review_loader = ItemLoader(item=master_review.copy(), selector=rev_sel)

            for field, selector in self._review_selectors.items():
                review_loader.add_xpath(field, selector)

            yield review_loader.load_item()
Example #2
0
 def parse_movie(self, response):
     """Scrape an IMDb title page into a MovieItem.

     If a user-review teaser link exists, a follow-up Request is returned
     (with the partially filled item stashed in its meta); otherwise the
     item itself is returned.
     """
     loader = ItemLoader(item=MovieItem(), response=response)
     loader.add_xpath(
         'name',
         '//div[@id="title-overview-widget"]/div[2]/div[2]/div/div[2]/div[2]/h1/text()'
     )
     loader.add_xpath('year', "//h1/span[@id='titleYear']/a/text()")
     loader.add_xpath(
         'rate',
         "//div[@id='title-overview-widget']/div[2]/div[2]/div/div[1]/div[1]/div[1]/strong/span/text()"
     )
     # Two alternative layouts place the director block differently.
     loader.add_xpath('director',
                      "//div[2]/div[1]/div[2]/span/a/span/text()")
     loader.add_xpath('director',
                      "//div[3]/div[1]/div[2]/span/a/span/text()")
     loader.add_xpath('storyline',
                      "//div[@id='titleStoryLine']/div[1]/p/text()")
     user_review_url = response.xpath(
         "//div[@id='titleUserReviewsTeaser']/div/div[3]/a[2]/@href"
     ).extract()
     item = loader.load_item()
     user_review_another_url = response.xpath(
         "//div[@id='titleUserReviewsTeaser']/div/div[2]/a[2]/@href"
     ).extract()
     if user_review_url or user_review_another_url:
         # BUG FIX: the original left full_url == 0 when BOTH candidate
         # links were present, and urljoin(response.url, 0) raises.
         # Prefer the primary teaser link, fall back to the alternate.
         if user_review_url:
             full_url = urljoin(response.url, user_review_url.pop())
         else:
             full_url = urljoin(response.url, user_review_another_url.pop())
         request = Request(full_url,
                           callback=self.parse_audience_review)
         request.meta['item'] = item
         return request
     return item
Example #3
0
File: Xs.py Project: kevking/xs
 def parse(self, response):
     """Yield a single XsContentItem holding the chapter title and body."""
     loader = ItemLoader(item=XsContentItem(), response=response)
     # Chapter title
     loader.add_xpath('title', '//*[@class="bookname"]/h1/text()')
     # Chapter body text
     loader.add_xpath('text', '//*[@id="content"]/text()')
     yield loader.load_item()
Example #4
0
    def parse(self, response):
        """Yield one YoutubeVideo item (link only) per channel-grid entry."""
        for video_cell in response.css("ul#channels-browse-content-grid > li"):
            video_loader = ItemLoader(YoutubeVideo(), selector=video_cell)
            video_loader.add_xpath('link', './/h3/a/@href')
            yield video_loader.load_item()
    def create_tag_items(self, task_id, item_id):
        """Build the list of FeedEntryTag items for one feed entry.

        A fixed 'NRC' tag is always emitted first, followed by one item
        per tag row loaded from the database for *task_id*.
        """
        loader = ItemLoader(FeedEntryTag())
        loader.add_value('feed_entry_id', item_id)
        loader.add_value('tag', 'NRC')
        tag_items = [loader.load_item()]

        for row in self.db.loadNrcTags(task_id):
            loader = ItemLoader(FeedEntryTag())
            loader.add_value('feed_entry_id', item_id)
            loader.add_value('tag', row['tag'])
            loader.add_value('comment', row['comment'])
            tag_items.append(loader.load_item())
        return tag_items
 def create_tag(self, feed_entry_id, tag, comment=''):
     """Build a single FeedEntryTag item from the given field values."""
     # TODO: create tags
     loader = ItemLoader(FeedEntryTag())
     for field, value in (('feed_entry_id', feed_entry_id),
                          ('tag', tag),
                          ('comment', comment)):
         loader.add_value(field, value)
     return loader.load_item()
Example #7
0
    def parse_items(self, response):
        """Yield one Articulos item with the page's title and description."""
        loader = ItemLoader(Articulos(), response)
        loader.add_xpath('title',
                         '//*[@id="MainContainer"]/article/section[1]/div[1]/div/h2/text()')
        loader.add_xpath('description',
                         '//*[@id="MainContainer"]/article/section[1]/div[2]/ul/li[3]/text()')
        yield loader.load_item()

# scrapy runspider multiplepages.py -o ../../resources/computrabajo.csv -t csv
Example #8
0
 def parse_detail(self, response):
     """Build a MeizituItem from a detail page: title, images, source URL."""
     loader = ItemLoader(item=MeizituItem(), response=response)
     loader.add_xpath("title", "//h2/a/text()")
     loader.add_xpath("image_urls", '//div[@id="picture"]//img/@src')
     loader.add_value('url', response.url)
     return loader.load_item()
Example #9
0
 def parse_item(self, response):
     """Extract picture URLs from a gallery page
     (e.g. http://www.meizitu.com/a/5336.html)."""
     loader = ItemLoader(item=MeizituItem(), response=response)
     # Identity() keeps the full list of matched URLs unchanged.
     loader.add_xpath('image_urls', "//div[@id='picture']/p/img/@src",
                      Identity())
     loader.add_value('url', response.url)
     return loader.load_item()
Example #10
0
    def parse(self, response):
        """Parse the news-list JSON response and yield one NetNews item
        per list entry.

        The body is a JSON object mapping a key to a list of news dicts;
        list fields go straight into the loader and the article body is
        fetched separately by news id.
        """
        data = json.loads(response.text)
        for key in data:
            entries = data[key]
            logging.info("total size: " + str(len(entries)))
            # NOTE(review): iteration starts at index 1, so entry 0 is
            # skipped — presumably a header/ad slot; confirm upstream.
            for index in range(1, len(entries)):
                logging.info("curIndex: " + str(index))
                entry = entries[index]
                logging.info("title: " + entry["title"])

                loader = ItemLoader(item=NetNews(), response=response)
                loader.add_value("news_title", entry["title"])
                loader.add_value("news_digest", entry["digest"])
                # print "recSource: "+data[key][i]["recSource"]
                loader.add_value("news_label", entry["recSource"])
                loader.add_value("news_img_url", entry["imgsrc"])
                loader.add_value("news_id", entry["id"])

                # Full article body is fetched in a separate call.
                detail = self.getNewsDetail(entry["id"])
                if len(detail) != 0:
                    loader.add_value("news_detail", detail)

                # Attach the crawl timestamp.
                loader.add_value("timestamp", self.get_timestamp())
                yield loader.load_item()
Example #11
0
    def parse(self, response):
        """Yield one JianshuSummaryItem per note in the listing page."""
        note_divs = response.xpath(
            '//ul[@class="note-list"]/li/div[@class="content"]')
        for note in note_divs:
            loader = ItemLoader(item=JianshuSummaryItem(),
                                selector=note,
                                response=response)
            # Normalise ASCII and CJK pipe separators in titles to dots.
            loader.add_xpath(
                'title', 'a[@class="title"]/text()',
                MapCompose(lambda t: t.replace('|', '.').replace('丨', '.')))
            loader.add_xpath('link', 'a[@class="title"]/@href',
                             MapCompose(lambda u: urljoin(response.url, u)))
            loader.add_xpath(
                'author',
                'div[@class="author"]/div[@class="info"]/a[@class="nickname"]/text()'
            )
            loader.add_xpath(
                'author_url',
                'div[@class="author"]/div[@class="info"]/a[@class="nickname"]/@href',
                MapCompose(lambda u: urljoin(response.url, u)))
            loader.add_xpath(
                'timestamp',
                'div[@class="author"]/div[@class="info"]/span[@class="time"]/@data-shared-at'
            )
            # Numeric counters: strip surrounding whitespace, then int().
            for field, xpath in (
                    ('read', 'div[@class="meta"]/a[1]/text()[2]'),
                    ('reply', 'div[@class="meta"]/a[2]/text()[2]'),
                    ('like', 'div[@class="meta"]/span[1]/text()'),
                    ('money', 'div[@class="meta"]/span[2]/text()')):
                loader.add_xpath(field, xpath, MapCompose(str.strip, int))

            yield loader.load_item()
Example #12
0
    def parse(self, response):
        """Scrape an article page: mirror its images locally, strip the
        HTML tags from the body, and store title/body in MongoDB.
        """
        # NOTE: a dead bare-string block (old commented-out file-writing
        # code) and the unused `content`/`page` locals it referenced were
        # removed; behavior is unchanged.
        loader = ItemLoader(item=TutorialItem(), response=response)
        loader.add_xpath('title', "//title/text()")
        loader.add_xpath('content', "//div[@class='body textStyle']")
        data = loader.load_item()

        # Download the article's images so local copies exist.
        downFile = DownFile(data['content'][0], 'http://www.admin10000.com')
        downFile.downImgFile()

        mongo = Mongo("articles")
        mongo.setTable("admin10000")

        # Strip HTML tags before persisting the body.
        strip = StripTags()
        content = strip.filterTags(data['content'][0])

        article = {'title': data['title'][0], 'content': content}
        mongo.add(article)
Example #13
0
    def parse_book_url(self, response):
        """Pull the book id and download path out of inline scripts, then
        POST to resolve the real download link via get_book_link."""
        book_item = BookDetails(book_id="", book_type="pdf")
        loader = ItemLoader(item=book_item, response=response)
        # Both values are embedded in inline <script> blocks.
        loader.add_xpath("book_id", "/*//script/text()",
                         re=r'bookId\s*:\s*(.*),.*')
        loader.add_xpath("book_path", "/*//script/text()",
                         re=r'getDownloadUrl\s*:\s*\"(.*)\".*')
        loader.load_item()

        download_url = self.base_url + book_item['book_path'][0]
        post_data = ("book_id=" + book_item['book_id'][0] +
                     "&" + "type=" + book_item['book_type'])

        # Headers for the form-encoded POST.
        post_header = {
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "User-Agent": "Mozilla/5.0",
        }

        yield Request(download_url,
                      self.get_book_link,
                      headers=post_header,
                      method='POST',
                      body=post_data)
Example #14
0
    def process_row(self, row, task):
        """Load one permit row, de-duplicate against the DB, and yield new
        items plus feed entries for recently issued permits."""
        stats = self.crawler.stats

        loader = ItemLoader(WV_DrillingPermit())
        loader.add_value(None, row)
        item = loader.load_item()

        # (API, activity type, activity date) identifies a permit event;
        # rows missing any part of the key are ignored.
        if not (item['API'] and item['permit_activity_type']
                and item['permit_activity_date']):
            return

        key = {
            'API': item['API'],
            'permit_activity_type': item['permit_activity_type'],
            'permit_activity_date': item['permit_activity_date'],
        }
        if self.db.loadItem(item, key):
            stats.inc_value('_existing_count', spider=self)
            return

        stats.inc_value('_new_count', spider=self)
        yield item

        dt = datetime.strptime(item['permit_activity_date'],
                               '%Y-%m-%d %H:%M:%S')
        #                if item['permit_activity_type'] in ('Permit Issued', 'Permit Commenced', 'Permit Completed'):
        # Only issued permits from the last year produce feed entries.
        if item['permit_activity_type'] in (
                'Permit Issued', 'Permits Issued'
        ) and datetime.now() - dt < timedelta(days=365):
            for entry in self.create_feed_entry(item, task):
                yield entry
Example #15
0
 def parse_item(self, response):
     """Build a HabrahabrItem with title, images and all page comments."""
     sel = response.xpath('.//div[@class="content_left"]')
     if not sel:
         return
     loader = ItemLoader(item=HabrahabrItem(), selector=sel, response=response)
     loader.add_xpath('title', '//h1/span/text()')
     loader.add_xpath('image_urls', '//div[@class="content html_format"]/img/@src')
     # Wrap each raw comment HTML fragment in its own HabrahabrComment item.
     comment_items = []
     for raw_comment in sel.xpath('//div[starts-with(@class, "message html_format")]').extract():
         comment_loader = ItemLoader(item=HabrahabrComment(), selector=sel,
                                     response=response)
         comment_loader.add_value('comment', raw_comment)
         comment_items.append(comment_loader.load_item())
     loader.add_value('comments', comment_items)
     yield loader.load_item()
Example #16
0
    def get_product_details(self, response):
        """Load a VisionsProduct from a product page, handling empty pages
        and patching up the price and image fields."""
        crumbs = self.get_breadcrumbs(response)
        loader = ItemLoader(item=VisionsProduct())

        loader.add_value('breadcrumbs', crumbs)
        loader.add_value('url', response.url)

        # A single breadcrumb string doubles as the category.
        if isinstance(crumbs, basestring):
            loader.add_value('category', crumbs)

        # Ensure we aren't wasting time extracting from an empty page
        if extract_helper(response, self.EMPTY_PAGE_CHECK):
            for detail in self.PRODUCT_DETAILS:
                if '_' not in detail.name:  # Don't load price
                    loader.add_value(detail.name, 'N/A')
        else:
            details = detailsRunner(self.PRODUCT_DETAILS, response=response)

            # Fall back to the GIF-rendered price if the text one is missing.
            if not details['price']:
                details['price'] = details['price_gif']
            details.pop('price_gif')

            # Fix truncated image urls
            if details['image']:
                details['image'] = add_schema(response.url, details['image'])

            for name in details:
                loader.add_value(name, details[name])

        yield loader.load_item()
Example #17
0
 def get_app(self, response):
     """Scrape a Play Store app page into a PlayStoreItems item."""
     loader = ItemLoader(item=PlayStoreItems(), response=response)
     # Shared processor that resolves relative hrefs against the page URL.
     absolutize = Compose(
         lambda urls: [urljoin(response.url, url) for url in urls])

     loader.add_css('app_id', '.details-wrapper::attr(data-docid)')
     loader.add_css('name', '.document-title div::text')
     loader.add_css('category', '.category span::text')
     loader.add_css('category_url', '.category::attr(href)', absolutize)
     loader.add_css('price', '.details-actions .price span::text')
     loader.add_css('offers_in_app_purchases', '.inapp-msg::text')
     loader.add_css('stars_count', '.stars-count::text')
     loader.add_css('video', '.details-trailer > span::attr(data-video-url)')
     loader.add_css('screenshots', '.screenshot::attr(src)')
     loader.add_xpath(
         'description',
         '//div[contains(@class, "show-more-content")]/div//text()')
     loader.add_css('update_date', '[itemprop="datePublished"]::text')
     loader.add_css('file_size', '[itemprop="fileSize"]::text')
     loader.add_css('installs', '[itemprop="numDownloads"]::text')
     loader.add_css('current_version', '[itemprop="softwareVersion"]::text')
     loader.add_css('requires_android', '[itemprop="operatingSystems"]::text')
     loader.add_css('offered_by', '[itemprop="author"] > a span::text')
     loader.add_css('offered_by_url', '[itemprop="author"] > a::attr(href)',
                    absolutize)
     yield loader.load_item()
Example #18
0
 def parse(self, response):
     """Return a JianshuArticleItem holding the article body and its URL."""
     loader = ItemLoader(item=JianshuArticleItem(), response=response)
     loader.add_xpath(
         'content',
         '//div[@class="article"]/div[@class="show-content"]/p/text()')
     loader.add_value('url', response.url)
     return loader.load_item()
    def parse_product(self, response):
        """Yield a Product item scraped from a product detail page."""
        loader = ItemLoader(item=Product(), response=response)
        loader.add_css('nome', 'h1.livedata::text')
        loader.add_value('url', response.url)
        loader.add_css('descricaoLonga', '.desc-info')
        # BUG FIX: the original character class ["|'] also matched a
        # literal '|' as a quote delimiter; match only real quotes. The
        # raw string also avoids invalid-escape warnings.
        loader.add_css('image',
                       'div.container-product-image a.image-link > img',
                       re=r'src=["\'](?P<src>[^"\']+)["\']')
        loader.add_css('categorias', 'span[itemprop=title]::text')
        yield loader.load_item()


#executar no mongo
#db.produto.remove({'categorias.0': {$exists: false}})
#db.produto.remove({'categorias.0': {$nin: [' Games', ' Livros', ' DVDs e Blu-ray']}})

#deleta produtos duplicados
#var duplicates = [];

#db.produto_novo.aggregate([
#{"$group" : { "_id": "$nome", "count": { "$sum": 1 }, "dups": { "$addToSet": "$_id" },  }},
#{"$match": {"_id" :{ "$ne" : null } , "count" : {"$gt": 1} } }]
#,{allowDiskUse: true},{cursor:{}}
#).result.forEach(function(doc) {
#doc.dups.shift();
#doc.dups.forEach( function(dupId){
#duplicates.push(dupId);
#}
#)
#})
#printjson(duplicates);
#db.produto_novo.remove({_id:{$in:duplicates}})
Example #20
0
 def parsePage(self, response):
     """Load a rental listing: synthesized id, page link and title."""
     loader = ItemLoader(item=RentItem(), response=response)
     # Listing id is "<spider name>-<numeric id from the URL filename>".
     page_id = response.url.split('/')[-1].split('.')[0]
     loader.add_value('id', self.name + '-' + page_id)
     loader.add_value('link', response.url)
     loader.add_xpath('title', "//dl[@class = 'title']/dt/p/text()")
     return loader.load_item()
Example #21
0
 def detail(self, response):
     """Parse a PepperFry product page into a BillionPricesIndiaItem.

     Returns the populated item, or None when the product name or the
     price is missing from the page.
     """
     log.msg(response.url)
     hxs = HtmlXPathSelector(response)

     # //*[@id="vip_content_section"]/div[2]/h1
     names = hxs.xpath(
         '//*[@id="vip_content_section"]/div[2]/h1/text()').extract()
     prices = hxs.xpath('//*[@id="price-val"]/text()').extract()

     # BUG FIX: the original guard mixed `or` with `!= None` on lists, so
     # it was effectively always true (and extracted each value twice).
     # Require BOTH fields to be present before building the item.
     if not names or not prices:
         return None

     l = ItemLoader(item=BillionPricesIndiaItem(), response=response)
     l.add_xpath('product_name',
                 '//*[@id="vip_content_section"]/div[2]/h1/text()')
     # l.add_xpath('quantity', '//*[@id="product_detail_view_1"]/div/div[1]/div/text()')
     l.add_xpath('category', '//*[@id="cat_crum"]/@value')
     l.add_xpath('product', '//*[@id="overview_tab"]/div/div/p/text()')

     item = l.load_item()
     item['product_url'] = response.url
     item['price'] = prices[0]
     item['vendor'] = 'PepperFry'
     item['city'] = 'Mumbai'
     item['state'] = 'Maharashtra'
     item['country'] = 'India'
     item['date'] = str(time.strftime("%d/%m/%Y"))
     return item
Example #22
0
    def parse_item(self, response):
        """Build a MeizituItem: name, tags, image URLs and source URL."""
        loader = ItemLoader(item=MeizituItem(), response=response)
        loader.add_xpath('name', '//h2/a/text()')
        loader.add_xpath('tags', "//div[@id='maincontent']/div[@class='postmeta  clearfix']/div[@class='metaRight']/p")
        # Identity() preserves the full list of matched image URLs.
        l = loader
        l.add_xpath('image_urls', "//div[@id='picture']/p/img/@src", Identity())
        l.add_value('url', response.url)
        return l.load_item()
 def parse_item(self, response):
     """Build a CrawlpictureItem: name, tags, image URLs and source URL."""
     loader = ItemLoader(item=CrawlpictureItem(), response=response)
     loader.add_xpath('name', '//h2/a/text()')
     loader.add_css('tags', 'div.metaRight p::text')
     # Identity() keeps every matched image URL.
     loader.add_css('image_urls', 'div.postContent img::attr(src)', Identity())
     loader.add_value('url', response.url)
     return loader.load_item()
Example #24
0
    def parse_item(self, response):
        """Yield one ProxyHunterItem (ip/port/protocol) per table row."""
        for row in response.xpath('//table[@id="basic"]/tbody/tr'):
            loader = ItemLoader(ProxyHunterItem(), selector=row)
            for field, xpath in (('ip', 'td[2]/a/text()'),
                                 ('port', 'td[3]/text()'),
                                 ('prot', 'td[4]/a/text()')):
                loader.add_xpath(field, xpath)
            yield loader.load_item()
Example #25
0
 def parse(self, response):
     """Yield one LiveItem per live-room entry on the Panda listing page."""
     # FIX: removed an unused page-level ItemLoader ("items") that was
     # created but never populated or loaded.
     for content in response.xpath('//*[@id="sortdetail-container"]/li/a'):
         room = ItemLoader(item=LiveItem(), selector=content)
         # Room title
         room.add_xpath('title', 'div[2]/span[1]/text()')
         # Streamer user name
         room.add_xpath('username', 'div[2]/span[2]/@title')
         # Popularity counter
         room.add_xpath('num', 'div[2]/span[4]/i/text()')
         # Thumbnail image URL
         room.add_xpath('pic_addr', 'div[1]/img/@data-original')
         # Relative URL of the live room
         room.add_xpath('addr', '@href')
         # Source platform tag
         room.add_value('platform', 'panda')
         yield room.load_item()
Example #26
0
 def parse(self, response):
     """Yield one LiveItem per live-room entry on the YY listing page."""
     # FIX: removed an unused page-level ItemLoader ("items") that was
     # created but never populated or loaded.
     for content in response.xpath('/html/body/div[3]/div[2]/div/div/div[2]/div/ul/li/a'):
         room = ItemLoader(item=LiveItem(), selector=content)
         # Room title
         room.add_xpath('title', 'span[3]/text()')
         # Streamer user name
         room.add_xpath('username', 'span[5]/text()')
         # Popularity counter
         room.add_xpath('num', 'span[2]/text()')
         # Thumbnail image URL
         room.add_xpath('pic_addr', 'span[1]/span[1]/img/@data-original')
         # Relative URL of the live room
         room.add_xpath('addr', '@href')
         # Source platform tag
         room.add_value('platform', 'yy')
         yield room.load_item()
Example #27
0
 def parse(self, response):
     """Yield one LiveItem per live-room entry on the Douyu listing page."""
     # FIX: removed an unused page-level ItemLoader ("items") that was
     # created but never populated or loaded.
     for content in response.xpath('//*[@id="live-list-contentbox"]/li/a'):
         room = ItemLoader(item=LiveItem(), selector=content)
         # Room title
         room.add_xpath('title', 'div/div/h3/text()')
         # Streamer user name
         room.add_xpath('username', 'div[1]/p/span[1]/text()')
         # Popularity counter
         room.add_xpath('num', 'div[1]/p/span[2]/text()')
         # Thumbnail image URL
         room.add_xpath('pic_addr', 'span/img/@data-original')
         # Relative URL of the live room
         room.add_xpath('addr', '@href')
         # Source platform tag
         room.add_value('platform', 'douyu')
         yield room.load_item()
 def parse(self, response):
     """Return a list with one CategoryItem per menu heading on the page."""
     results = []
     for heading in response.css(".menu_box .menu_main h2"):
         loader = ItemLoader(item=CategoryItem(), response=response)
         # self.get_text post-processes the raw <h2> markup into plain text.
         loader.add_value("category", heading.extract(), self.get_text)
         results.append(loader.load_item())
     return results
Example #29
0
    def parse_item(self, response):
        """Build a MeizituItem holding the image URLs and the page URL."""
        loader = ItemLoader(item=MeizituItem(), response=response)
        # Identity() keeps the whole list of matched image URLs.
        loader.add_xpath('img_url', '//div[@class="text"]/p/br/img/@src',
                         Identity())
        loader.add_value('url', response.url)
        return loader.load_item()
 def _set_loader(self, response, xs, item):
     """Pick and configure the loader for this response.

     List pages (selector given) load from *xs*; detail pages reuse the
     item stashed in the request meta. JSON content gets a JsonItemLoader,
     anything else a plain ItemLoader.
     """
     if xs:
         self.from_detail_page = False
         if self.scraper.content_type == 'J':
             self.loader = JsonItemLoader(item=item, selector=xs)
         else:
             self.loader = ItemLoader(item=item, selector=xs)
     else:
         self.from_detail_page = True
         # Continue filling the item carried over from the list page.
         item = response.request.meta['item']
         if self.scraper.detail_page_content_type == 'J':
             json_resp = json.loads(response.body_as_unicode())
             self.loader = JsonItemLoader(item=item, selector=json_resp)
         else:
             self.loader = ItemLoader(item=item, response=response)
     self.loader.default_output_processor = TakeFirst()
     self.loader.log = self.log