Example #1
0
 def parse_detail(self, response):
     """Populate a ``MeizituItem`` from a detail page.

     The title is read from the ``h2`` link text, the image URLs from every
     ``img`` below the ``#picture`` div, and ``url`` is the address of the
     response itself.
     """
     loader = ItemLoader(item=MeizituItem(), response=response)
     xpath_fields = (
         ("title", "//h2/a/text()"),
         ("image_urls", '//div[@id="picture"]//img/@src'),
     )
     for field_name, query in xpath_fields:
         loader.add_xpath(field_name, query)
     loader.add_value('url', response.url)
     return loader.load_item()
Example #2
0
    def parse_items(self, response):
        """Yield one ``Articulos`` item scraped from a detail page.

        ``title`` and ``description`` are read with absolute XPath
        expressions rooted at the ``#MainContainer`` element.
        """
        # Pass the response by keyword: the second positional parameter of
        # ItemLoader is ``selector``, so handing it a response positionally
        # only works because responses happen to expose ``.xpath()``.
        loader = ItemLoader(item=Articulos(), response=response)
        loader.add_xpath(
            'title',
            '//*[@id="MainContainer"]/article/section[1]/div[1]/div/h2/text()')
        loader.add_xpath(
            'description',
            '//*[@id="MainContainer"]/article/section[1]/div[2]/ul/li[3]/text()')
        yield loader.load_item()

# scrapy runspider multiplepages.py -o ../../resources/computrabajo.csv -t csv
Example #3
0
 def parse_item(self, response):
     """Collect the picture URLs of a gallery page into a ``MeizituItem``.

     Example page: http://www.meizitu.com/a/5336.html
     """
     loader = ItemLoader(item=MeizituItem(), response=response)
     # Identity() is passed as the processor for the extracted sources.
     loader.add_xpath(
         'image_urls',
         "//div[@id='picture']/p/img/@src",
         Identity(),
     )
     loader.add_value('url', response.url)
     return loader.load_item()
Example #4
0
    def parse_by_product(self, response):
        """
        For the 'Bundles' category, grab the product details for the first
        product listed.

        Yields a single ``VisionsProduct`` item holding the product name, the
        sale price, a fixed availability label and a category built from the
        response URL. The selector, the matched nodes, the XPath table and the
        category are also stored on the spider instance.
        """
        self.selector = Selector(response)
        self.results = self.selector.xpath('//*[@id="ctl00_tdMainPanel"]')
        loader = ItemLoader(item=VisionsProduct(),
                            selector=self.results[0])

        # XPaths are relative to the main panel node selected above.
        self.field_xpaths = {
            'product': ('div[contains(@class, "catalogueTitle")]'
                        '/h3/text()'),
            'price': ('div[@id="ctl00_ContentPlaceHolder1_pnl'
                      'Bundle"]/div[@id="divProductDetails"]/div'
                      '[contains(@class, "priceAddToCart")]/div[1]/span'
                      '[contains(@id, "SalePrice")]/text()')
        }

        # Extract and load product details
        loader.add_xpath('product', self.field_xpaths['product'])
        # Raw string: the backslash escapes belong to the regex, not to the
        # Python string literal (non-raw "\$" / "\d" are invalid escapes).
        loader.add_xpath('price', self.field_xpaths['price'],
                         re=r'\$[\d]*[,]*[\d]*\.[\d]*')
        loader.add_value('availability', 'Not Limited/Clearance Item')

        # Because it's an individual product page, manually set the category
        # from the fifth '/'-separated piece of the URL.
        self.category = '/'.join(['Home', response.url.split('/')[4]])
        loader.add_value('category', self.category)

        yield loader.load_item()
Example #5
0
 def get_app(self, response):
     """Scrape one app page into a ``PlayStoreItems`` item and yield it.

     Most fields are plain CSS extractions; the two ``*_url`` fields are
     additionally joined with the URL of the response.
     """
     def to_absolute(urls):
         # Join every extracted href with the address of the page.
         return [urljoin(response.url, url) for url in urls]

     loader = ItemLoader(item=PlayStoreItems(), response=response)

     for field, css in (
             ('app_id', '.details-wrapper::attr(data-docid)'),
             ('name', '.document-title div::text'),
             ('category', '.category span::text')):
         loader.add_css(field, css)

     loader.add_css('category_url', '.category::attr(href)',
                    Compose(to_absolute))

     for field, css in (
             ('price', '.details-actions .price span::text'),
             ('offers_in_app_purchases', '.inapp-msg::text'),
             ('stars_count', '.stars-count::text'),
             ('video', '.details-trailer > span::attr(data-video-url)'),
             ('screenshots', '.screenshot::attr(src)')):
         loader.add_css(field, css)

     loader.add_xpath(
         'description',
         '//div[contains(@class, "show-more-content")]/div//text()')

     for field, css in (
             ('update_date', '[itemprop="datePublished"]::text'),
             ('file_size', '[itemprop="fileSize"]::text'),
             ('installs', '[itemprop="numDownloads"]::text'),
             ('current_version', '[itemprop="softwareVersion"]::text'),
             ('requires_android', '[itemprop="operatingSystems"]::text'),
             ('offered_by', '[itemprop="author"] > a span::text')):
         loader.add_css(field, css)

     loader.add_css('offered_by_url', '[itemprop="author"] > a::attr(href)',
                    Compose(to_absolute))

     yield loader.load_item()
Example #6
0
    def parse_list(self, response):
        """Yield the sellers and commodities of a listing page, then follow on.

        Increments ``self.page``, emits one ``Seller`` item per node matched
        by ``Seller.base_xpath`` and one ``Commodity`` item per node matched
        by ``Commodity.base_xpath`` (each tagged with the page number and a
        ``flag`` naming its kind), and finally yields a request for the next
        page when ``self.next_page_xpath`` matches.
        """
        sel = Selector(response)
        self.page += 1

        # Sellers and commodities are loaded the same way; only the item
        # class and the flag value differ.
        for item_cls, flag in ((Seller, 'Seller'), (Commodity, 'Commodity')):
            for node in sel.xpath(item_cls.base_xpath):
                loader = ItemLoader(item_cls(), selector=node)
                loader.add_value('page', self.page)
                loader.add_value('flag', flag)
                # .items() works on both Python 2 and Python 3 mappings.
                for field, xpath in item_cls.item_fields.items():
                    loader.add_xpath(field, xpath)
                yield loader.load_item()

        # Next page: evaluate the XPath once and reuse the extracted value.
        next_links = sel.xpath(self.next_page_xpath).extract()
        if next_links:
            yield Request("http://spu.taobao.com/spu/3c/detail.htm" +
                          next_links[0],
                          callback=self.parse_list)
Example #7
0
    def parse_book_url(self, response):
        """Read the book id and download path from the page scripts.

        Yields a POST request to ``self.base_url`` + the download path,
        handled by ``self.get_book_link``, with a form-encoded body carrying
        the book id and the book type.
        """
        details = BookDetails(book_id="", book_type="pdf")
        loader = ItemLoader(item=details, response=response)

        script_text = "/*//script/text()"
        loader.add_xpath("book_id", script_text,
                         re=r'bookId\s*:\s*(.*),.*')
        loader.add_xpath("book_path", script_text,
                         re=r'getDownloadUrl\s*:\s*\"(.*)\".*')
        # The return value is not needed: the loaded values are read back
        # from ``details`` just below.
        loader.load_item()

        target = self.base_url + details['book_path'][0]
        form_body = "&".join([
            "book_id=" + details['book_id'][0],
            "type=" + details['book_type'],
        ])

        request_headers = {
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "User-Agent": "Mozilla/5.0",
        }

        yield Request(target,
                      callback=self.get_book_link,
                      headers=request_headers,
                      method='POST',
                      body=form_body)
Example #8
0
File: Xs.py Project: kevking/xs
 def parse(self, response):
     """Yield an ``XsContentItem`` with a chapter's title and body text."""
     loader = ItemLoader(item=XsContentItem(), response=response)
     xpath_fields = (
         # Chapter title
         ('title', '//*[@class="bookname"]/h1/text()'),
         # Main text
         ('text', '//*[@id="content"]/text()'),
     )
     for field_name, query in xpath_fields:
         loader.add_xpath(field_name, query)
     yield loader.load_item()
Example #9
0
    def parse(self, response):
        """Yield a ``YoutubeVideo`` item for every entry of the channel grid.

        Each item carries the ``href`` of the link found in the entry's
        ``h3`` heading.
        """
        entries = response.css("ul#channels-browse-content-grid > li")
        for entry in entries:
            video = ItemLoader(YoutubeVideo(), selector=entry)
            video.add_xpath('link', './/h3/a/@href')
            yield video.load_item()
Example #10
0
 def parse(self, response):
     """Return a ``JianshuArticleItem`` with the paragraph texts and the URL."""
     paragraphs = '//div[@class="article"]/div[@class="show-content"]/p/text()'
     article = ItemLoader(item=JianshuArticleItem(), response=response)
     article.add_xpath('content', paragraphs)
     article.add_value('url', response.url)
     return article.load_item()
Example #11
0
 def parsePage(self, response):
     """Return a ``RentItem`` for a listing page.

     The ``id`` is the spider name, a dash, and the last path segment of
     the URL cut at its first dot.
     """
     page_url = response.url
     loader = ItemLoader(item=RentItem(), response=response)

     last_segment = page_url.split('/')[-1]
     listing_key = last_segment.split('.')[0]
     loader.add_value('id', self.name + '-' + listing_key)

     loader.add_value('link', page_url)
     loader.add_xpath('title', "//dl[@class = 'title']/dt/p/text()")
     return loader.load_item()
Example #12
0
    def parse(self, response):
        """Store one article page in MongoDB and download its images.

        Loads the page title and the HTML of the ``body textStyle`` div into
        a ``TutorialItem``, hands the raw HTML to ``DownFile`` to fetch the
        images, strips the tags from the HTML and inserts the resulting
        ``{'title', 'content'}`` document into the ``admin10000`` table of
        the ``articles`` database. Pages without a title or content are
        logged and skipped instead of raising ``KeyError``.
        """
        loader = ItemLoader(item=TutorialItem(), response=response)
        loader.add_xpath('title', "//title/text()")
        loader.add_xpath('content', "//div[@class='body textStyle']")
        data = loader.load_item()

        # A field whose XPath matched nothing may be absent or empty; bail
        # out before indexing into it.
        if not data.get('title') or not data.get('content'):
            self.log('Skip %s: title or content not found' % response.url)
            return

        raw_html = data['content'][0]

        # Images are downloaded from the unstripped HTML.
        downFile = DownFile(raw_html, 'http://www.admin10000.com')
        downFile.downImgFile()

        mongo = Mongo("articles")
        mongo.setTable("admin10000")

        # Strip the tags before storing the text.
        strip = StripTags()
        content = strip.filterTags(raw_html)

        article = {'title': data['title'][0], 'content': content}
        mongo.add(article)
Example #13
0
 def parse(self, response):
     """Return a ``MyItem`` with the titles of the carousel song-list links."""
     # Same XPath as one string, split step by step for readability.
     songlist_titles = (
         '//div[@class="carousel"]'
         '/div[@class="songlist-slides slide-page"]'
         '/ul[@class="list-songlist slide-item"]'
         '/li[@class="songlist-item"]'
         '/a[@class="lnk-songlist"]'
         '/@title'
     )
     loader = ItemLoader(item=MyItem(), response=response)
     loader.add_xpath("title", songlist_titles)
     return loader.load_item()
Example #14
0
    def parse_item(self, response):
        """Yield one ``SkatEngItem`` per player row of the stats table.

        Every item carries the player's NHL number, the season
        (``self.year``) and the ``en_goals`` / ``ps_goals`` cells.
        """
        rows = Selector(response).xpath(
            '/html//div[@class="table-container"]/table/tbody/tr')

        # Seasons after 2005 are read from columns 20/21 instead of 21/22
        # (original note: adjust for shootout stats).
        if self.year > 2005:
            en_goals_xpath = ".//td[20]/text()"
            ps_goals_xpath = ".//td[21]/text()"
        else:
            en_goals_xpath = ".//td[21]/text()"
            ps_goals_xpath = ".//td[22]/text()"

        for row in rows:
            stats = ItemLoader(SkatEngItem(), selector=row)
            stats.default_input_processor = MapCompose()
            stats.default_output_processor = Join()

            # The last seven characters of the player's page URL are used
            # as the unique NHL ID number.
            player_href = row.xpath("td[2]/a/@href").extract()[0]
            stats.add_value("nhl_num", player_href[-7:])

            stats.add_value("season", str(self.year))

            stats.add_xpath("en_goals", en_goals_xpath)
            stats.add_xpath("ps_goals", ps_goals_xpath)

            # Hand the item to the pipeline.
            yield stats.load_item()
Example #15
0
    def parse_review(self, response):
        """Yield one ``YelpReview`` per review block found on the page.

        Pages rejected by ``self._is_right_category`` are logged and skipped.
        Otherwise a master item with the crawl date, the page URL and the
        page-level fields is built, and every review gets a copy of it
        extended with the review-level fields.
        """
        page = Selector(response)

        wanted = self._is_right_category(page)
        message = 'Parse URL: %s' if wanted else 'Skip URL: %s'
        self.log(message % response.url, level=log.INFO)
        if not wanted:
            return

        page_loader = ItemLoader(item=YelpReview(), selector=page)
        page_loader.add_value('crawl_date', str(datetime.utcnow()))
        page_loader.add_value('page_url', response.url)

        # Page-level fields shared by every review of this page.
        for name, xpath in self._item_selectors.iteritems():
            page_loader.add_xpath(name, xpath)
        master_review = page_loader.load_item()

        reviews = page.xpath(
            '//div[contains(@class, "review")][@itemprop="review"]')
        for review_node in reviews:
            # Start each review from a copy of the master item.
            review_loader = ItemLoader(item=master_review.copy(),
                                       selector=review_node)
            for name, xpath in self._review_selectors.iteritems():
                review_loader.add_xpath(name, xpath)
            yield review_loader.load_item()
Example #16
0
    def parse_item(self, response):
        """Yield a ``ProxyHunterItem`` for every row of the ``#basic`` table."""
        column_xpaths = (
            ('ip', 'td[2]/a/text()'),
            ('port', 'td[3]/text()'),
            ('prot', 'td[4]/a/text()'),
        )
        for row in response.xpath('//table[@id="basic"]/tbody/tr'):
            proxy = ItemLoader(ProxyHunterItem(), selector=row)
            for field_name, query in column_xpaths:
                proxy.add_xpath(field_name, query)
            yield proxy.load_item()
Example #17
0
    def parse_item(self, response):
        """Return a ``MeizituItem`` with the image sources and the page URL."""
        loader = ItemLoader(item=MeizituItem(), response=response)
        # Identity() is passed as the processor for the extracted sources.
        loader.add_xpath(
            'img_url',
            '//div[@class="text"]/p/br/img/@src',
            Identity(),
        )
        loader.add_value('url', response.url)
        return loader.load_item()
 def parse_item(self, response):
     """Return a ``CrawlpictureItem`` with name, tags, image URLs and URL."""
     picture = ItemLoader(item=CrawlpictureItem(), response=response)
     picture.add_xpath('name', '//h2/a/text()')
     picture.add_css('tags', 'div.metaRight p::text')
     # Images are taken from every <img> inside the post content.
     picture.add_css(
         'image_urls',
         'div.postContent img::attr(src)',
         Identity(),
     )
     picture.add_value('url', response.url)
     return picture.load_item()
Example #19
0
    def parse_item(self, response):
        """Return an ``ImgDownloadItem`` for an article page.

        Logs the URL being parsed, then loads the article title, the image
        sources found in the article content and the page URL.
        """
        page_url = response.url
        self.logger.info("parse_item url %s" % page_url)

        article = ItemLoader(item=ImgDownloadItem(), response=response)
        article.add_xpath('name', '//h1[@class="article-title"]/a/text()')
        article.add_xpath(
            'image_urls',
            "//article[@class='article-content']/p/img/@src",
            Identity(),
        )
        article.add_value('url', page_url)
        return article.load_item()
Example #20
0
 def parse_item(self, response):
     """Return a ``CoserItem`` with name, info, image URLs and page URL."""
     loader = ItemLoader(item=CoserItem(), response=response)
     loader.add_xpath('name', "//h1[@class='js-post-title']/text()")
     loader.add_xpath(
         'info',
         "//div[@class='post__info']/div[@class='post__type post__info-group']/span/text()")

     # Remove the '/w650' part from every image source before storing it.
     sources = loader.get_xpath(
         '//img[@class="detail_std detail_clickable"]/@src')
     loader.add_value(
         'image_urls', [src.replace('/w650', '') for src in sources])

     loader.add_value('url', response.url)
     return loader.load_item()
Example #21
0
 def parse_final(self, response):
     """Return a ``MeiziScrapyItem`` with the image name and image URL."""
     xpath_fields = (
         ('image_name', '/html/body/div[2]/div[1]/h2/text()'),
         ('image_url', '//*[@id="content"]/a/img/@src'),
     )
     loader = ItemLoader(item=MeiziScrapyItem(), response=response)
     for field_name, query in xpath_fields:
         loader.add_xpath(field_name, query)
     return loader.load_item()
    def parse_template(self, response):
        """
        Callback used by Scrapy to process downloaded responses.

        Yields one ``WrapBootstrapTemplate`` item. Reference XPath kept from
        the original notes:
        //*[@id="page_theme"]/div[2]/div/div/div/div[2]/div[4]/table/tbody/tr[10]/td[2]
        """
        # When the page mentions coffee the pricing divs are shifted down by
        # one, so the price block is div[3] instead of div[2].
        has_coffee = 'cups of coffee' in response.body_as_unicode()
        price_div = 3 if has_coffee else 2

        # Value cell of the info-table row whose key cell reads "<label>:".
        prop_xpath = '//div[@class="info_wrapper"]//tr[td[@class="key"]/strong/text() = "{}:"]/td[@class="value"]/text()'
        # Whitespace-normalised text that follows a given prefix.
        substr_xpath = 'substring-after(normalize-space({}), "{}")'

        def price_after_dollar(price_xpath):
            # Fill in the price block index, then keep what follows the '$'.
            return substr_xpath.format(price_xpath.format(price_div), '$')

        item_fields = {
            'item_hash': '//*[@id="offer_sku"]/text()',
            'title': '//*[@id="thing_name"]/text()',
            'thumbnail': '//*[@id="thing_image"]/@src',
            'description': '//*[@id="description"]',
            'creator': '//*[@id="product_manufacturer"]/text()',
            'when': prop_xpath.format('Released'),
            'bootstrap_version': substr_xpath.format(
                prop_xpath.format('Bootstrap'), 'Compatible with '),
            'cost_single': price_after_dollar(
                '//*[@id="page_theme"]/div[2]/div/div/div/div[2]/div[{}]/div[1]//span/text()'),
            'cost_multiple': price_after_dollar(
                '//*[@id="page_theme"]/div[2]/div/div/div/div[2]/div[{}]/div[2]//span/text()'),
            'cost_extended': price_after_dollar(
                '//*[@id="page_theme"]/div[2]/div/div/div/div[2]/div[{}]/div[3]//span/text()'),
            'purchases': '//div[@class="purchases"]/span[@class="count"]/text()',
        }

        template = ItemLoader(WrapBootstrapTemplate(),
                              selector=Selector(response))

        # Strip every extracted value and join the pieces of each field.
        template.default_input_processor = MapCompose(unicode.strip)
        template.default_output_processor = Join()

        for name, query in item_fields.iteritems():
            template.add_xpath(name, query)
        yield template.load_item()
Example #23
0
 def parse(self, response):
     """Return a ``CoserItem`` built from a post page."""
     post = ItemLoader(item=CoserItem(), response=response)

     text_fields = (
         ('name', "//h1[@class='js-post-title']/text()"),
         ('info', "//div[@class='post__info']/div[@class='post__type post__info-group']/span/text()"),
     )
     for field_name, query in text_fields:
         post.add_xpath(field_name, query)

     # Image sources are stored without their '/w650' part.
     image_urls = []
     for src in post.get_xpath('//img[@class="detail_std detail_clickable"]/@src'):
         image_urls.append(src.replace('/w650', ''))
     post.add_value('image_urls', image_urls)

     post.add_value('url', response.url)
     return post.load_item()
Example #24
0
    def parse(self, response):
        """Yield an ``Articulos`` item per matched node, numbered from 0."""
        nodes = Selector(response).xpath(
            '/html/body/div[2]/div/div/div/div[1]/div[3]/div')

        for position, node in enumerate(nodes):
            # The node is passed as the loader's selector, so the XPath
            # below is relative to it.
            loader = ItemLoader(Articulos(), node)
            loader.add_xpath('title', './/h3/text()')
            loader.add_value('id', position)
            yield loader.load_item()
Example #25
0
    def parse(self, response):
        """Yield a ``ProxyHunterItem`` for each six-cell row of the proxy table.

        The protocol is always ``'http'``; IP and port come from the first
        and second cells, passed through ``TakeFirst``, ``remove_tags`` and
        ``unicode.strip``.
        """
        six_cell_rows = response.xpath(
            '//table[@id="tbl_proxy_list"]//tr[count(td)=6]')
        cell_fields = (('ip', 'td[1]'), ('port', 'td[2]'))

        for row in six_cell_rows:
            proxy = ItemLoader(ProxyHunterItem(), selector=row)
            proxy.add_value('prot', 'http')
            for field_name, cell in cell_fields:
                proxy.add_xpath(field_name, cell,
                                TakeFirst(), remove_tags, unicode.strip)
            yield proxy.load_item()
Example #26
0
 def parse_item2(self, response):
     """Yield a ``DmozItem`` whose fields each collect from two sources.

     ``type`` receives the last breadcrumb link text and the question
     heading; ``answer`` receives the answer heading plus a literal value.
     """
     breadcrumb_tail = (
         '//div[@class="location ask_main_location"]'
         '/span[@class="fl"]/a[last()]/text()'
     )
     qa = ItemLoader(item=DmozItem(), response=response)
     qa.add_xpath('type', breadcrumb_tail)
     qa.add_xpath('type', '//div[@class="question"]/h2/text()')
     qa.add_xpath('answer', '//div[@class="anwser"]/h2/text()')
     # Literal value added next to the scraped answer text.
     qa.add_value('answer', '牛逼')
     yield qa.load_item()
Example #27
0
    def parse(self, response):
        """Yield a ``Pregunta`` item per question block, numbered from 0."""
        # The XPath result is a list of selectors, one per question block.
        question_nodes = Selector(response).xpath(
            '//div[@id="question-mini-list"]/div')

        for position, node in enumerate(question_nodes):
            # Each node serves as the loader's selector.
            question = ItemLoader(Pregunta(), node)
            question.add_xpath('pregunta', './/h3/a/text()')
            question.add_value('id', position)
            yield question.load_item()
Example #28
0
def Loader_index(self, item_selector):
    """Load title, URL, preview, date and image URLs from one list entry.

    The item is a plain dict; the cover image sources are stored under both
    ``preview`` and ``image_urls``.
    """
    loader = ItemLoader(item={}, selector=item_selector)

    cover_sources = loader.get_xpath('.//*[@class="lz_img"]/img/@src')

    loader.add_xpath('title',
                     './/*[@class="k_list-lb-2"]/div[1]/a[1]/text()')
    loader.add_xpath('url',
                     './/*[@class="k_list-lb-2"]/div[1]/a/@href')
    loader.add_value('preview', cover_sources)
    # Only the YYYY-MM-DD part of the text is kept.
    loader.add_css('date', '#k_list-lb-2-f::text',
                   re=r'(\d{4}-\d{2}-\d{2})')
    loader.add_value('image_urls', cover_sources)
    return loader.load_item()
Example #29
0
 def parse_item(self, response):
     """Yield an ``SSDItem`` for every ``table.item`` block of the page."""
     relative_xpaths = (
         ('name', './/td[@class="l"]/a/text()'),
         ('usual_price', './/div[@class="price"]/text()'),
         ('club_price', './/div[@class="price club_price"]/text()'),
         ('img_url', './/div[@class="photo"]/span/img/@src'),
         ('url', './/td[@class="l"]/a/@href'),
     )

     for table in response.xpath('//table[@class="item"]'):
         product = ItemLoader(item=SSDItem(), selector=table)
         for field_name, query in relative_xpaths:
             product.add_xpath(field_name, query)
         yield product.load_item()
Example #30
0
    def parse(self, response):
        """Yield a ``Lugar`` item for every child div of ``#hotellist_inner``.

        Each item carries the text of the heading link as ``lugar`` and its
        position in the list (starting at 0) as ``id``.
        """
        sel = Selector(response)
        # The id must be a quoted string literal: unquoted, XPath reads
        # ``hotellist_inner`` as a child element name and never matches.
        lugares = sel.xpath('//div[@id="hotellist_inner"]/div')

        # Iterate over every matched entry.
        for i, elem in enumerate(lugares):
            item = ItemLoader(Lugar(), elem)
            item.add_xpath('lugar', './/h3/a/span/text()')
            item.add_value('id', i)
            yield item.load_item()
Example #31
0
    def parse(self, response):
        """Yield an ``RtdItem`` for every table row of a schedule page.

        The texts of the first and second active schedule buttons are used
        as ``direction`` and ``day`` for all rows.
        """
        direction = response.xpath(
            '//li[@class="btn-schedules-active"][1]/text()').extract()
        day = response.xpath(
            '//li[@class="btn-schedules-active"][2]/text()').extract()

        row_xpaths = (
            ('route', 'th/a/text()'),
            ('depart_time', 'td[1]/text()'),
            ('arrive_time', 'td[2]/text()'),
        )

        for row in response.xpath('//tr'):
            trip = ItemLoader(item=RtdItem(), selector=row)
            trip.default_output_processor = TakeFirst()

            trip.add_value('day', day)
            trip.add_value('direction', direction)
            for field_name, query in row_xpaths:
                trip.add_xpath(field_name, query)
            yield trip.load_item()
Example #32
0
    def parse_detail(self, response):
        """Return a ``Meizitu2Item`` loaded from a detail page.

        Prints the URL being processed, then loads the heading link text,
        the image sources under ``#picture`` and the page URL.
        """
        print("response.url===", response.url)

        # Build the item through the ItemLoader class.
        loader = ItemLoader(item=Meizitu2Item(), response=response)
        # NOTE(review): the field name "tilte" looks like a typo of "title";
        # confirm against Meizitu2Item before renaming.
        loader.add_xpath("tilte", "//h2/a/text()")
        loader.add_xpath("image_urls", '//div[@id="picture"]//img/@src')
        # A literal value is added with add_value.
        loader.add_value("url", response.url)

        return loader.load_item()
Example #33
0
    def parse_content(self, response):
        """Collect the goods data of a product page and request its images.

        Fills an ``AlibbItem`` loader with the URL and its SHA-1 hash, the
        name, the SKU properties/attributes parsed from the ``iDetailData``
        script blob, the price, the parameter pairs and the booth image
        URLs. For every ``data-tfs-url`` of the lazy-load container a
        request to ``self.parse_content_down`` is yielded, carrying the
        loader in ``meta['item']``.
        """
        goods_loader = ItemLoader(item=AlibbItem(), response=response)
        url = str(response.url)
        goods_loader.add_value('url', url)
        goods_loader.add_value('url_hash', hashlib.sha1(url).hexdigest())
        goods_loader.add_xpath('name', self._x_query['title'].encode('utf-8'))

        # detail data: cut the "iDetailData = {...};" assignment out of the
        # script text and reduce it to the bare "{...}" object.
        iDetailDataPattern = re.compile("iDetailData.*};", re.DOTALL)
        detail_data_list = response.xpath('//script').re(iDetailDataPattern)
        detail_data = detail_data_list[0].replace("iDetailData = {", "{")
        detail_data = detail_data.replace("};", "}")
        # NOTE(review): str.replace() is literal, so this only removes the
        # exact text TAB '|' NEWLINE '|' BACKSLASH and is effectively a
        # no-op. Left unchanged: stripping every backslash would corrupt
        # JSON escape sequences.
        detail_data = detail_data.replace("\t|\n|\\", "")

        detail_data_json = json.loads(detail_data)
        if detail_data_json:
            properties = detail_data_json['sku']['skuMap'].keys()
            goods_loader.add_value(
                'properties',
                [sku_key.replace(">", ",") for sku_key in properties])

            for attribute in detail_data_json['sku']['skuProps']:
                attributes = {}
                options = [value['name'] for value in attribute['value']]
                attributes['name'] = attribute['prop']
                attributes['options'] = options
                goods_loader.add_value('attributes', attributes)
        else:
            goods_loader.add_value('attributes', "")

        # Raw string so that "\d" reaches the XPath regex untouched.
        price = response.xpath(
            r'//span[re:test(@class,"value price-length-\d$")]/text()').extract()
        # Fall back to the SKU price when the page shows no price span.
        goods_loader.add_value(
            'price',
            price[0] if len(price) > 0 else detail_data_json['sku']['price'])

        # detail information: the extracted texts are paired up two by two.
        detail_info_list = response.xpath(self._x_query['detail_info']).extract()
        goods_loader.add_value(
            'parameters',
            [list(info_list) for info_list in
             zip(detail_info_list[::2], detail_info_list[1::2])])
        # Single-argument print() behaves the same on Python 2 and 3.
        print(goods_loader.load_item()['url'])

        # profile img
        profile_img_urls = response.xpath('//li/@data-imgs').re("original.*jpg")

        for urls in profile_img_urls:
            profile_img_url = urls.replace("original\":\"http", "http")
            goods_loader.add_value("boothes", profile_img_url)

        # big img
        for link in response.xpath(
                '//*[@id="desc-lazyload-container"]/@data-tfs-url').extract():
            yield Request(url=link, meta={'item': goods_loader},
                          callback=self.parse_content_down)
Example #34
0
 def parse_item(self, response):
     """Yield a ``GoalBioItem`` with biographical data for every table row.

     Each item carries the NHL number, first and last name, birthday
     (``YYYY-MM-DD``), the fixed position ``'G'`` and the draft cells.
     """
     rows = Selector(response).xpath(
         '/html//div[@class="table-container"]/table/tbody/tr')

     # Three-letter month abbreviation -> two-digit month number.
     month_names = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                    'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
     month_numbers = {}
     for number, abbreviation in enumerate(month_names, 1):
         month_numbers[abbreviation] = '%02d' % number

     for row in rows:
         bio = ItemLoader(GoalBioItem(), selector=row)
         bio.default_input_processor = MapCompose()
         bio.default_output_processor = Join()

         # The last seven characters of the player's page URL are used as
         # the unique NHL ID number.
         player_href = row.xpath('td[2]/a/@href').extract()[0]
         bio.add_value('nhl_num', player_href[-7:])

         # Split the name at the first space into first / last name.
         full_name = row.xpath('td[2]/a/text()').extract()[0]
         name_parts = full_name.split(' ', 1)
         bio.add_value('first_name', name_parts[0])
         bio.add_value('last_name', name_parts[1])

         # Birth date cell: month abbreviation first, day at [4:6], the
         # two-digit year last.
         # NOTE(review): the century is hard-coded to "19" - confirm this
         # is intended for players born in 2000 or later.
         born = row.xpath('td[4]/text()').extract()[0]
         birth_year = "19" + born[-2:]
         birth_month = month_numbers[born[:3]]
         birth_day = born[4:6]
         bio.add_value(
             'birthday',
             "%s-%s-%s" % (birth_year, birth_month, birth_day))

         bio.add_value('position', 'G')
         bio.add_xpath('draft_year', './/td[12]/text()')
         bio.add_xpath('draft_position', './/td[14]/text()')

         # Hand the item to the pipeline.
         yield bio.load_item()
Example #35
0
    def parse_item(self, response):
        """Yield one ``SkatPMItem`` per player row of the stats table.

        Every item carries the NHL number, the season (``self.year``) and
        the four team goal cells (columns 14 to 17).
        """
        rows = Selector(response).xpath(
            '/html//div[@class="table-container"]/table/tbody/tr')

        stat_columns = (
            ("team_goals_for", ".//td[14]/text()"),
            ("team_pp_goals_for", ".//td[15]/text()"),
            ("team_goals_against", ".//td[16]/text()"),
            ("team_pp_goals_against", ".//td[17]/text()"),
        )

        for row in rows:
            stats = ItemLoader(SkatPMItem(), selector=row)
            stats.default_input_processor = MapCompose()
            stats.default_output_processor = Join()

            # The last seven characters of the player's page URL are used
            # as the unique NHL ID number.
            player_href = row.xpath("td[2]/a/@href").extract()[0]
            stats.add_value("nhl_num", player_href[-7:])

            stats.add_value("season", str(self.year))

            for field_name, query in stat_columns:
                stats.add_xpath(field_name, query)

            # Hand the item to the pipeline.
            yield stats.load_item()
Example #36
0
    def parse_item(self, response):
        """Return a ``MeizituItem`` with the picture links and the page URL."""
        # Load a MeizituItem through an ItemLoader.
        gallery = ItemLoader(item=MeizituItem(), response=response)

        # Image links
        picture_sources = "//div[@id='picture']/p/img/@src"
        gallery.add_xpath('image_urls', picture_sources, Identity())

        # Page URL
        gallery.add_value('url', response.url)

        return gallery.load_item()
Example #37
0
    def parse_parts2(self, response):
        """Yield a ``CarPart`` item for every valid row of the parts table.

        Logs the current time and the request's User-Agent, then loads shop,
        link, info and price data from each row plus the brand, model,
        car part and category found in ``response.meta``. Items whose
        ``is_valid()`` is false are dropped.
        """
        log.msg("\tparse_parts time: %s" % int(time.time()), level=log.DEBUG)
        user_agent = response.request.headers['User-Agent']
        log.msg("\tua: %s" % user_agent, level=log.DEBUG)

        meta_fields = ('brand', 'model', 'car_part', 'category')

        for row in response.css('table.parts > tbody > tr'):
            part = ItemLoader(item=CarPart(), selector=row)
            part.add_xpath('shop_city', "td[@class='shop']/a/text()")
            part.add_xpath('shop_name', "td[@class='shop']/a/strong/text()")

            # Both links are joined with the site's main URL.
            shop_href = part.get_xpath("td[@class='shop']/a/@href",
                                       TakeFirst())
            photo_href = part.get_xpath("td[@class='photo']/a/@href",
                                        TakeFirst())
            part.add_value('shop_url', urljoin(self.main_url, shop_href))
            part.add_value('ext_link', urljoin(self.main_url, photo_href))

            part.add_xpath('info', "td[@class='info']//text()")
            part.add_xpath('price', "td[@class='price']//text()")

            # Values handed over through the request meta.
            for key in meta_fields:
                part.add_value(key, response.meta.get(key))

            loaded = part.load_item()
            if not loaded.is_valid():
                continue
            yield loaded
Example #38
0
 def parse_item(self, response):
     """Yield a ``CoserItem`` with name, info, page URL and image URLs."""
     post = ItemLoader(item=CoserItem(), response=response)
     post.add_xpath('name', "//div[@class='mb10 dib']/a/text()")
     post.add_xpath('info', "//div/p[@class='mb20']/text()")
     post.add_value('url', response.url)

     # Original note: the XPath
     # //div[@class='content-img-wrap']//img/@src could not grab the
     # images, so a regex is run on the loader's selector instead. The
     # capture group is the source up to the '/w650' part.
     image_sources = post.selector.re(r'src="(.+?.jpg)/w650')
     post.add_value('image_urls', image_sources)

     yield post.load_item()
Example #39
0
    def parse(self, response):
        """Load the headlines and dump an ASCII copy to ``<today>.json``.

        Collects the top-news and section-news link texts into a
        ``timeItem``, writes ``{'date', 'topnews', 'sectionnews'}`` (each
        headline reduced to ASCII and stripped) to a JSON file named after
        today's date in the working directory, and returns the loaded item.
        """
        l = ItemLoader(item=timeItem(), response=response)
        l.add_xpath('topnews', '//*[@id="article-container"]/div/div[1]/section/div/article[*]/div/h2/a/text()')
        l.add_xpath('topnews', '//*[@id="article-container"]/div/div[1]/section/div/article[1]/div/div/div[2]/div[*]/h3/a/text()')
        l.add_xpath('sectionnews', '//a[contains(@class,"home-columnists-title")]/text()')
        l.add_xpath('sectionnews', '//a[contains(@data-event,"hp-news")]/text()')
        item = l.load_item()

        def to_ascii(text):
            # Drop non-ASCII characters; decoding again keeps this a text
            # string on Python 3 instead of the repr of a bytes object.
            return text.encode('ascii', 'ignore').decode('ascii').strip()

        today = datetime.date.today()
        nytdict = dict()
        nytdict['date'] = str(today)
        # .get() tolerates a field for which nothing was extracted.
        nytdict['topnews'] = [to_ascii(t) for t in item.get('topnews', [])]
        nytdict['sectionnews'] = [
            to_ascii(t) for t in item.get('sectionnews', [])]

        # The context manager guarantees the file is flushed and closed.
        with open('{}.json'.format(today), 'w') as f:
            json.dump(nytdict, f)

        return item
Example #40
0
 def parse(self, response):
     """Return a list of ``ProductItem`` objects, one per matched link.

     Every loaded item is passed through ``self.to_utf8`` for the four
     loaded fields and its name is logged at INFO level.
     """
     field_names = ('name', 'price', 'stock', 'last_updated')
     collected = []

     for anchor in response.xpath('//ul/li/strong/a'):
         product = ItemLoader(ProductItem(), anchor)
         product.default_input_processor = MapCompose(unicode.strip)
         product.default_output_processor = Join()

         product.add_xpath('name', 'text()')
         product.add_xpath('price', '@href')
         product.add_xpath('stock', '@mon')
         # Literal values can be loaded as well.
         product.add_value('last_updated', 'today')

         converted = self.to_utf8(product.load_item(), *field_names)
         self.log(converted['name'], log.INFO)
         collected.append(converted)

     return collected
    def parse_item(self, response):
        """Return a ``UKBusinessItem`` loaded from a business page.

        This method will not populate such fields:
        locality, mobile_number, country, email
        """
        business = ItemLoader(item=UKBusinessItem(), response=response)

        business.add_value('url', unicode(response.url))
        business.add_xpath('name', '//h3[@class="biz"]/text()')
        business.add_xpath('category', '//div[@id="breadcrumbs"]/a[2]/text()')

        # Contact entries: the bold label decides which field an entry
        # fills.
        for entry in response.xpath('//ul[@class="bcon"]/li'):
            label = cond_set_value(entry.xpath('.//b/text()').extract())
            if label == 'Tel:':
                business.add_value(
                    'phone_number',
                    cond_set_value(entry.xpath('text()').extract()))
            elif label == 'Web:':
                business.add_value(
                    'website',
                    cond_set_value(entry.xpath('.//a/text()').extract()))
            elif label == 'Fax:':
                business.add_value(
                    'fax_number',
                    cond_set_value(entry.xpath('text()').extract()))

        # Address lines: the last one is stored as the postal code, the
        # others are joined into the address.
        address_lines = response.xpath('//ul[@class="bad"]/li/text()').extract()
        if address_lines:
            business.add_value('address', u', '.join(address_lines[:-1]))
            business.add_value('postal_code', address_lines[-1])

        business.add_xpath('latitude', '//div[@id="lat"]/text()')
        business.add_xpath('longitude', '//div[@id="lng"]/text()')

        return business.load_item()
Example #42
0
    def parse_product(self, response):
        """Build a ``BillionPricesIndiaItem`` from a product detail page.

        The price text is looked up in the 7th, 6th, 5th and 4th ``div`` of
        the product detail view, in that order; the first non-empty result is
        used. Name, quantity, category and product come from the loader; the
        URL, price, vendor, location and today's date are set directly.
        """
        price_xpath = ('//*[@id="product_detail_view_1"]/div/div[%d]'
                       '/div[2]/span[2]/text()')
        page = Selector(response)

        price = []
        for position in (7, 6, 5, 4):
            price = page.xpath(price_xpath % position).extract()
            if price:
                break

        loader = ItemLoader(item=BillionPricesIndiaItem(), response=response)
        for field, xpath in (
                ('product_name', '//*[@id="inner"]/div[1]/div[1]/div/div/text()'),
                ('quantity', '//*[@id="product_detail_view_1"]/div/div[1]/div/text()'),
                ('category', '//*[@id="inner"]/div[1]/div[1]/div/a[1]/text()'),
                ('product', '//*[@id="inner"]/div[1]/div[1]/div/a[2]/text()')):
            loader.add_xpath(field, xpath)

        product = loader.load_item()
        product['product_url'] = response.url
        product['price'] = price
        product['vendor'] = 'Local Banya'
        product['city'] = 'Mumbai'
        product['state'] = 'Maharashtra'
        product['country'] = 'India'
        product['date'] = str(time.strftime("%d/%m/%Y"))
        return product
Example #43
0
	def parse(self, response):
		"""Yield one ``AmazonItem`` for every node matched by ``self.view``.

		Each loader strips its inputs and joins its outputs, then collects
		every field listed in ``self.iem_fields`` (field name -> XPath
		evaluated on the matched node).

		The earlier version referenced an undefined bare name ``view``,
		applied the XPath twice through the deprecated ``select`` method and
		returned after the first node; this version evaluates ``self.view``
		once and yields an item per node.
		"""
		# iterate over the nodes selected by the view XPath
		for page in response.selector.xpath(self.view):
			loader = ItemLoader(AmazonItem(), page)

			# define processors
			loader.default_input_processor = MapCompose(unicode.strip)
			loader.default_output_processor = Join()

			# iterate over fields and add xpaths to the loader
			for field, xpath in self.iem_fields.iteritems():
				loader.add_xpath(field, xpath)
			yield loader.load_item()
Example #44
0
    def parse_item(self, response):
        """Load a ``MzituItem`` from the main image of the page.

        ``image_urls`` takes the image ``src`` values and ``name`` takes the
        ``alt`` values, both passed through ``Identity``.
        """
        attribute_xpath = "//div[@class='main-image']/p/a/img/@%s"

        loader = ItemLoader(item=MzituItem(), response=response)
        for field, attribute in (('image_urls', 'src'), ('name', 'alt')):
            loader.add_xpath(field, attribute_xpath % attribute, Identity())
        return loader.load_item()
Example #45
0
 def parse_detail(self, response):
     """Complete the item carried in ``response.meta['item']``.

     Adds the link text as ``image_name`` and the ``data-original`` image
     paths, each prefixed with the site root, as ``image_urls``.
     """
     loader = ItemLoader(response.meta['item'], response)
     loader.add_xpath('image_name', '//span[@class="list_text"]/em/b/a/text()')
     relative_paths = response.xpath(
         '//span[@class="list_img"]/a/img/@data-original').extract()
     # Prefix every extracted path with the site root.
     loader.add_value(
         'image_urls',
         ['http://www.nh87.cn' + path for path in relative_paths])
     # Return the populated item.
     return loader.load_item()
Example #46
0
 def parse_item(self, response):
     """Yield a ``HabrahabrItem`` with title, image URLs and comments.

     Nothing is yielded when the ``content_left`` container is missing.
     Every comment block is wrapped in its own ``HabrahabrComment`` item.
     """
     content = response.xpath('.//div[@class="content_left"]')
     if not content:
         return

     def load_comment(markup):
         # One comment item per extracted block of markup.
         holder = ItemLoader(item=HabrahabrComment(), selector=content, response=response)
         holder.add_value('comment', markup)
         return holder.load_item()

     post = ItemLoader(item=HabrahabrItem(), selector=content, response=response)
     post.add_xpath('title', '//h1/span/text()')
     post.add_xpath('image_urls', '//div[@class="content html_format"]/img/@src')
     markups = content.xpath('//div[starts-with(@class, "message html_format")]').extract()
     post.add_value('comments', [load_comment(markup) for markup in markups])
     yield post.load_item()
Example #47
0
 def parse(self, response):
     """Yield one ``LiveItem`` per link in the ``sortdetail-container`` list.

     All XPaths are relative to the matched ``<a>`` element. The page-level
     loader the earlier version created but never used has been removed.
     """
     for content in response.xpath('//*[@id="sortdetail-container"]/li/a'):
         i = ItemLoader(item=LiveItem(), selector=content)
         # Title
         i.add_xpath('title', 'div[2]/span[1]/text()')
         # User name
         i.add_xpath('username', 'div[2]/span[2]/@title')
         # Popularity
         i.add_xpath('num', 'div[2]/span[4]/i/text()')
         # Image address
         i.add_xpath('pic_addr', 'div[1]/img/@data-original')
         # Relative address of the live room
         i.add_xpath('addr', '@href')
         # Streaming platform
         i.add_value('platform', 'panda')
         yield i.load_item()
    def parse_item(self, response):
        """Load a ``UKBusinessItem`` from a company details page.

        Fields not populated by this method:
        locality, country, mobile_number, category.

        Returns the loaded item, or ``None`` when no address text is found.
        Latitude and longitude are taken from the ``ll`` query parameter of
        the map iframe URL and stay empty strings when it cannot be read.
        """
        il = ItemLoader(item=UKBusinessItem(), response=response)
        il.add_value('url', unicode(response.url))
        il.add_xpath('name', '//div[@class="company_details"]/h3/text()')

        address_p = response.xpath(
            '//div[@class="fl company_details_inset1"]/p')
        address_first_part = [
            ad.strip().replace('\\n\\r', '')
            for ad in address_p.xpath('text()[normalize-space()]').extract()
        ]
        address_second_part = cond_set_value(
            address_p.xpath(
                './/a[contains(@href, "town")]/text()').extract(), '')
        address = ', '.join(address_first_part) + ', ' + address_second_part
        if address == ', ':
            # Neither address part was found: give up on this page.
            return
        il.add_value('address', address)

        postal_code = ''.join(
            address_p.xpath(
                './/a[not(contains(@href, "town"))]/text()').extract())
        il.add_value('postal_code', postal_code)

        il.add_xpath('website', './/div[@class="company-website "]/a/text()')
        il.add_xpath('fax_number', './/span[@class="company-fax"]/text()')
        il.add_xpath('phone_number', './/span[@class="company-phno"]/@phone')

        mail_a = response.xpath('.//a[@value="Contact us"]/text()').extract()
        try:
            mail_a.remove('Contact Us')
        except ValueError:
            # list.remove only raises when the placeholder text is absent.
            pass
        mail_a = [a.strip() for a in mail_a]
        mail = '@'.join(mail_a)
        il.add_value('email', mail)

        latitude = ''
        longitude = ''
        map_data = cond_set_value(response.xpath(
            './/p[@class="direction_map"]/iframe/@src').extract())
        if map_data:
            # parse_qs expects a bare query string. Given a whole URL it
            # folds everything before "?" into the first key, so "ll" was
            # missed whenever it was the first parameter. A value without
            # "?" is still parsed as-is, as before.
            query = map_data.split('?', 1)[-1]
            try:
                coordinates = parse_qs(query)['ll'][0].split(',')
                latitude = coordinates[0]
                longitude = coordinates[1]
            except (KeyError, IndexError):
                # No "ll" parameter, or it lacks a second coordinate.
                pass
        il.add_value('latitude', latitude)
        il.add_value('longitude', longitude)

        return il.load_item()
Example #49
0
    def parse(self, response):
        """
        Default callback used by Scrapy to process downloaded responses

        Yields one ``Match`` item per node selected by
        ``self.match_list_xpath``, followed by three requests for that
        match's asia odds pages (offsets 0, 30 and 60), each carrying the
        item in ``request.meta['match_item']``.

        Testing contracts:
        @url http://odds.500.com/index_jczq_2014-08-29.shtml

        """
        selector = Selector(response)

        # iterate over matches (``xpath`` replaces the deprecated ``select``)
        for match in selector.xpath(self.match_list_xpath):
            loader = ItemLoader(Match(), selector=match)

            # define processors
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()

            # iterate over fields and add xpaths to the loader
            for field, xpath in self.match_fields.iteritems():
                loader.add_xpath(field, xpath)

            match_item = loader.load_item()
            match_item["game_date"] = self.game_date
            # keep only the part after the last '-' of each id field
            for id_field in ("season_id", "teama_id", "teamb_id"):
                match_item[id_field] = match_item[id_field].split('-')[-1]
            if "score" in match_item:
                sa, sb = match_item["score"].split(':')
                match_item["score_a"] = sa
                match_item["score_b"] = sb
                # Compare numerically: as strings "10" sorts before "9".
                # Non-numeric scores fall back to the plain string comparison.
                try:
                    goals_a, goals_b = int(sa), int(sb)
                except ValueError:
                    goals_a, goals_b = sa, sb
                if goals_a > goals_b:
                    match_item["result"] = "win"
                elif goals_a == goals_b:
                    match_item["result"] = "draw"
                else:
                    match_item["result"] = "lost"
            else:
                match_item["score_a"] = match_item["score_b"] = -1
                match_item["result"] = "none"

            yield match_item

            #scrap asia odds
            #id=454359&ctype=1&start=60&r=1&style=0&guojia=0
            for i in xrange(3):
                url = self.asia_odds_url % (match_item["match_id"], i * 30)
                request = scrapy.Request(url, callback=self.parse_asia_odds)
                request.meta['match_item'] = match_item
                yield request
    def parse_book_url(self, response):
        """Yield the POST request that asks for a book's download link.

        The book id and download path are pulled out of the page's inline
        scripts with regular expressions. The request goes to
        ``self.base_url`` plus that path, with a form-encoded body, and is
        handled by ``self.get_book_link``.
        """
        details = BookDetails(book_id = "", book_type = "pdf")
        loader = ItemLoader(item=details, response=response)
        script_text = "/*//script/text()"
        loader.add_xpath("book_id", script_text, re = r'bookId\s*:\s*(.*),.*')
        loader.add_xpath("book_path", script_text, re = r'getDownloadUrl\s*:\s*\"(.*)\".*')
        # The return value is ignored; the fields are read back from
        # `details` below.
        loader.load_item()

        target_url = self.base_url + details['book_path'][0]
        form_body = "&".join([
            "book_id=" + details['book_id'][0],
            "type=" + details['book_type'],
        ])

        request_headers = {
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "User-Agent": "Mozilla/5.0",
        }

        yield Request(target_url, self.get_book_link, headers = request_headers, method='POST', body=form_body)
Example #51
0
    def parse(self,response):
        """Load the ``titletext`` headlines and dump them to a dated JSON file.

        Writes ``<today>.json`` in the working directory, holding today's
        date and the ASCII-only headline strings, then returns the loaded
        item. The file is opened in a ``with`` block so the handle is closed
        (and the data flushed) even if serialisation fails.
        """
        l = ItemLoader(item = googleItem(),response = response)
        l.add_xpath('news','//span[contains(@class,"titletext")]/text()')
        item = l.load_item()

        # Take the date once so the file name and the stored date agree.
        today = datetime.date.today()
        nytdict = {
            'date': str(today),
            'news': [str(t.encode('ascii','ignore')) for t in item['news']],
        }

        with open('{}.json'.format(today),'w') as f:
            json.dump(nytdict,f)
        return item
Example #52
0
 def detail(self, response):
     """Build a ``BillionPricesIndiaItem`` from a PepperFry product page.

     Logs the response URL. Returns the item when both a product name and a
     price are present on the page, otherwise ``None``.

     The earlier guard OR-ed each length check with ``!= None``, which is
     always true for the extracted values, so pages lacking a name or price
     still produced items. Each XPath was also evaluated twice.
     """
     log.msg(response.url)
     hxs = HtmlXPathSelector(response)
     product_names = hxs.xpath('//*[@id="vip_content_section"]/div[2]/h1/text()').extract()
     product_prices = hxs.xpath('//*[@id="price-val"]/text()').extract()
     if not product_names or not product_prices:
         return None

     l = ItemLoader(item=BillionPricesIndiaItem(), response=response)
     l.add_xpath('product_name', '//*[@id="vip_content_section"]/div[2]/h1/text()')
     l.add_xpath('category', '//*[@id="cat_crum"]/@value')
     l.add_xpath('product', '//*[@id="overview_tab"]/div/div/p/text()')
     item = l.load_item()
     item['product_url'] = response.url
     # Only the first price text node is stored.
     item['price'] = product_prices[0]
     item['vendor'] = 'PepperFry'
     item['city'] = 'Mumbai'
     item['state'] = 'Maharashtra'
     item['country'] = 'India'
     item['date'] = str(time.strftime("%d/%m/%Y"))
     return item
Example #53
0
 def parse_item(self,response):
     """Load a ``MeizituItem`` with name, tags, image URLs and the page URL."""
     text_fields = (
         ('name', '//h2/a/text()'),
         ('tags', "//div[@id='maincontent']/div[@class='postmeta clearfix']/div[@class='metaRight']/p"),
     )
     loader = ItemLoader(item=MeizituItem(), response=response)
     for field, xpath in text_fields:
         loader.add_xpath(field, xpath)
     # Image URLs are passed through Identity.
     loader.add_xpath('image_urls', "//div[@id='picture']/p/img/@src", Identity())
     loader.add_value('url', response.url)
     return loader.load_item()
Example #54
0
 def parse_item(self, response):
     """Yield one ``GoalSOItem`` per table row of the stats page.

     Each item carries the last seven characters of the player's page URL
     as ``nhl_num``, ``self.year`` as ``season``, and the text of columns
     14-17 as the shootout stats. Rows without a link in the second column
     are skipped instead of raising ``IndexError``, which previously
     aborted the rest of the page.
     """
     sel = Selector(response)

     # collect xpaths of each player (row in table)
     rows = sel.xpath('/html//div[@class="table-container"]/table/tbody/tr')

     # (item field, 1-based table column holding the stat)
     stat_columns = (
         ('so_wins', 14),
         ('so_losses', 15),
         ('so_shots_against', 16),
         ('so_goals_against', 17),
     )

     for row in rows:
         # get unique NHL ID number from player's page URL
         num = row.xpath('td[2]/a/@href').extract()
         if not num:
             # no player link in this row: nothing to identify it by
             continue

         loader = ItemLoader(GoalSOItem(), selector=row)
         loader.default_input_processor = MapCompose()
         loader.default_output_processor = Join()

         loader.add_value('nhl_num', num[0][-7:])

         # add season data
         loader.add_value('season', str(self.year))

         # collect additional stats
         for field, column in stat_columns:
             loader.add_xpath(field, './/td[%d]/text()' % column)

         # feed item to pipeline
         yield loader.load_item()
    def parse_template(self, response):
        """
        Callback used by Scrapy to process downloaded responses

        Yields a single ``WrapBootstrapTemplate`` item whose fields are read
        with the XPaths assembled below; inputs are stripped and outputs
        joined.
        //*[@id="page_theme"]/div[2]/div/div/div/div[2]/div[4]/table/tbody/tr[10]/td[2]
        """
        page_text = response.body_as_unicode()

        # When the "cups of coffee" text is present the price blocks are
        # read from the third div instead of the second.
        price_div = 3 if 'cups of coffee' in page_text else 2

        prop_xpath = '//div[@class="info_wrapper"]//tr[td[@class="key"]/strong/text() = "{}:"]/td[@class="value"]/text()'
        substr_xpath = 'substring-after(normalize-space({}), "{}")'
        cost_xpath = '//*[@id="page_theme"]/div[2]/div/div/div/div[2]/div[{}]/div[{}]//span/text()'

        def cost(column):
            # Price text of the given licence column, without the leading "$".
            return substr_xpath.format(cost_xpath.format(price_div, column), '$')

        item_fields = {
            'item_hash': '//*[@id="offer_sku"]/text()',
            'title': '//*[@id="thing_name"]/text()',
            'thumbnail': '//*[@id="thing_image"]/@src',
            'description': '//*[@id="description"]',
            'creator': '//*[@id="product_manufacturer"]/text()',
            'when': prop_xpath.format('Released'),
            'bootstrap_version': substr_xpath.format(prop_xpath.format('Bootstrap'), 'Compatible with '),
            'cost_single': cost(1),
            'cost_multiple': cost(2),
            'cost_extended': cost(3),
            'purchases': '//div[@class="purchases"]/span[@class="count"]/text()',
        }

        loader = ItemLoader(WrapBootstrapTemplate(), selector=Selector(response))

        # define processors
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()

        # register every field's xpath with the loader
        for name, expression in item_fields.iteritems():
            loader.add_xpath(name, expression)
        yield loader.load_item()
	def get_app(self, response):
		"""Yield a ``PlayStoreItems`` item scraped from an app details page.

		Most fields are plain CSS lookups; the two URL fields are joined
		against ``response.url`` and the description is read by XPath.
		"""
		# Shared processor: resolve every extracted href against the page URL.
		absolute_urls = Compose(lambda urls: [urljoin(response.url, url) for url in urls])

		loader = ItemLoader(item=PlayStoreItems(), response=response)
		for field, css in (
				('app_id', '.details-wrapper::attr(data-docid)'),
				('name', '.document-title div::text'),
				('category', '.category span::text')):
			loader.add_css(field, css)
		loader.add_css('category_url', '.category::attr(href)', absolute_urls)
		for field, css in (
				('price', '.details-actions .price span::text'),
				('offers_in_app_purchases', '.inapp-msg::text'),
				('stars_count', '.stars-count::text'),
				('video', '.details-trailer > span::attr(data-video-url)'),
				('screenshots', '.screenshot::attr(src)')):
			loader.add_css(field, css)
		loader.add_xpath('description', '//div[contains(@class, "show-more-content")]/div//text()')
		for field, css in (
				('update_date', '[itemprop="datePublished"]::text'),
				('file_size', '[itemprop="fileSize"]::text'),
				('installs', '[itemprop="numDownloads"]::text'),
				('current_version', '[itemprop="softwareVersion"]::text'),
				('requires_android', '[itemprop="operatingSystems"]::text'),
				('offered_by', '[itemprop="author"] > a span::text')):
			loader.add_css(field, css)
		loader.add_css('offered_by_url', '[itemprop="author"] > a::attr(href)', absolute_urls)
		yield loader.load_item()
Example #57
0
 def parse_content(self, response):
     """Load a ``BbsItem`` with the page URL, forum, poster and content.

     The XPath for each field is looked up in ``self._x_query``.
     """
     # (item field, key of its XPath in self._x_query)
     query_keys = (
         ("forum", "forum"),
         ("poster", "poster"),
         ("content", "page_content"),
     )
     loader = ItemLoader(item=BbsItem(), response=response)
     loader.add_value("url", str(response.url))
     for field, key in query_keys:
         loader.add_xpath(field, self._x_query[key])
     return loader.load_item()