def parse_list(self, response):
    # Get seller attributes
    sel = Selector(response)
    self.page += 1
    for s in sel.xpath(Seller.base_xpath):
        seller_loader = ItemLoader(Seller(), selector=s)
        # iterate over fields and add xpaths to the seller_loader
        seller_loader.add_value('page', self.page)
        seller_loader.add_value('flag', 'Seller')
        for key, value in Seller.item_fields.iteritems():
            seller_loader.add_xpath(key, value)
        yield seller_loader.load_item()

    # Get commodity attributes
    for s in sel.xpath(Commodity.base_xpath):
        comm_loader = ItemLoader(Commodity(), selector=s)
        comm_loader.add_value('page', self.page)
        comm_loader.add_value('flag', 'Commodity')
        for key, value in Commodity.item_fields.iteritems():
            comm_loader.add_xpath(key, value)
        yield comm_loader.load_item()

    # Next page
    if sel.xpath(self.next_page_xpath):
        yield Request("http://spu.taobao.com/spu/3c/detail.htm" + sel.xpath(self.next_page_xpath).extract()[0],
                      callback=self.parse_list)
def parse(self, response):
    l = ItemLoader(item=timeItem(), response=response)
    #l.add_xpath('topnews', '//*[@id="article-container"]/div/div[1]/section/div/article[*]/div/p/text()')
    l.add_xpath('topnews', '//*[@id="article-container"]/div/div[1]/section/div/article[*]/div/h2/a/text()')
    l.add_xpath('topnews', '//*[@id="article-container"]/div/div[1]/section/div/article[1]/div/div/div[2]/div[*]/h3/a/text()')
    l.add_xpath('sectionnews', '//a[contains(@class,"home-columnists-title")]/text()')
    l.add_xpath('sectionnews', '//a[contains(@data-event,"hp-news")]/text()')
    x = l.load_item()

    nytdict = dict()
    datelist = []
    datalist = datetime.date.today()
    topnewslist = []
    sectionnewslist = []

    nytdict['date'] = str(datalist)

    for t in x['topnews']:
        topnewslist.append(str(t.encode('ascii', 'ignore')).strip())
    nytdict['topnews'] = topnewslist

    for t in x['sectionnews']:
        sectionnewslist.append(str(t.encode('ascii', 'ignore')).strip())
    nytdict['sectionnews'] = sectionnewslist

    filename = datetime.date.today()
    f = open('{}.json'.format(filename), 'w')
    json.dump(nytdict, f)
    return l.load_item()
def parse_review(self, response):
    sel = Selector(response)
    if not self._is_right_category(sel):
        self.log('Skip URL: %s' % response.url, level=log.INFO)
        return
    self.log('Parse URL: %s' % response.url, level=log.INFO)

    loader = ItemLoader(item=YelpReview(), selector=sel)
    loader.add_value('crawl_date', '%s' % datetime.utcnow())
    loader.add_value('page_url', response.url)
    # Loop over all the fields we need to extract.
    for field, selector in self._item_selectors.iteritems():
        loader.add_xpath(field, selector)
    master_review = loader.load_item()

    review_selectors = sel.xpath('//div[contains(@class, "review")][@itemprop="review"]')
    for rev_sel in review_selectors:
        review_loader = ItemLoader(item=master_review.copy(), selector=rev_sel)
        for field, selector in self._review_selectors.iteritems():
            review_loader.add_xpath(field, selector)
        yield review_loader.load_item()
    return
def parse_book_url(self, response):
    book_item = BookDetails(book_id="", book_type="pdf")
    bil = ItemLoader(item=book_item, response=response)
    bil.add_xpath("book_id", "/*//script/text()", re=r'bookId\s*:\s*(.*),.*')
    bil.add_xpath("book_path", "/*//script/text()", re=r'getDownloadUrl\s*:\s*\"(.*)\".*')
    #bil.get_xpath()
    bil.load_item()

    download_url = self.base_url + book_item['book_path'][0]
    post_data = "book_id=" + book_item['book_id'][0] + "&" + "type=" + book_item['book_type']
    #post_data = "book_id=" + "2759" + "&" + "type=" + book_item['book_type']

    # set header
    post_header = {}
    post_header["Content-Type"] = "application/x-www-form-urlencoded; charset=UTF-8"
    post_header["User-Agent"] = "Mozilla/5.0"
    #print post_header
    #print curl_cmd

    yield Request(download_url,
                  self.get_book_link,
                  headers=post_header,
                  method='POST',
                  body=post_data)
def get_new(self, response):
    sel = Selector(response)
    il = ItemLoader(item=New())
    il.add_value('tema', ['Marketing y Publicidad'])
    il.add_value('titulo', sel.xpath('//h1[@itemprop="headline"]/a/text()').extract())
    il.add_value('texto', sel.xpath('//div[@itemprop="articleBody"]').extract())
    il.add_value('fecha', sel.xpath('//div[@itemprop="datePublished"]/text()').extract())
    il.add_value('keywords', sel.xpath('//div[contains(@class,"nota-tags")]//h3/a/text()').extract())
    item = il.load_item()

    if 'titulo' not in item:
        iln = ItemLoader(item=New())
        iln.add_value('tema', ['Marketing y Publicidad'])
        iln.add_value('titulo', sel.xpath('//h1/text()').extract())
        iln.add_value('texto', sel.xpath('//div[@id="principal"]/div[@class="nota"]/div[3]').extract())
        iln.add_value('fecha', sel.xpath('//div[@class="fecha-nota"]/text()').extract())
        iln.add_value('keywords', sel.xpath('//div[contains(@class,"nota-tags")]//h3/a/text()').extract())
        item = iln.load_item()

    if 'keywords' not in item:
        item['keywords'] = ['Marketing y Publicidad']

    if 'fecha' in item:
        item['fecha'] = self.parse_date(item['fecha'])
    else:
        item['fecha'] = '10/05/2015'

    if 'titulo' in item and 'texto' in item:
        yield item
def parse_content(self, response):
    goods_loader = ItemLoader(item=AlibbItem(), response=response)
    url = str(response.url)
    goods_loader.add_value('url', url)
    goods_loader.add_value('url_hash', hashlib.sha1(url).hexdigest())
    goods_loader.add_xpath('name', self._x_query['title'].encode('utf-8'))

    # detail data
    iDetailDataPattern = re.compile("iDetailData.*};", re.DOTALL)
    detail_data_list = response.xpath('//script').re(iDetailDataPattern)
    detail_data = detail_data_list[0].replace("iDetailData = {", "{")
    detail_data = detail_data.replace("};", "}")
    detail_data = detail_data.replace("\t|\n|\\", "")
    detail_data_json = json.loads(detail_data)

    if len(detail_data_json) != 0:
        properties = detail_data_json['sku']['skuMap'].keys()
        goods_loader.add_value('properties', [property.replace(">", ",") for property in properties])
        for attribute in detail_data_json['sku']['skuProps']:
            attributes = {}
            options = [value['name'] for value in attribute['value']]
            attributes['name'] = attribute['prop']
            attributes['options'] = options
            goods_loader.add_value('attributes', attributes)
    else:
        goods_loader.add_value('attributes', "")

    price = response.xpath('//span[re:test(@class,"value price-length-\d$")]/text()').extract()
    goods_loader.add_value('price', price[0] if len(price) > 0 else detail_data_json['sku']['price'])

    # detail information
    detail_info_list = response.xpath(self._x_query['detail_info']).extract()
    goods_loader.add_value('parameters',
                           [list(info_list) for info_list in zip(detail_info_list[::2], detail_info_list[1::2])])
    print goods_loader.load_item()['url']

    # profile img
    profile_img_urls = response.xpath('//li/@data-imgs').re("original.*jpg")
    for urls in profile_img_urls:
        profile_img_url = urls.replace("original\":\"http", "http")
        goods_loader.add_value("boothes", profile_img_url)

    # big img
    for link in response.xpath('//*[@id="desc-lazyload-container"]/@data-tfs-url').extract():
        yield Request(url=link, meta={'item': goods_loader}, callback=self.parse_content_down)
def parse_product(self, response):
    product_url = response.url
    # sel = self.selenium
    #sel.open(response.url)
    #time.sleep(2.5)

    selector = Selector(response)
    # //*[@id="product_detail_view_1"]/div/div[6]/div[2]/span[2]
    price = selector.xpath('//*[@id="product_detail_view_1"]/div/div[7]/div[2]/span[2]/text()').extract()
    if not price:
        price = selector.xpath('//*[@id="product_detail_view_1"]/div/div[6]/div[2]/span[2]/text()').extract()
    if not price:
        price = selector.xpath('//*[@id="product_detail_view_1"]/div/div[5]/div[2]/span[2]/text()').extract()
    if not price:
        price = selector.xpath('//*[@id="product_detail_view_1"]/div/div[4]/div[2]/span[2]/text()').extract()

    l = ItemLoader(item=BillionPricesIndiaItem(), response=response)
    l.add_xpath('product_name', '//*[@id="inner"]/div[1]/div[1]/div/div/text()')
    l.add_xpath('quantity', '//*[@id="product_detail_view_1"]/div/div[1]/div/text()')
    l.add_xpath('category', '//*[@id="inner"]/div[1]/div[1]/div/a[1]/text()')
    l.add_xpath('product', '//*[@id="inner"]/div[1]/div[1]/div/a[2]/text()')

    item = l.load_item()
    item['product_url'] = product_url
    item['price'] = price
    item['vendor'] = 'Local Banya'
    item['city'] = 'Mumbai'
    item['state'] = 'Maharashtra'
    item['country'] = 'India'
    item['date'] = str(time.strftime("%d/%m/%Y"))
    return item
def parse_item(self, response): sel = Selector(response) # collect xpaths of each player (row in table) rows = sel.xpath('/html//div[@class="table-container"]/table/tbody/tr') # loop through players for row in rows: loader = ItemLoader(SkatTOIItem(), selector=row) loader.default_input_processor = MapCompose() loader.default_output_processor = Join() # get unique NHL ID number from player's page URL num = row.xpath("td[2]/a/@href").extract() sNum = num[0][-7:] loader.add_value("nhl_num", sNum) # add season data loader.add_value("season", str(self.year)) # collect TOI stats after converting from m,mmm:ss to seconds i = 5 CATEG = ["es_toi", "sh_toi", "pp_toi", "toi"] while i < 12: i += 1 if i % 2 == 0: temp = row.xpath("td[" + str(i) + "]/text()").extract()[0] sTemp = temp.split(":") sTemp[0] = sTemp[0].replace(",", "") loader.add_value(CATEG[(i - 6) / 2], str(60 * int(sTemp[0]) + int(sTemp[1]))) else: pass # feed item to pipeline yield loader.load_item()
def parse_item(self, response): sel = Selector(response) # collect xpaths of each player (row in table) rows = sel.xpath('/html//div[@class="table-container"]/table/tbody/tr') # loop through players for row in rows: loader = ItemLoader(SkatSOItem(), selector=row) loader.default_input_processor = MapCompose() loader.default_output_processor = Join() # get unique NHL ID number from player's page URL num = row.xpath("td[2]/a/@href").extract() sNum = num[0][-7:] loader.add_value("nhl_num", sNum) # add season data loader.add_value("season", str(self.year)) # collect stats loader.add_xpath("so_shots", ".//td[13]/text()") loader.add_xpath("so_goals", ".//td[14]/text()") loader.add_xpath("so_pct", ".//td[15]/text()") loader.add_xpath("game_deciding_goals", ".//td[16]/text()") # feed item to pipeline yield loader.load_item()
def process_row(self, row, task):
    stats = self.crawler.stats
    l = ItemLoader(WV_DrillingPermit())
    l.add_value(None, row)
    item = l.load_item()

    if item['API'] and item['permit_activity_type'] and item['permit_activity_date']:
        existing_item = self.db.loadItem(item, {
            'API': item['API'],
            'permit_activity_type': item['permit_activity_type'],
            'permit_activity_date': item['permit_activity_date']
        })
        if existing_item:
            stats.inc_value('_existing_count', spider=self)
        else:
            stats.inc_value('_new_count', spider=self)
            yield item

            dt = datetime.strptime(item['permit_activity_date'], '%Y-%m-%d %H:%M:%S')
            # if item['permit_activity_type'] in ('Permit Issued', 'Permit Commenced', 'Permit Completed'):
            if item['permit_activity_type'] in ('Permit Issued', 'Permits Issued') \
                    and datetime.now() - dt < timedelta(days=365):
                for item in self.create_feed_entry(item, task):
                    yield item
def parse(self, response):
    l = ItemLoader(item=Problem(), response=response)
    d = pyq(response.body)
    l.add_value('id', response.url[-4:])
    l.add_value('title', d('#content_body > center:nth-child(1) > span').text())
    l.add_value('body', d('#content_body').text())
    return l.load_item()
def parse(self, response):
    contents = response.xpath('//ul[@class="note-list"]/li/div[@class="content"]')
    for content in contents:
        l = ItemLoader(item=JianshuSummaryItem(), selector=content, response=response)
        l.add_xpath('title', 'a[@class="title"]/text()',
                    MapCompose(lambda i: i.replace('|', '.').replace('丨', '.')))
        l.add_xpath('link', 'a[@class="title"]/@href',
                    MapCompose(lambda i: urljoin(response.url, i)))
        l.add_xpath('author',
                    'div[@class="author"]/div[@class="info"]/a[@class="nickname"]/text()')
        l.add_xpath('author_url',
                    'div[@class="author"]/div[@class="info"]/a[@class="nickname"]/@href',
                    MapCompose(lambda i: urljoin(response.url, i)))
        l.add_xpath('timestamp',
                    'div[@class="author"]/div[@class="info"]/span[@class="time"]/@data-shared-at')
        l.add_xpath('read', 'div[@class="meta"]/a[1]/text()[2]', MapCompose(str.strip, int))
        l.add_xpath('reply', 'div[@class="meta"]/a[2]/text()[2]', MapCompose(str.strip, int))
        l.add_xpath('like', 'div[@class="meta"]/span[1]/text()', MapCompose(str.strip, int))
        l.add_xpath('money', 'div[@class="meta"]/span[2]/text()', MapCompose(str.strip, int))
        yield l.load_item()
def parse_content(self, response):
    '''Parse content pages.'''
    loader = ItemLoader(item=Rede(), response=response)

    # Usually, we are only interested in the first item, e.g. for title, place, etc.
    loader.default_output_processor = TakeFirst()

    # Add fields
    loader.add_value('link', response.url)
    loader.add_css('title', '.text h1', extract_text)

    # Test if text has an abstract
    abstract = response.css('.abstract')
    if abstract:
        loader.add_css('abstract', '.abstract', extract_text)
        loader.add_css('text', '.abstract ~ p:not(.picture)', extract_text, Join('\n'))
    else:
        loader.add_css('text', '.text p:not(.picture)', extract_text, Join('\n'))

    # Metadata are in dt/dd pairs.
    keys = response.css('dl dt::text').extract()
    values = response.css('dl dd::text').extract()
    for key, value in zip(keys, values):
        if key == 'Datum:':
            match = re.search(r'(\d{1,2}\.\d{1,2}\.\d{2,4})', value)
            if match:
                # '22.03.2011' format
                value = match.group(1)
                dt = datetime.strptime(value.encode(ENC), '%d.%m.%Y')
            else:
                # '22. März 2011' format
                dt = datetime.strptime(value.encode(ENC), '%d. %B %Y')
            loader.add_value('date', dt.date())
        elif key == 'Ort:':
            loader.add_value('place', value)

    return loader.load_item()
def parse_items(self, response):
    item = ItemLoader(Articulos(), response)
    item.add_xpath('title', '//*[@id="MainContainer"]/article/section[1]/div[1]/div/h2/text()')
    item.add_xpath('description', '//*[@id="MainContainer"]/article/section[1]/div[2]/ul/li[3]/text()')
    yield item.load_item()

# scrapy runspider multiplepages.py -o ../../resources/computrabajo.csv -t csv
def create_tag(self, feed_entry_id, tag, comment=''):
    # TODO: create tags
    l = ItemLoader(FeedEntryTag())
    l.add_value('feed_entry_id', feed_entry_id)
    l.add_value('tag', tag)
    l.add_value('comment', comment)
    return l.load_item()
def parse_item(self, response):
    # Parse pages like http://www.meizitu.com/a/5336.html and extract the image URLs
    l = ItemLoader(item=MeizituItem(), response=response)
    l.add_xpath('image_urls', "//div[@id='picture']/p/img/@src", Identity())
    l.add_value('url', response.url)
    return l.load_item()
def parse_detail(self, response):
    url = response.url
    item = ItemLoader(item=MeizituItem(), response=response)
    item.add_xpath("title", "//h2/a/text()")
    item.add_xpath("image_urls", '//div[@id="picture"]//img/@src')
    item.add_value('url', url)
    return item.load_item()
def parse(self, response):
    data = json.loads(response.text)
    for key in data.keys():
        dataLenth = len(data[key])
        logging.info("total size: " + str(dataLenth))
        for i in range(1, dataLenth):
            logging.info("curIndex: " + str(i))
            content = {}
            logging.info("title: " + data[key][i]["title"])
            loader = ItemLoader(item=NetNews(), response=response)

            content["title"] = data[key][i]["title"]
            loader.add_value("news_title", content["title"])

            content["digest"] = data[key][i]["digest"]
            loader.add_value("news_digest", content["digest"])

            # print "recSource: " + data[key][i]["recSource"]
            content["label"] = data[key][i]["recSource"]
            loader.add_value("news_label", content["label"])

            content["imageUrl"] = data[key][i]["imgsrc"]
            loader.add_value("news_img_url", content["imageUrl"])

            content["id"] = data[key][i]["id"]
            loader.add_value("news_id", content["id"])

            detail = self.getNewsDetail(content["id"])
            # print "body: " + detail
            if len(detail) != 0:
                loader.add_value("news_detail", detail)

            # add a timestamp
            loader.add_value("timestamp", self.get_timestamp())
            yield loader.load_item()
def get_new(self, response): sel = Selector(response) il = ItemLoader(item=New()) il.add_value('tema', ['Marketing y Publicidad']) il.add_value('titulo', sel.xpath('//h1[@class="glr-post-title glr-mb-10"]/text()').extract()) il.add_value('texto', sel.xpath('//div[@class="glr-post-entry"]').extract()) il.add_value('fecha', sel.xpath('//span[@class="glr-left glr-post-date"]/text()').extract()) il.add_value('keywords', sel.xpath('//div[@class="post-tags"]//a/text()').extract()) item = il.load_item() if 'keywords' in item: pass else: item['keywords'] = ['Marketing y Publicidad'] if 'fecha' in item: item['fecha'] = self.parse_date(item['fecha']) else: item['fecha'] = '10/05/2015' if 'titulo' in item: if 'texto' in item: yield item ''' item = New() item['tema'] = 'Marketing y Publicidad' item['titulo'] = self.parse_html(sel.xpath('//h1[@class="glr-post-title glr-mb-10"]/text()').extract()[0].strip()) item['texto'] = self.parse_html(sel.xpath('//div[@class="glr-post-entry"]').extract()[0].strip()) item['fecha'] = self.parse_date(sel.xpath('//span[@class="glr-left glr-post-date"]/text()').extract()[0].strip()) ''' #yield item '''res = []
def parse(self, response):
    for sel in response.css("ul#channels-browse-content-grid > li"):
        loader = ItemLoader(YoutubeVideo(), selector=sel)
        loader.add_xpath('link', './/h3/a/@href')
        yield loader.load_item()
def parse(self, response):
    content = response.body
    page = response.url.split("/")[-1]

    """
    content = Selector(response=response).xpath("//div[@class='body textStyle']").extract()
    if (len(content)):
        content = content[0]
        # strip tags
        strip = StripTags()
        content = strip.filterTags(content)
        # write to a file
        filename = 'quotes-%s' % page
        with open(filename, 'w') as f:
            f.write(str(content))
        self.log('Saved file %s' % filename)
    """

    loader = ItemLoader(item=TutorialItem(), response=response)
    loader.add_xpath('title', "//title/text()")
    loader.add_xpath('content', "//div[@class='body textStyle']")
    data = loader.load_item()

    downFile = DownFile(data['content'][0], 'http://www.admin10000.com')
    downFile.downImgFile()

    mongo = Mongo("articles")
    mongo.setTable("admin10000")

    content = data['content'][0]
    # strip tags
    strip = StripTags()
    content = strip.filterTags(content)

    article = {'title': data['title'][0], 'content': content}
    mongo.add(article)
def parse_by_product(self, response):
    """
    For the 'Bundles' category, grab the product details for the
    first product listed.
    """
    self.selector = Selector(response)
    self.results = self.selector.xpath('//*[@id="ctl00_tdMainPanel"]')
    loader = ItemLoader(item=VisionsProduct(), selector=self.results[0])
    self.field_xpaths = {
        'product': ('div[contains(@class, "catalogueTitle")]'
                    '/h3/text()'),
        'price': ('div[@id="ctl00_ContentPlaceHolder1_pnl'
                  'Bundle"]/div[@id="divProductDetails"]/div'
                  '[contains(@class, "priceAddToCart")]/div[1]/span'
                  '[contains(@id, "SalePrice")]/text()')
    }

    # Extract and load product details
    loader.add_xpath('product', self.field_xpaths['product'])
    loader.add_xpath('price', self.field_xpaths['price'],
                     re='\$[\d]*[,]*[\d]*\.[\d]*')
    loader.add_value('availability', 'Not Limited/Clearance Item')

    # Because it's an individual product page, manually set the category
    self.category = '/'.join(['Home', response.url.split('/')[4]])
    loader.add_value('category', self.category)

    yield loader.load_item()
def parse(self, response):
    sel = response.xpath('.//*[@class="post_info"]')
    if not sel:
        self.log('posts are not find')
        return
    self.group_id = response.xpath('.//div[@id="group_followers"]/a/@href').re('group.=(\d+?)$')[0]

    for s in sel:
        wall_text = s.xpath('div[@class="wall_text"]')
        text = wall_text.xpath('div/div[@class="wall_post_text"]').extract()
        spam_words = get_spam_words_from_msg(text, self.spam_words_from_file)
        if spam_words:
            l = ItemLoader(item=VkItem(), selector=s, response=response)

            date = s.xpath('div[@class="replies"]/div/small/a[1]/span/text()').extract()
            date = l.get_value(date, MapCompose(normalize_date), TakeFirst())
            if is_date_less_last_date(date, self.days_count_to_parse):
                return

            l.add_value('id', wall_text.xpath('div/a/@data-from-id').extract())
            l.add_value('name', wall_text.xpath('div/a/text()').extract())
            l.add_value('text', text)
            l.add_value('date', date)
            l.add_value('words', spam_words)
            yield l.load_item()
            #ban => Request()

        replies_hidden = s.xpath('.//a[@class="wr_header"]/@onclick')
        if replies_hidden:
            url = get_url_hided_replies(replies_hidden[0].extract(), self.main_page)
            yield Request(url=url, callback=self.get_hided_items)
        else:
            replies = s.xpath('.//div[@class="reply_table"]').extract()
            for reply in replies:
                raw_html = ''.join(reply.splitlines()).encode('utf-8')
                html_response = HtmlResponse(url=response.url, body=raw_html)
                for i in self.get_replies_items(html_response):
                    yield i.load_item()

    yield Request(url=self.get_next_msgs_url(), method='POST', callback=self.parse,
                  body=self.get_post_body_for_next_msgs())
def get_new(self, response):
    sel = Selector(response)
    il = ItemLoader(item=New())
    il.add_value('tema', ['Marketing y Publicidad'])
    il.add_value('titulo', sel.xpath('//h1/text()').extract())
    il.add_value('texto', sel.xpath('//div[contains(@class,"post-detalle")]').extract())
    il.add_value('fecha', sel.xpath('//p[@itemprop="datePublished"]/text()').extract())
    il.add_value('keywords', sel.xpath('//div[contains(@class,"tags")]/a/text()').extract())
    item = il.load_item()

    if 'titulo' not in item:
        print item['titulo']
        print item['texto']

    if 'keywords' not in item:
        item['keywords'] = ['Marketing y Publicidad']

    if 'fecha' in item:
        item['fecha'] = self.parse_date(item['fecha'])
    else:
        item['fecha'] = '10/05/2015'

    if 'titulo' in item and 'texto' in item:
        yield item
def create_tag_items(self, task_id, item_id):
    tag_items = []
    l = ItemLoader(FeedEntryTag())
    l.add_value('feed_entry_id', item_id)
    l.add_value('tag', 'NRC')
    tag_items.append(l.load_item())

    nrc_tags = self.db.loadNrcTags(task_id)
    for t in nrc_tags:
        l = ItemLoader(FeedEntryTag())
        l.add_value('feed_entry_id', item_id)
        l.add_value('tag', t['tag'])
        l.add_value('comment', t['comment'])
        tag_items.append(l.load_item())
    return tag_items
def parse_movie(self, response):
    loader = ItemLoader(item=MovieItem(), response=response)
    loader.add_xpath('name',
                     '//div[@id="title-overview-widget"]/div[2]/div[2]/div/div[2]/div[2]/h1/text()')
    loader.add_xpath('year', "//h1/span[@id='titleYear']/a/text()")
    loader.add_xpath('rate',
                     "//div[@id='title-overview-widget']/div[2]/div[2]/div/div[1]/div[1]/div[1]/strong/span/text()")
    loader.add_xpath('director', "//div[2]/div[1]/div[2]/span/a/span/text()")
    loader.add_xpath('director', "//div[3]/div[1]/div[2]/span/a/span/text()")
    loader.add_xpath('storyline', "//div[@id='titleStoryLine']/div[1]/p/text()")

    user_review_url = response.xpath(
        "//div[@id='titleUserReviewsTeaser']/div/div[3]/a[2]/@href").extract()
    item = loader.load_item()
    user_review_another_url = response.xpath(
        "//div[@id='titleUserReviewsTeaser']/div/div[2]/a[2]/@href").extract()

    if user_review_url or user_review_another_url:
        full_url = 0
        if not user_review_another_url:
            full_url = urljoin(response.url, user_review_url.pop())
        elif not user_review_url:
            full_url = urljoin(response.url, user_review_another_url.pop())
        request = Request(urljoin(response.url, full_url),
                          callback=self.parse_audience_review)
        request.meta['item'] = item
        return request
    return item
def parse(self, response):
    items = ItemLoader(item=XsContentItem(), response=response)
    # chapter title
    items.add_xpath('title', '//*[@class="bookname"]/h1/text()')
    # body text
    items.add_xpath('text', '//*[@id="content"]/text()')
    yield items.load_item()
def parse_item(self, response): sel = Selector(response) # collect xpaths of each player (row in table) rows = sel.xpath('/html//div[@class="table-container"]/table/tbody/tr') # prepare to adjust for shootout stats if necessary shootout = 0 if self.year > 2005: shootout = 1 # loop through players for row in rows: loader = ItemLoader(SkatEngItem(), selector=row) loader.default_input_processor = MapCompose() loader.default_output_processor = Join() # get unique NHL ID number from player's page URL num = row.xpath("td[2]/a/@href").extract() sNum = num[0][-7:] loader.add_value("nhl_num", sNum) # add season data loader.add_value("season", str(self.year)) # collect stats if shootout: loader.add_xpath("en_goals", ".//td[20]/text()") loader.add_xpath("ps_goals", ".//td[21]/text()") else: loader.add_xpath("en_goals", ".//td[21]/text()") loader.add_xpath("ps_goals", ".//td[22]/text()") # feed item to pipeline yield loader.load_item()
def parse_item(self, response): sel = Selector(response) # collect xpaths of each player (row in table) rows = sel.xpath('/html//div[@class="table-container"]/table/tbody/tr') # loop through players for row in rows: loader = ItemLoader(SkatRTSItem(), selector=row) loader.default_input_processor = MapCompose() loader.default_output_processor = Join() # get unique NHL ID number from player's page URL num = row.xpath("td[2]/a/@href").extract() sNum = num[0][-7:] loader.add_value("nhl_num", sNum) # add season data loader.add_value("season", str(self.year)) # collect stats loader.add_xpath("hits", ".//td[6]/text()") loader.add_xpath("blocked_shots", ".//td[7]/text()") loader.add_xpath("missed_shots", ".//td[8]/text()") loader.add_xpath("giveaways", ".//td[9]/text()") loader.add_xpath("takeaways", ".//td[10]/text()") loader.add_xpath("faceoff_wins", ".//td[11]/text()") loader.add_xpath("faceoff_losses", ".//td[12]/text()") # feed item to pipeline yield loader.load_item()
def parse_item(self, response): """This method will not populate such fields: locality, mobile_number, country, email """ il = ItemLoader(item=UKBusinessItem(), response=response) il.add_value('url', unicode(response.url)) il.add_xpath('name', '//h3[@class="biz"]/text()') il.add_xpath('category', '//div[@id="breadcrumbs"]/a[2]/text()') bcon_list = response.xpath('//ul[@class="bcon"]/li') for li in bcon_list: li_text = cond_set_value(li.xpath('.//b/text()').extract()) if li_text == 'Tel:': phone_number = cond_set_value(li.xpath('text()').extract()) il.add_value('phone_number', phone_number) if li_text == 'Web:': website = cond_set_value(li.xpath('.//a/text()').extract()) il.add_value('website', website) if li_text == 'Fax:': fax_number = cond_set_value(li.xpath('text()').extract()) il.add_value('fax_number', fax_number) address_list = response.xpath('//ul[@class="bad"]/li/text()').extract() if address_list: address_without_postal_code = u', '.join(address_list[:-1]) postal_code = address_list[-1] il.add_value('address', address_without_postal_code) il.add_value('postal_code', postal_code) il.add_xpath('latitude', '//div[@id="lat"]/text()') il.add_xpath('longitude', '//div[@id="lng"]/text()') return il.load_item()
def detail(self, response):
    log.msg(response.url)
    hxs = HtmlXPathSelector(response)

    product_name = hxs.xpath('//*[@id="vip_content_section"]/div[2]/h1/text()').extract()
    # //*[@id="vip_content_section"]/div[2]/h1
    if (len(product_name) != 0):
        product_name = hxs.xpath('//*[@id="vip_content_section"]/div[2]/h1/text()').extract()[0]

    product_price = hxs.xpath('//*[@id="price-val"]/text()').extract()
    if (len(product_price) != 0):
        product_price = hxs.xpath('//*[@id="price-val"]/text()').extract()[0]

    if (len(product_price) != 0 or product_price != None) and (len(product_name) or product_name != None):
        l = ItemLoader(item=BillionPricesIndiaItem(), response=response)
        l.add_xpath('product_name', '//*[@id="vip_content_section"]/div[2]/h1/text()')
        # l.add_xpath('quantity', '//*[@id="product_detail_view_1"]/div/div[1]/div/text()')
        l.add_xpath('category', '//*[@id="cat_crum"]/@value')
        l.add_xpath('product', '//*[@id="overview_tab"]/div/div/p/text()')

        item = l.load_item()
        item['product_url'] = response.url
        item['price'] = product_price
        item['vendor'] = 'PepperFry'
        item['city'] = 'Mumbai'
        item['state'] = 'Maharashtra'
        item['country'] = 'India'
        item['date'] = str(time.strftime("%d/%m/%Y"))
        return item
def parse_item(self, response):
    xpath = './/div[@class="content_left"]'
    sel = response.xpath(xpath)
    if not sel:
        return

    l = ItemLoader(item=HabrahabrItem(), selector=sel, response=response)
    l.add_xpath('title', '//h1/span/text()')
    l.add_xpath('image_urls', '//div[@class="content html_format"]/img/@src')

    comments_items = []
    comments = sel.xpath('//div[starts-with(@class, "message html_format")]').extract()
    for comment in comments:
        comment_item = ItemLoader(item=HabrahabrComment(), selector=sel, response=response)
        comment_item.add_value('comment', comment)
        comments_items.append(comment_item.load_item())
    l.add_value('comments', comments_items)

    yield l.load_item()
def get_product_details(self, response):
    crumbs = self.get_breadcrumbs(response)
    loader = ItemLoader(item=VisionsProduct())
    loader.add_value('breadcrumbs', crumbs)
    loader.add_value('url', response.url)
    if isinstance(crumbs, basestring):
        loader.add_value('category', crumbs)

    # Ensure we aren't wasting time extracting from an empty page
    if extract_helper(response, self.EMPTY_PAGE_CHECK):
        for d in self.PRODUCT_DETAILS:
            if '_' not in d.name:  # Don't load price
                loader.add_value(d.name, 'N/A')
    else:
        productDetails = detailsRunner(self.PRODUCT_DETAILS, response=response)
        if not productDetails['price']:
            productDetails['price'] = productDetails['price_gif']
        productDetails.pop('price_gif')

        # Fix truncated image urls
        if productDetails['image']:
            productDetails['image'] = add_schema(response.url, productDetails['image'])

        for d in productDetails:
            loader.add_value(d, productDetails[d])

    yield loader.load_item()
def get_app(self, response):
    il = ItemLoader(item=PlayStoreItems(), response=response)
    il.add_css('app_id', '.details-wrapper::attr(data-docid)')
    il.add_css('name', '.document-title div::text')
    il.add_css('category', '.category span::text')
    il.add_css('category_url', '.category::attr(href)',
               Compose(lambda urls: [urljoin(response.url, url) for url in urls]))
    il.add_css('price', '.details-actions .price span::text')
    il.add_css('offers_in_app_purchases', '.inapp-msg::text')
    il.add_css('stars_count', '.stars-count::text')
    il.add_css('video', '.details-trailer > span::attr(data-video-url)')
    il.add_css('screenshots', '.screenshot::attr(src)')
    il.add_xpath('description', '//div[contains(@class, "show-more-content")]/div//text()')
    il.add_css('update_date', '[itemprop="datePublished"]::text')
    il.add_css('file_size', '[itemprop="fileSize"]::text')
    il.add_css('installs', '[itemprop="numDownloads"]::text')
    il.add_css('current_version', '[itemprop="softwareVersion"]::text')
    il.add_css('requires_android', '[itemprop="operatingSystems"]::text')
    il.add_css('offered_by', '[itemprop="author"] > a span::text')
    il.add_css('offered_by_url', '[itemprop="author"] > a::attr(href)',
               Compose(lambda urls: [urljoin(response.url, url) for url in urls]))
    yield il.load_item()
def parse(self, response):
    l = ItemLoader(item=JianshuArticleItem(), response=response)
    l.add_xpath('content', '//div[@class="article"]/div[@class="show-content"]/p/text()')
    l.add_value('url', response.url)
    return l.load_item()
def parse_item(self, response):
    # load a MeizituItem with an ItemLoader
    re = []
    l = ItemLoader(item=MeizituItem(), response=response)
    # name
    l.add_xpath('name', '//h2/a/text()')
    # tags
    l.add_xpath('tags', "//div[@id='maincontent']/div[@class='postmeta clearfix']/div[@class='metaRight']/p")
    # image links
    l.add_xpath('image_urls', "//div[@id='picture']/p/img/@src", Identity())
    # url
    l.add_value('url', response.url)
    re.append(l.load_item())
    print re
    #return re
    return l.load_item()
def parse_item(self, response): sel = Selector(response) # collect xpaths of each player (row in table) rows = sel.xpath('/html//div[@class="table-container"]/table/tbody/tr') for row in rows: loader = ItemLoader(GoalSTItem(), selector=row) loader.default_input_processor = MapCompose() loader.default_output_processor = Join() # get unique NHL ID number from player's page URL num = row.xpath('td[2]/a/@href').extract() sNum = num[0][-7:] loader.add_value('nhl_num', sNum) # add season data loader.add_value('season', str(self.year)) # collect additional stats loader.add_xpath('es_shots_against', './/td[6]/text()') loader.add_xpath('es_goals_against', './/td[7]/text()') loader.add_xpath('es_saves', './/td[8]/text()') loader.add_xpath('es_save_pct', './/td[9]/text()') loader.add_xpath('pp_shots_against', './/td[10]/text()') loader.add_xpath('pp_goals_against', './/td[11]/text()') loader.add_xpath('pp_saves', './/td[12]/text()') loader.add_xpath('pp_save_pct', './/td[13]/text()') loader.add_xpath('sh_shots_against', './/td[14]/text()') loader.add_xpath('sh_goals_against', './/td[15]/text()') loader.add_xpath('sh_saves', './/td[16]/text()') loader.add_xpath('sh_save_pct', './/td[17]/text()') # feed item to pipeline yield loader.load_item()
def parse_news(self, response):
    item = ItemLoader(item=NewsItem(), response=response)
    item.add_value('url', response.url)
    item.add_value('title', response.xpath("//h1[@class='single-post__title']/text()").extract()[0])
    item.add_value('content', response.xpath("//section[@class='article']/p/text()").extract())
    return item.load_item()
def parse(self, response):
    l = ItemLoader(item=MyItem(), response=response)
    l.add_xpath(
        "title",
        """//div[@class="carousel"]/div[@class="songlist-slides slide-page"]/ul[@class="list-songlist slide-item"]/li[@class="songlist-item"]/a[@class="lnk-songlist"]/@title""",
    )
    return l.load_item()
def parse_product(self, response):
    p = ItemLoader(item=Product(), response=response)
    p.add_css('nome', 'h1.livedata::text')
    p.add_value('url', response.url)
    p.add_css('descricaoLonga', '.desc-info')
    p.add_css('image', 'div.container-product-image a.image-link > img',
              re='src=[\"|\'](?P<src>[^\"\']+)[\"|\']')
    p.add_css('categorias', 'span[itemprop=title]::text')
    yield p.load_item()

# run in mongo:
#db.produto.remove({'categorias.0': {$exists: false}})
#db.produto.remove({'categorias.0': {$nin: [' Games', ' Livros', ' DVDs e Blu-ray']}})

# delete duplicated products:
#var duplicates = [];
#db.produto_novo.aggregate([
#  {"$group": {"_id": "$nome", "count": {"$sum": 1}, "dups": {"$addToSet": "$_id"}}},
#  {"$match": {"_id": {"$ne": null}, "count": {"$gt": 1}}}
#], {allowDiskUse: true}, {cursor: {}}
#).result.forEach(function(doc) {
#  doc.dups.shift();
#  doc.dups.forEach(function(dupId) {
#    duplicates.push(dupId);
#  })
#})
#printjson(duplicates);
#db.produto_novo.remove({_id: {$in: duplicates}})
def parse_parts2(self, response):
    log.msg("\tparse_parts time: %s" % int(time.time()), level=log.DEBUG)
    ua = response.request.headers['User-Agent']
    log.msg("\tua: %s" % ua, level=log.DEBUG)

    for part in response.css('table.parts > tbody > tr'):
        il = ItemLoader(item=CarPart(), selector=part)
        il.add_xpath('shop_city', "td[@class='shop']/a/text()")
        il.add_xpath('shop_name', "td[@class='shop']/a/strong/text()")

        shop_url = il.get_xpath("td[@class='shop']/a/@href", TakeFirst())
        photo_url = il.get_xpath("td[@class='photo']/a/@href", TakeFirst())
        il.add_value('shop_url', urljoin(self.main_url, shop_url))
        il.add_value('ext_link', urljoin(self.main_url, photo_url))

        il.add_xpath('info', "td[@class='info']//text()")
        il.add_xpath('price', "td[@class='price']//text()")
        il.add_value('brand', response.meta.get('brand'))
        il.add_value('model', response.meta.get('model'))
        il.add_value('car_part', response.meta.get('car_part'))
        il.add_value('category', response.meta.get('category'))

        item = il.load_item()
        if item.is_valid():
            yield item
def parse_item(self, response):
    request_again = self.error_handler(response)
    if request_again:
        return request_again

    il = ItemLoader(item=UKBusinessItem(), response=response)

    # From the OG section at the top
    il.add_xpath('name', '//meta[@property="og:title"]/@content')
    il.add_xpath('url', '//meta[@property="og:url"]/@content')
    il.add_xpath('latitude', '//meta[@property="og:latitude"]/@content')
    il.add_xpath('longitude', '//meta[@property="og:longitude"]/@content')
    il.add_xpath('address', '//meta[@property="og:street-address"]/@content')
    il.add_xpath('locality', '//meta[@property="og:locality"]/@content')
    il.add_xpath('postal_code', '//meta[@property="og:postal-code"]/@content')
    il.add_xpath('country', '//meta[@property="og:country-name"]/@content')

    # XPaths below are from the display
    il.add_xpath('name', '//span[@class="busname"]/text()')  # No OG for this
    il.add_xpath('phone_number', '//span[@class="bustel"]/text()')
    il.add_xpath('website', '//a[@id="linkWebsite"]/@href')
    il.add_xpath('address', '//span[@data-yext="address.address"]/text()')
    il.add_xpath('locality', '//span[@itemprop="addressLocality"]/text()')
    il.add_xpath('postal_code', '//span[@itemprop="postalCode"]/text()')

    # Unicoded so it can share an input processor with the rest
    il.add_value('url', unicode(response.url))
    return il.load_item()
def parse_course_item(self, response):
    url_obj = urlparse(response.url)

    l = ItemLoader(item=CourseItem(), response=response)
    l.default_input_processor = MapCompose(unicode.strip)
    l.default_output_processor = TakeFirst()

    l.add_xpath('code', "/html/head/meta[@name='DC.Subject.ProgramCode']/@content")
    l.add_xpath('name', "/html/head/meta[@name='DC.Subject.Description.Short']/@content")
    l.add_xpath('career', "/html/head/meta[@name='DC.Subject.Level']/@content")

    l.year_in = Identity()
    l.add_value('year', ppath.basename(ppath.dirname(url_obj.path)))

    l.add_value('src_url', unicode(response.url))
    l.add_xpath('uoc', "/html/head/meta[@name='DC.Subject.UOC']/@content")

    l.gened_in = MapCompose(unicode.strip, lambda s: s == 'Y')
    l.add_xpath('gened', "/html/head/meta[@name='DC.Subject.GenED']/@content")

    l.add_xpath('faculty', "/html/head/meta[@name='DC.Subject.Faculty']/@content")
    l.add_xpath('school', (
        "//div[@class='column content-col']/div[@class='internalContentWrapper']"
        "/div[@class='summary']/p[strong[text()[contains(.,'School')]]]/a/text()"))
    l.add_xpath('campus', (
        "//div[@class='column content-col']/div[@class='internalContentWrapper']"
        "/div[@class='summary']/p[strong[text()[contains(.,'Campus')]]]/text()"))
    l.add_xpath('prereqs_str', (
        "//div[@class='column content-col']/div[@class='internalContentWrapper']"
        "/div[@class='summary']/p[text()[contains(.,'Prerequisite:')]]/text()"),
        re=r'Prerequisite:\s(.+)')
    l.add_xpath('eftsl', (
        "//div[@class='column content-col']/div[@class='internalContentWrapper']"
        "/div[@class='summary']/p[strong[text()[contains(.,'EFTSL')]]]/text()"))
    l.add_xpath('description_markup', (
        "//div[@class='column content-col']/div[@class='internalContentWrapper']"
        "/h2[text()='Description']/following-sibling::div"))

    course_item = l.load_item()
    yield course_item

    yield Request(
        url=response.xpath((
            "//div[@class='column content-col']/div[@class='internalContentWrapper']"
            "/div[@class='summary']//a[text()[contains(.,'Timetable')]]/@href")).extract()[0],
        callback=self.parse_class_item,
        meta=dict(course_identifier={k: course_item.get(k, None)
                                     for k in ('code', 'career', 'year', )}))
def parse_item(self, response): """Fields not populated by this method: email, mobile_number, latitude, longitude. """ il = ItemLoader(item=UKBusinessItem(), response=response) il.add_value("url", unicode(response.url)) il.add_xpath("name", './/h1[@class="title"]/a/text() | .//h1[@class="title"]/text()') address_text = response.xpath(".//address/text()[normalize-space()]").extract() address_text = [part.strip().rstrip(",") for part in address_text] address = ", ".join(address_text) il.add_value("address", address) il.add_xpath("postal_code", './/h3[@class="postcode"]/text()') il.add_xpath( "website", './/div[@class="contact-info"]//strong/a/@href |' './/div[@class="contact-info"]/ul/strong/span/text()', ) il.add_xpath("category", './/ul[contains(@class, "breadcrumb")]/li[last()]/a/text()') il.add_xpath("linkedin", './/ul[contains(@class, "social")]/li[@class="linkedIn"]/a/@href') il.add_xpath("description", './/div[@class="about-text"]/p/text()') phones_sp = response.xpath('.//div[@class="contact-info"]//li/span') for span in phones_sp: text = cond_set_value(span.xpath("text()[normalize-space()]").extract(), "") if "T:" in text: phone_number = cond_set_value(span.xpath(".//div/text()").extract()) il.add_value("phone_number", phone_number) if "F:" in text: fax_number = cond_set_value(span.xpath(".//div/text()").extract()) il.add_value("fax_number", fax_number) return il.load_item()
def parsePage(self, response):
    rentHouse = ItemLoader(item=RentItem(), response=response)
    rentHouse.add_value('id', self.name + '-' + response.url.split('/')[-1].split('.')[0])
    rentHouse.add_value('link', response.url)
    rentHouse.add_xpath('title', "//dl[@class = 'title']/dt/p/text()")
    return rentHouse.load_item()
def make_bot_task_error(self, task_id, code, message=''):
    t = ItemLoader(BotTaskError())
    t.message_in = lambda slist: [s[:1023] for s in slist]
    t.add_value('task_id', task_id)
    t.add_value('bot', self.name)
    t.add_value('code', code)
    t.add_value('message', message)
    return t.load_item()
def parse_item(self, response):
    l = ItemLoader(item=CrawlpictureItem(), response=response)
    l.add_xpath('name', '//h2/a/text()')
    l.add_css('tags', 'div.metaRight p::text')
    #l.add_xpath('image_urls', '//div[@id="picture"]/p/img/@src' or '//img[@class="scrollLoading"]/@src', Identity())
    l.add_css('image_urls', 'div.postContent img::attr(src)', Identity())
    l.add_value('url', response.url)
    return l.load_item()
def parse_item(self, response):
    for e in response.xpath('//table[@id="basic"]/tbody/tr'):
        l = ItemLoader(ProxyHunterItem(), selector=e)
        l.add_xpath('ip', 'td[2]/a/text()')
        l.add_xpath('port', 'td[3]/text()')
        l.add_xpath('prot', 'td[4]/a/text()')
        yield l.load_item()
def parse_item(self, response):
    l = ItemLoader(item=MeizituItem(), response=response)
    # l.add_xpath('name', '//div[@class="postContent"]/div[@id="picture"]/p/a/text()')
    # l.add_xpath('tags', '//div[@class="postContent"]')
    l.add_xpath('img_url', '//div[@class="text"]/p/br/img/@src', Identity())
    l.add_value('url', response.url)
    return l.load_item()
def parse_item(self, response):
    l = ItemLoader(item=MeizituItem(), response=response)
    l.add_xpath('name', '//h2/a/text()')
    l.add_xpath('tags', "//div[@id='maincontent']/div[@class='postmeta clearfix']/div[@class='metaRight']/p")
    l.add_xpath('image_urls', "//div[@id='picture']/p/img/@src", Identity())
    l.add_value('url', response.url)
    return l.load_item()
def parse(self, response):
    item_list = []
    for a in response.css(".menu_box .menu_main h2"):
        l = ItemLoader(item=CategoryItem(), response=response)
        # l.add_css('category', ".menu_box .menu_main h2")
        l.add_value("category", a.extract(), self.get_text)
        item_list.append(l.load_item())
    return item_list
def parse2(self, response):
    item = json.loads(response.body_as_unicode())
    for i in range(len(item['list'])):
        data_tmp = item['list'][i]
        loader = ItemLoader(item=XqtestItem())
        loader.add_value('title', data_tmp['data'])
        org = loader.load_item()
        yield org
def test_load_item_using_default_loader(self):
    i = TestItem()
    i['summary'] = u'lala'
    il = ItemLoader(item=i)
    il.add_value('name', u'marta')
    item = il.load_item()
    assert item is i
    self.assertEqual(item['summary'], u'lala')
    self.assertEqual(item['name'], [u'marta'])
def parse(self, response):
    sel = Selector(response)
    articulos = sel.xpath('/html/body/div[2]/div/div/div/div[1]/div[3]/div')
    for i, elem in enumerate(articulos):
        item = ItemLoader(Articulos(), elem)
        item.add_xpath('title', './/h3/text()')
        item.add_value('id', i)
        yield item.load_item()