    def get_player_info(self, response):
        loader = ItemLoader(item=NFL_Player_2015(), response=response)
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()

        number_and_position = response.xpath('//*[@id="content"]/div[3]/div[2]/div[3]/ul[1]/li[1]/text()').extract()

        # extract() always returns a list; guard against an empty match instead
        if number_and_position:
            parts = number_and_position[0].split()
            number = parts[0]
            position = parts[1] if len(parts) > 1 else ''
        else:
            number = ''
            position = ''

        loader.add_value('number', number)
        loader.add_value('position', position)
        loader.add_xpath('name', '//*[@id="content"]/div[3]/div[2]/h1/text()')
        loader.add_xpath('team', '//*[@id="content"]/div[3]/div[2]/div[3]/ul[1]/li[3]/a/text()')

        yield loader.load_item()
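For context, a minimal sketch of what the NFL_Player_2015 item behind this loader might look like. The field names are taken from the add_value/add_xpath calls above; the real definition may declare more fields or per-field processors:

import scrapy

class NFL_Player_2015(scrapy.Item):
    # Hypothetical reconstruction: one Field per name the loader populates above.
    number = scrapy.Field()
    position = scrapy.Field()
    name = scrapy.Field()
    team = scrapy.Field()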
    def parse_datasets(self, response):
        il = ItemLoader(item=Dataset(), response=response)
        t = datetime.datetime.now()
        timestamp = t.isoformat()
        il.default_output_processor = MapCompose(lambda v: v.strip())
        il.add_value('scraped_url', response.url)
        il.add_xpath('desc', '//a[contains(., "Description")]/following-sibling::div/p/text()')
        il.add_value('soc_id', response.url[-15:-6])  # Socrata-style "xxxx-xxxx" dataset id embedded in the URL
        il.add_xpath('name', '//*[@id="datasetName"]/text()')
        il.add_xpath('tags', '//dt[text()="Tags"]/following-sibling::dd/span/text()')
        il.add_xpath('permalink', '//dt[text()="Permalink"]/following-sibling::dd[1]/span/a/text()')
        il.add_xpath('dept', '//dt[text()="Department"]/following-sibling::dd[1]/span/text()')
        il.add_xpath('provided_by', '//dt[text()="Data Provided By"]/following-sibling::dd[1]/text()')
        il.add_xpath('category', '//dt[text()="Category"]/following-sibling::dd[1]/text()')
        il.add_xpath('soc_owner', '//a[@class="aboutAuthorName"]/text()')
        # il.add_xpath('contact_email', )
        il.add_xpath('date_created', '//span[@class="aboutCreateDate"]/span[@class="dateReplace"]/text()')
        il.add_xpath('date_updated', '//span[@class="aboutUpdateDate"]/span[@class="dateReplace"]/text()')
        il.add_xpath('num_ratings', '//dd[@class="totalTimesRated"]/text()')
        il.add_xpath('num_visits', '//dt[text()="Visits"]/following-sibling::dd[1]/text()')
        il.add_xpath('num_downloads', '//dt[text()="Downloads"]/following-sibling::dd[1]/text()')
        il.add_xpath('num_comments', '//dd[@class="numberOfComments"]/text()')
        il.add_xpath('num_contributors', '//dt[text()="Contributors"]/following-sibling::dd[1]/text()')
        il.add_xpath('permissions', '//dt[text()="Permissions"]/following-sibling::dd[1]/text()')
        il.add_xpath('row_count',   '//span[contains(@class, "row_count")]/text()')
        il.add_xpath('update_freq', '//dt[text()="Frequency"]/following-sibling::dd[1]/span/text()')
        il.add_value('timestamp', timestamp)
        il.add_xpath('set_type', '//span[@class="icon currentViewName"]/text()')

        return il.load_item()
Example 3
	def parse(self, response):
		match = re.search('/displaySeminarList/',response.url)

		if match:
			urls = response.xpath('//div[@class="internList splitEntry"]//@href').extract()
			for url in urls:
				url = response.urljoin(url)
				yield scrapy.Request(url, self.parse)
		else:
			table = response.xpath(self.seminar_list_xpath)
			corpId = parse_qs(urlparse(response.url).query)['corpId']
			for index,semi in enumerate(table):
				loader = ItemLoader(SeminarItem(),semi)
				loader.default_input_processor = MapCompose(unicode.strip)
				loader.default_output_processor = Join()
				loader.add_value('companyid',corpId)
				loader.add_xpath('name','//div[@id="headerWrap"]//h3/text()')
				loader.add_xpath('date','.//td[@class="date"]/text()', re=r'\d+/\d+/\d+')
				loader.add_xpath('time','.//td[@class="time"]/text()')
				loader.add_xpath('area','.//td[@class="area"]/text()')
				loader.add_xpath('place','.//td[@class="place"]/text()')
				loader.add_xpath('loc_n','.//td[@class="place"]//a', re=r'mycom_loc\|(\d+/\d+/\d+\.\d+),\d+/\d+/\d+\.\d+')
				loader.add_xpath('loc_e','.//td[@class="place"]//a', re=r'mycom_loc\|\d+/\d+/\d+\.\d+,(\d+/\d+/\d+\.\d+)')
				loader.add_xpath('target','.//td[@class="target"]/text()')
				yield loader.load_item()
    def parse_depth_chart(self, response):
        loader = ItemLoader(item=NFL_Team_2015(), response=response)
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()

        loader.add_xpath("division", '//*[@id="sub-branding"]/div[2]/text()')
        loader.add_xpath("name", '//*[@id="sub-branding"]/h2/a/b/text()')

        yield loader.load_item()
Example 5
    def parse_article(self, response):
        # Initialize some I/O processors
        join_all = Join('')
        take_first = TakeFirst()
        identity = Identity()
        prepend_url = PrependResponseUrl(response.url)
        strip_all, strip_one = StripAll(), StripOne()
        add_space_after_punct = AddSpaceAfterPunct()

        # Load PersonItem
        person_loader = ItemLoader(item=PersonItem(), response=response)
        person_loader.default_output_processor = take_first
        person_loader.add_css('name', 'h3.p-name::text', strip_all)
        person_loader.add_value('article_url', response.url)
        person_loader.add_css('pub_date', 'time.dt-published::attr(datetime)')
        person_loader.add_css('title', 'p.summary.p-summary::text', strip_all)
        person_loader.add_css('img_src', 'img.portrait::attr(src)', prepend_url)
        person_loader.add_xpath('bio', '//div[@class="e-content"]/p[count(preceding-sibling::h4)=1]/descendant-or-self::*/text()', join_all, add_space_after_punct)
        person_loader.add_xpath('hardware', '//div[@class="e-content"]/p[count(preceding-sibling::h4)=2]/descendant-or-self::*/text()', join_all, add_space_after_punct)
        person_loader.add_xpath('software', '//div[@class="e-content"]/p[count(preceding-sibling::h4)=3]/descendant-or-self::*/text()', join_all, add_space_after_punct)
        person_loader.add_xpath('dream', '//div[@class="e-content"]/p[count(preceding-sibling::h4)=4]/descendant-or-self::*/text()', join_all, add_space_after_punct)
        person_item = person_loader.load_item()

        # @gbrener 8/16/2015: The following line causes a NotImplementedError
        #object.__setattr__(person_item, 'export_empty_fields', True)
        person_item.fill_empty_fields()

        # Load a list of ToolItems
        tool_items = []
        for tool_selector in response.css('div.e-content p a'):
            tool_loader = ItemLoader(item=ToolItem(), selector=tool_selector, response=response)
            tool_loader.default_output_processor = take_first
            tool_loader.add_xpath('tool_name', './descendant-or-self::*/text()', join_all, strip_one)
            tool_loader.add_xpath('tool_url', './@href')
            tool_item = tool_loader.load_item()

            # @gbrener 8/16/2015: The following line causes a NotImplementedError
            #object.__setattr__(tool_item, 'export_empty_fields', True)
            tool_item.fill_empty_fields()

            tool_items.append(tool_item)

        yield dict(person=person_item, tools=tool_items)
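The I/O processors used above (PrependResponseUrl, StripAll, StripOne, AddSpaceAfterPunct) are custom classes not shown here. An item-loader processor is just a callable that receives the list of extracted values, so a plausible sketch of PrependResponseUrl, assuming it resolves relative image URLs against the page URL (Python 3 import shown), is:

from urllib.parse import urljoin

class PrependResponseUrl:
    # Hypothetical reconstruction of the processor used for 'img_src' above.
    def __init__(self, base_url):
        self.base_url = base_url

    def __call__(self, values):
        # Join each (possibly relative) URL against the response URL.
        return [urljoin(self.base_url, v) for v in values]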
Example 6
    def parse_item(self, response):
        l = ItemLoader(item=GetEmailsItem(), response=response)
        l.default_output_processor = MapCompose(lambda v: v.strip(), replace_escape_chars)

        emails = response.xpath('//text()').re(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,4}")

        l.add_value('email', emails)
        l.add_value('url', response.url)

        return l.load_item()
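As a quick check of what the MapCompose output processor does with the list passed to add_value above: each wrapped function is applied to every value, so all matched emails survive into the item. A standalone illustration (the import path is scrapy.loader.processors in older Scrapy, itemloaders.processors in newer releases):

from scrapy.loader.processors import MapCompose

strip = MapCompose(lambda v: v.strip())
print(strip(['  a@example.com ', ' b@example.org  ']))
# ['a@example.com', 'b@example.org']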
    def parse_auction_item(self, response):
        loader = ItemLoader(AuctionItems(), response=response)

        loader.default_input_processor = MapCompose(lambda v: v.split(), replace_escape_chars)
        loader.default_output_processor = Join()

        for field, xpath in auction_item_fields.iteritems():
            loader.add_xpath(field, xpath)

        yield loader.load_item()
Example 8
    def parse_item(self, response):
        """Returns scraped data from each individual job link."""
        l = ItemLoader(item=GumtreeItem(), response=response)
        l.default_output_processor = MapCompose(lambda v: v.strip(), replace_escape_chars)

        l.add_xpath('name', '//h1/text()')
        l.add_xpath('location', './/*[@id="ad-map"]/span[2]/text()')
        l.add_xpath('sender', '//*[@class="name"]/text()')
        l.add_xpath('description', '//*[@id="job-description"]/p/text()')
        l.add_value('job_type', response.meta['job_type'])

        return l.load_item()
Example 9
    def parse_post(self, response):
        data = json.loads(response.text)
        title = data['title']
        description = remove_tags(data['newsDetailContent'])
        date = data['displayDate']

        item = ItemLoader(item=CsobskItem(), response=response)
        item.default_output_processor = TakeFirst()
        item.add_value('title', title)
        item.add_value('description', description)
        item.add_value('date', date)

        return item.load_item()
Example 10
    def parse_post(self, response, title):
        description = response.xpath(
            '//div[@class="entry-content"]//text()[normalize-space()]').getall(
            )
        description = [p.strip() for p in description]
        description = ' '.join(description).strip()

        item = ItemLoader(item=CreditagricoleroItem(), response=response)
        item.default_output_processor = TakeFirst()
        item.add_value('title', title)
        item.add_value('description', description)

        return item.load_item()
Example 11
def load_author(response, author):
    string = response.xpath(author['names']).extract()[0].replace('\n', '').strip()
    names = [str.strip(i) for i in string.replace(' and ', ', ').split(',')]
    for name in names:
        l = ItemLoader(item = AuthorItem(), response = response)
        l.default_output_processor = TakeFirst()
        # author's first name and last name
        flname = name.split()
        fn = flname[0]
        ln = flname[-1]
        l.add_value('fname', fn)
        l.add_value('lname', ln)
        yield l.load_item()
Example 12
    def parse_post(self, response):
        title = response.xpath('//h2[@class="article-title"]/text()').get()
        description = response.xpath(
            '//article//text()[normalize-space() and not(ancestor::h2)]'
        ).getall()
        description = [p.strip() for p in description]
        description = ' '.join(description).strip()

        item = ItemLoader(item=GbkrsiItem(), response=response)
        item.default_output_processor = TakeFirst()
        item.add_value('title', title)
        item.add_value('description', description)
        return item.load_item()
Example 13
    def parse_item(self, response):
        """Returns fields: url_of_item, product, img_url, description, and price."""

        l = ItemLoader(item=EmmiscraperItem(), response=response)
        l.default_output_processor = MapCompose(lambda v: v.strip(), replace_escape_chars)

        l.add_value('url_of_item', response.url)
        l.add_value('product', response.meta['product'])
        l.add_xpath('img_url', '/html/body/div[3]/div[4]/div[2]/div[2]/div[1]/div[1]/div[1]/a/img/@src')
        l.add_xpath('description', '//*[@class="productListText widthFull noPadding"]/text()')
        l.add_xpath('price', '//*[@class="price"]/text()[2]')

        return l.load_item()
Example 14
    def parse(self, response):
        search_results = response.css('.tpl-append-results>div')

        for product in search_results:

            product_loader = ItemLoader(item=SouqItem(), selector=product)
            product_loader.default_output_processor = TakeFirst()
            product_loader.add_css('title', '.itemTitle::text')
            product_loader.add_css('link',
                                   '.item-content> .itemLink::attr(href)')
            product_loader.add_css('price', '.itemPrice::text')
            print('\n')
            yield product_loader.load_item()
Example 15
    def parse_node(self, response, selector):
        self.logger.info("selector %s", selector)
        l = ItemLoader(SatItem(), selector=selector, response=response)
        l.default_output_processor = TakeFirst()
        l.add_xpath("metadata", "atom:link[@rel='alternative']/@href")
        l.add_xpath("icon", "atom:link[@rel='icon']/@href")
        l.add_xpath("download", "atom:link/@href")
        l.add_xpath('footprint', "atom:str[@name='footprint']/text()")
        l.add_xpath('id', 'atom:id/text()')
        l.add_xpath('identifier', "atom:str[@name='identifier']/text()")
        l.add_value('requestId', self.request_id)
        return l.load_item()
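The atom: prefixes above only resolve if the namespace is registered. With an XMLFeedSpider this is usually declared on the spider class; a minimal sketch (the name and itertag are assumptions):

import scrapy

class SatSpider(scrapy.spiders.XMLFeedSpider):
    name = 'sat'  # hypothetical
    iterator = 'xml'  # required when itertag carries a namespace prefix
    itertag = 'atom:entry'
    namespaces = [('atom', 'http://www.w3.org/2005/Atom')]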
Example 16
    def parse_post(self, response):
        title = response.xpath('//h1/text()').get()
        description = response.xpath(
            '//div[@class="main-text"]/p/text()').getall()
        description = ' '.join(description)
        date = response.xpath('//div[@class="date"]/text()').get()

        item = ItemLoader(item=KvikaItem(), response=response)
        item.default_output_processor = TakeFirst()
        item.add_value('title', title)
        item.add_value('description', description)
        item.add_value('date', date)

        return item.load_item()
Example 17
	def parse_post(self, response):
		title = response.xpath('//div[@class="leftColumn"]/h3/text()').get()
		description = response.xpath('//div[@class="leftColumn"]/table//text()[normalize-space()]').getall()
		description = [p.strip() for p in description]
		description = ' '.join(description).strip()
		date = response.xpath('//div[@align="right"]/span[@style]/text()').get()

		item = ItemLoader(item=LibrabankItem(), response=response)
		item.default_output_processor = TakeFirst()
		item.add_value('title', title)
		item.add_value('description', description)
		item.add_value('date', date)

		return item.load_item()
Example 18
    def parse_answers(self, response):
        # use selector to extract answers
        selector = Selector(response)

        # iterate over answers
        for answer in selector.xpath(self.answers_list_xpath):
            loader = ItemLoader(item=ZhihuAnswer(), selector=answer)

            # define processors
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()

            # iterate over fields and add xpaths to the loader
            for field, xpath in self.item_fields.iteritems():
                loader.add_xpath(field, xpath)

            item = loader.load_item()

            # convert the full text of answer into html
            item["answer"] = item["answer"].encode('ascii',
                                                   'xmlcharrefreplace')

            # if summary has image, convert it to html
            if "summary_img" in item:
                item["summary_img"] = item["summary_img"].encode(
                    'ascii', 'xmlcharrefreplace')
            else:
                item['summary_img'] = ""

            # change vote to integer
            item["vote"] = int(item["vote"])

            # in case of anonymous authors
            if "author" not in item:
                item["author"] = u'匿名用户'

            # complete links
            item["question_link"] = u"http://www.zhihu.com" + item[
                "question_link"]

            if "author_link" in item:
                item["author_link"] = u"http://www.zhihu.com" + item[
                    "author_link"]
            else:
                item["author_link"] = ""

            # add the date when scraped
            item["date"] = date.today()

            yield item
    def parse(self, response):
        company_results = response.xpath(
            '//*[@id="scr-res-table"]/div[1]/table/tbody/tr')

        for company in company_results:
            details_loader = ItemLoader(item=CompanyDetailsItem(),
                                        selector=company)
            details_loader.default_output_processor = TakeFirst()

            details_loader.add_xpath('company_name', 'td[2]/text()')
            details_loader.add_xpath('company_price_intraday',
                                     'td[3]/span/text()')

            yield details_loader.load_item()
Example 20
	def parse_post(self, response, date):
		print(response)
		title = response.xpath('//span[@class="page_heading_inner"]/text()').get()
		description = response.xpath('//div[@class="page_content"]//text()[normalize-space()]').getall()
		description = [p.strip() for p in description]
		description = ' '.join(description).strip()

		item = ItemLoader(item=NbbankItem(), response=response)
		item.default_output_processor = TakeFirst()
		item.add_value('title', title)
		item.add_value('description', description)
		item.add_value('date', date)

		return item.load_item()
Example 21
    def parse_post(self, response, title, date):
        description = response.xpath(
            '//div[@class="module module--push2 richtext"]//text()[normalize-space()]'
        ).getall()
        description = [p.strip() for p in description]
        description = ' '.join(description).strip()

        item = ItemLoader(item=MetzlerItem(), response=response)
        item.default_output_processor = TakeFirst()
        item.add_value('title', title)
        item.add_value('description', description)
        item.add_value('date', date)

        return item.load_item()
Example 22
	def parse_post(self, response):
		title = response.xpath('//h1/text()').get()
		description = response.xpath('//div[@class="blurb"]//text()[normalize-space()]').getall()
		description = [p.strip() for p in description]
		description = ' '.join(description).strip()
		date = response.xpath('//div[@class="date"]/text()').get()

		item = ItemLoader(item=FsbwaItem(), response=response)
		item.default_output_processor = TakeFirst()
		item.add_value('title', title)
		item.add_value('description', description)
		item.add_value('date', date)

		return item.load_item()
Example 23
	def parse_post(self, response):
		title = response.xpath('//h1/text()').get()
		description = response.xpath('//div[@class="container containerDetalji"]/div[@class="row"]//text()[normalize-space() and not(ancestor::h1)]').getall()
		description = [p.strip() for p in description]
		description = ' '.join(description).strip()
		date = response.xpath('//div[@class="datumDetalji"]/p/text()').get()

		item = ItemLoader(item=PrvabankacgItem(), response=response)
		item.default_output_processor = TakeFirst()
		item.add_value('title', title)
		item.add_value('description', description)
		item.add_value('date', date)

		return item.load_item()
Example 24
    def parse(self, response):
        self.logger.info('%s', response.url)
        fields = WarrantInfoItem.Meta.fields
        rows = response.xpath('//tr[count(td)=21]').extract()
        for row in rows:
            loader = ItemLoader(item=WarrantInfoItem(),
                                selector=Selector(text=row))
            loader.default_input_processor = MapCompose(str, str.strip)
            loader.default_output_processor = TakeFirst()
            loader.add_value('date', self.date)
            for idx, field in enumerate(fields, start=1):
                if field:
                    loader.add_xpath(field, f'//td[{idx}]/text()')
            yield loader.load_item()
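The loop above assumes WarrantInfoItem.Meta.fields is a sequence aligned with the 21 table columns, where falsy entries mark columns to skip. A minimal sketch of that pattern (all field names here are invented):

import scrapy

class WarrantInfoItem(scrapy.Item):
    date = scrapy.Field()
    code = scrapy.Field()
    name = scrapy.Field()
    price = scrapy.Field()

    class Meta:
        # One entry per <td> in column order; '' marks an ignored column.
        # The real tuple would be padded out to 21 entries.
        fields = ('code', 'name', '', 'price')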
Example 25
    def parse_post(self, response, date, title):
        content = response.xpath('//div[@class="post-body"]//text()').getall()
        content = [p.strip() for p in content if p.strip()]
        content = re.sub(pattern, "", ' '.join(content))  # "pattern": a module-level regex assumed to be defined elsewhere

        item = ItemLoader(item=CcubbankItem(), response=response)
        item.default_output_processor = TakeFirst()

        item.add_value('title', title)
        item.add_value('link', response.url)
        item.add_value('content', content)
        item.add_value('date', date)

        yield item.load_item()
Example 26
    def parseCubeInfo(self, response, uid, screen_name, symbolList):
        jsonresponse = json.loads(response.body_as_unicode())
        for s in symbolList:
            loader = ItemLoader(item=StockCubesItem())
            loader.default_input_processor = MapCompose(str)
            loader.default_output_processor = Join(' ')
            for (field, path) in self.jmes_paths.items():
                loader.add_value(field, SelectJmes(path)(jsonresponse[s]))
            item = loader.load_item()
            owner = OwnerItem()
            owner['id'] = uid
            owner['screen_name'] = screen_name
            item['owner'] = owner
            yield item
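SelectJmes wraps a JMESPath expression in a processor-style callable: invoked on the decoded JSON, it returns the value at that path (it needs the jmespath package; jmes_paths above is a field-to-expression mapping defined elsewhere on the spider). A standalone example:

from scrapy.loader.processors import SelectJmes  # itemloaders.processors in newer Scrapy

select_name = SelectJmes('owner.screen_name')
print(select_name({'owner': {'screen_name': 'alice'}}))
# alice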
Example 27
    def parse_partners(self, response):
        company_url = response.xpath(
            '//*[@id="profile_header_heading"]/a/@href').extract_first()
        partner_selectors = response.css('div.partners').xpath(
            './/ul/li//h4/a')

        for sel in partner_selectors:
            loader = ItemLoader(item=Partner(), selector=sel)
            loader.default_input_processor = processors.MapCompose(
                w3lib.html.remove_tags)
            loader.default_output_processor = processors.TakeFirst()
            loader.add_value('focal_company_url', company_url)
            loader.add_xpath('partner_url', '@href')
            yield loader.load_item()
Example 28
    def parse_advisors(self, response):
        company_url = response.xpath(
            '//*[@id="profile_header_heading"]/a/@href').extract_first()
        employee_selector = response.css('div.advisors').xpath('.//ul/li')

        for sel in employee_selector:
            loader = ItemLoader(item=BoardMember(), selector=sel)
            loader.default_input_processor = processors.MapCompose(
                w3lib.html.remove_tags)
            loader.default_output_processor = processors.TakeFirst()
            loader.add_value('company_url', company_url)
            loader.add_xpath('person_url', './/h4/a/@href')
            loader.add_xpath('title', './/h5/text()')
            yield loader.load_item()
Example 29
    def parse_post(self, response):
        title = response.xpath('//h1//text()').get()
        description = response.xpath(
            '(//div[@class="ce-bodytext"])[position()<last()]//text()[normalize-space() and not(ancestor::h1)]'
        ).getall()
        description = [p.strip() for p in description]
        description = ' '.join(description).strip()

        item = ItemLoader(item=AkabankdeItem(), response=response)
        item.default_output_processor = TakeFirst()
        item.add_value('title', title)
        item.add_value('description', description)

        return item.load_item()
Example 30
	def parse_post(self, response):
		title = response.xpath('//h2/text()').get()
		description = response.xpath('//div[@class="container main"]//text()[normalize-space()]').getall()
		description = [p.strip() for p in description if '{' not in p]
		description = ' '.join(description).strip()
		date = re.findall(r'[A-Za-z]+\s\d{1,2},\s\d{4}', description) or ['']

		item = ItemLoader(item=CnbankpaItem(), response=response)
		item.default_output_processor = TakeFirst()
		item.add_value('title', title)
		item.add_value('description', description)
		item.add_value('date', date[0])

		return item.load_item()
Example 31
	def parse_post(self, response):
		title = response.xpath('//div[@class="contenuti-header-bold-news-rassegna-stampa-dettaglio"]/text()').get()
		description = response.xpath('//div[@class="col-xs-12"][@style="paddijng-right; 0px; padding-left: 0px;"]//text()[normalize-space() and not(ancestor::noscript | ancestor::div[@class="contenuti-header-bold-news-rassegna-stampa-dettaglio"])]').getall()
		description = [p.strip() for p in description]
		description = ' '.join(description).strip()
		date = response.xpath('(//div[@class="contenuti-testo-news-rassegna-stampa"]/p/text())[1]').get()

		item = ItemLoader(item=BancacrsItem(), response=response)
		item.default_output_processor = TakeFirst()
		item.add_value('title', title)
		item.add_value('description', description)
		item.add_value('date', date)

		return item.load_item()
Example 32
	def parse_post(self, response, date, title):
		content = response.xpath('//div[@class="news-body"]//text() | //div[@class="body field"]//text()[not (ancestor::em)] | //div[contains(@class,"field field-name")]//text()').getall()
		content = [p.strip() for p in content if p.strip()]
		content = re.sub(pattern, "", ' '.join(content))

		item = ItemLoader(item=BanrepItem(), response=response)
		item.default_output_processor = TakeFirst()

		item.add_value('title', title)
		item.add_value('link', response.url)
		item.add_value('content', content)
		item.add_value('date', date)

		yield item.load_item()
Example 33
    def parse_post(self, response, date, title):
        description = response.xpath(
            '//div[@class="w-auto mw-full rte"]//text()[normalize-space()]'
        ).getall()
        description = [p.strip() for p in description]
        description = ' '.join(description).strip()

        item = ItemLoader(item=BcrlocuinteroItem(), response=response)
        item.default_output_processor = TakeFirst()
        item.add_value('title', title)
        item.add_value('description', description)
        item.add_value('date', date)

        return item.load_item()
Example 34
    def parse_article(self, response):
        item = ItemLoader(Article())
        item.default_output_processor = TakeFirst()

        title = response.xpath('//h1[@class="entry-title"]/text()').get().strip()
        content = response.xpath('//div[@class="entry-content"]//text()').getall()
        content = [text for text in content if text.strip()]
        content = "\n".join(content).strip()

        item.add_value('title', title)
        item.add_value('link', response.url)
        item.add_value('content', content)

        return item.load_item()
Example 35
	def parse_post(self, response):
		title = response.xpath('//h1[@class="page-title"]/text()').get()
		description = response.xpath('//div[@class="l-content"]//text()[normalize-space() and not(ancestor::h1 | ancestor::h4)]').getall()
		description = [p.strip() for p in description]
		description = ' '.join(description).strip()
		date = response.xpath('//div[@class="l-content"]//h4/text()').get()

		item = ItemLoader(item=EssexbankItem(), response=response)
		item.default_output_processor = TakeFirst()
		item.add_value('title', title)
		item.add_value('description', description)
		item.add_value('date', date)

		return item.load_item()
Example 36
    def parse_item(self, response):
        self.logger.info("starting parse item")
        l = ItemLoader(item=IndexItem(), response=response)
        l.default_output_processor = TakeFirst()
        l.add_value("url", response.url)
        l.add_value("retrived", 0)

        l.add_value("source", response.request.url)
        l.add_value("project", self.settings.get("BOT_NAME"))
        l.add_value("spider", self.name)
        l.add_value("server", socket.gethostname())
        l.add_value("date", datetime.datetime.utcnow())

        yield l.load_item()
Example 37
	def parse_post(self, response):
		title = response.xpath('//h1[@class="edn_articleTitle"]/text()').get()
		description = response.xpath('(//article//p | //article//ul)//text()[normalize-space()]').getall()
		description = [p.strip() for p in description if '{' not in p]
		description = ' '.join(description).strip()
		date = response.xpath('//time/text()').get().split('on')[1]

		item = ItemLoader(item=WaynesavingsItem(), response=response)
		item.default_output_processor = TakeFirst()
		item.add_value('title', title)
		item.add_value('description', description)
		item.add_value('date', date)

		return item.load_item()
Example 38
	def parse_post(self, response):
		title = response.xpath('//div[@class="banner-content no-offer"]/h1/text()').get()
		description = response.xpath('//div[@class="article-text text"]//text()[normalize-space()]').getall()
		description = [p.strip() for p in description]
		description = ' '.join(description).strip()
		date = response.xpath('//div[@class="banner-content no-offer"]/p/text()').get()

		item = ItemLoader(item=CommbankcomauItem(), response=response)
		item.default_output_processor = TakeFirst()
		item.add_value('title', title)
		item.add_value('description', description)
		item.add_value('date', date)

		return item.load_item()
Example 39
    def parse_post(self, response):
        title = response.xpath('//h1/text()').get()
        description = response.xpath(
            '//div[@id="printableContent"]//text()[normalize-space() and not(ancestor::h1 | ancestor::script)]'
        ).getall()
        description = [p.strip() for p in description]
        description = ' '.join(description).strip()

        item = ItemLoader(item=KbmkItem(), response=response)
        item.default_output_processor = TakeFirst()
        item.add_value('title', title)
        item.add_value('description', description)

        return item.load_item()
Example 40
	def parse_post(self, response, date):
		title = response.xpath('//h1/text()').get()
		description = response.xpath('//div[@class="entry-content container px-5"]/p/text()').getall()

		description = [p.strip() for p in description]
		description = ' '.join(description).strip()

		item = ItemLoader(item=GuberItem(), response=response)
		item.default_output_processor = TakeFirst()
		item.add_value('title', title)
		item.add_value('description', description)
		item.add_value('date', date)

		return item.load_item()
    def parse_bids(self, response):
        selector = Selector(response)

        for bid in selector.xpath(self.bid_list_xpath):
            loader = ItemLoader(BidItems(), selector=bid)

            loader.default_input_processor = MapCompose(lambda v: v.split(), replace_escape_chars)
            loader.default_output_processor = Join()

            for field, xpath in auction_bid_fields.iteritems():
                loader.add_xpath(field, xpath)

            yield loader.load_item()
Example 42
    def parse_items(self, response):

        gooseobj = self.g.extract(response.url)
        fulltext = gooseobj.cleaned_text

        il = ItemLoader(item=SoloItem(), response=response)
        il.default_output_processor = MapCompose(
            lambda v: v.rstrip(), lambda v: re.sub(r"[\',|!]", "", v), lambda v: re.sub(r"\s+", " ", v)
        )

        il.add_value("siteurl", parse_base_url(response.url))
        il.add_value("pageurl", response.url)
        il.add_value("text", fulltext.encode("ascii", "ignore"))
        il.add_xpath("pagetitle", "//title/text()")

        return il.load_item()
Example 43
    def parse(self, response):
        def strip_dollar(x):
            return x.strip('$')

        self.driver.get(response.url)
        try:
            WebDriverWait(self.driver, 15).until(
                EC.presence_of_element_located(
                    (By.XPATH,
                        '//*[@id="depart-container"]/div[2]/div[1]/div/*[@style="width: 0%;"]')))
        except TimeoutException:
            print 'Page load timed out'

        while True:
            try:
                try:
                    WebDriverWait(self.driver, 15).until(
                        EC.presence_of_element_located(
                            (By.XPATH,
                                '//*[@id="depart-container"]/div/div/div/button')))
                except TimeoutException:
                    break

                next_button = self.driver.find_element_by_xpath(
                    '//*[@id="depart-container"]/div/div/div/button')
                next_button.click()

            except ElementNotVisibleException:
                break
        for trips in Selector(
                text=self.driver.page_source).xpath(self.trips_list_xpath):
            loader = ItemLoader(BusTrip(), selector=trips)

            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()
            loader.price_in = MapCompose(strip_dollar)

            for field, xpath in self.item_fields.iteritems():
                loader.add_xpath(field, xpath)
            dateoftrip = str(response.url).split("/")[-1]
            loader.add_value('dateoftrip', dateoftrip.decode('unicode-escape'))
            yield loader.load_item()
    def parse_item(self, response):
        """ Returns fields from each individual attendee.

        @url http://openstacksummitnovember2014paris.sched.org/cfb
        @scrapes name image_url friends title_company_location links about

        """
        l = ItemLoader(item=ItemloadItem(), response=response)
        l.default_output_processor = MapCompose(lambda v: v.strip(), replace_escape_chars)

        l.add_xpath('name', '//*[@id="sched-page-me-name"]/text()')
        l.add_xpath('image_url', '//*[@id="myavatar"]/@src')
        l.add_xpath('friends', '//*[@id="sched-page-me-connections"]/ul/li/a/@title')
        l.add_xpath('title_company_location', '//*[@id="sched-page-me-profile-data"]/text()')
        l.add_xpath('links', '//*[@class="sched-network-link"]/a/@href')
        l.add_xpath('about', '//*[@id="sched-page-me-profile-about"]/text()')

        return l.load_item()
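The docstring above is a Scrapy contract: running `scrapy check` against this spider fetches the page named by @url and fails the check if any field listed after @scrapes is missing from the returned item.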
    def parse_CatalogRecord(self, response):
        CatalogRecord = ItemLoader(item=catalogscraperItem(), response=response)
        CatalogRecord.default_output_processor = TakeFirst()
        keywords = '|'.join(r"\b" + re.escape(word.strip()) + r"\b" for word in open('Catalog_Scraper/spiders/keys.txt'))
        r = re.compile('.*(%s).*' % keywords, re.IGNORECASE | re.MULTILINE | re.UNICODE)
        if r.search(response.body_as_unicode()):
            # The following lines tell the spider how to populate the fields defined in
            # "items.py". The first argument of "CatalogRecord.add_xpath" indicates which
            # field to fill; the second is an XPath directing the spider to where the
            # relevant information lives on a given webpage.
            CatalogRecord.add_xpath('title', './/div[@id="dublin-core-title"]/div[@class="element-text"]/text()')
            # CatalogRecord.add_xpath('subject', '')
            # CatalogRecord.add_xpath('description', '')
            # CatalogRecord.add_xpath('creator', '')
            # CatalogRecord.add_xpath('source', '')
            # CatalogRecord.add_xpath('published', '')
            # CatalogRecord.add_xpath('rights', '')
            # CatalogRecord.add_xpath('citation', '')
            # CatalogRecord.add_xpath('url', '')
            return CatalogRecord.load_item()
Example 48
    def parse_items(self, response):

        # fulltext = self.parse_body_text(response)
        gooseobj = self.g.extract(response.url)
        fulltext = gooseobj.cleaned_text

        il = ItemLoader(item=SuspiderItem(), response=response)
        il.default_output_processor = MapCompose(
            lambda v: v.rstrip(),
            lambda v: re.sub(r'[\',|!]', '', v),
            lambda v: re.sub(r'\s+', ' ', v)
        )

        il.add_value('siteurl', self.parse_base_url(response.url))
        il.add_value('pageurl', response.url)
        il.add_value('text', fulltext.encode('ascii', 'ignore'))
        il.add_xpath('pagetitle', '//title/text()')
        # il.add_xpath('keywords', '//meta[@name="keywords"]/@content')

        yield il.load_item()
    def parse_game_log(self, response):
        games_table = []

        # If default selector returns empty, try second selector
        rows = response.xpath('//*[@id="content"]/div[6]/div[1]/div/div[4]/div/table[1]/tr')
        if not rows:
            rows = response.xpath('//*[@id="content"]/div[6]/div[1]/div/div[3]/div/table/tr')

        # Build up game stats table to be turned into scrapy items
        for row in rows:

            game_row = []
            for col in row.xpath("td"):

                opponent = col.xpath("ul/li[3]/a/text()").extract()
                game_result = col.xpath("a/text()").extract()
                other_value = col.xpath("text()").extract()

                if opponent:
                    game_row.append(opponent[0])
                elif game_result:
                    game_row.append(game_result[0])
                else:
                    game_row.append(other_value[0] if other_value else '')

            games_table.append(game_row)

        # print json.dumps(games_table, indent=1)

        # Load game stat rows into scrapy items
        for game_row in games_table:
            print(game_row)

            loader = ItemLoader(item=NFL_RB_Game_2015(), response=response)
            loader.default_output_processor = TakeFirst()
            loader.add_xpath("player_name", '//*[@id="content"]/div[3]/div[2]/h1/text()', MapCompose(unicode.strip))

            # Handle regular season totals row
            if game_row[0] == "REGULAR SEASON STATS":
                loader.add_value("is_season_totals", True)
                loader.add_value("date", None)
                loader.add_value("opponent", None)
                loader.add_value("result", None)

                loader.add_value("rush_attempts", int(game_row[1]))
                loader.add_value("rush_yards", int(game_row[2]))
                loader.add_value("avg_yards_per_rush", float(game_row[3]))
                loader.add_value("longest_rush", int(game_row[4]))
                loader.add_value("rush_tds", int(game_row[5]))
                loader.add_value("receptions", int(game_row[6]))
                loader.add_value("rec_yards", int(game_row[7]))
                loader.add_value("avg_yards_per_rec", float(game_row[8]))
                loader.add_value("longest_rec", int(game_row[9]))
                loader.add_value("rec_tds", int(game_row[10]))
                loader.add_value("fumbles", int(game_row[11]))
                loader.add_value("fumbles_lost", int(game_row[12]))

                yield loader.load_item()

            # Handle regular season individual game row (ignores two header rows)
            elif game_row[0] != "2015 REGULAR SEASON GAME LOG" and game_row[0] != "DATE":

                # Parse date from string containing day of the week and date
                day_and_date = game_row[0]
                date = day_and_date.split()[1] + "/15"

                loader.add_value("is_season_totals", False)
                loader.add_value("date", date)
                loader.add_value("opponent", game_row[1])
                loader.add_value("result", game_row[2])

                loader.add_value("rush_attempts", int(game_row[3]))
                loader.add_value("rush_yards", int(game_row[4]))
                loader.add_value("avg_yards_per_rush", float(game_row[5]))
                loader.add_value("longest_rush", int(game_row[6]))
                loader.add_value("rush_tds", int(game_row[7]))
                loader.add_value("receptions", int(game_row[8]))
                loader.add_value("rec_yards", int(game_row[9]))
                loader.add_value("avg_yards_per_rec", float(game_row[10]))
                loader.add_value("longest_rec", int(game_row[11]))
                loader.add_value("rec_tds", int(game_row[12]))
                loader.add_value("fumbles", int(game_row[13]))
                loader.add_value("fumbles_lost", int(game_row[14]))

                yield loader.load_item()
    def parse_game_log(self, response):
        games_table = []

        # If default selector returns empty, try second selector
        rows = response.xpath('//*[@id="content"]/div[6]/div[1]/div/div[4]/div/table[1]/tr')
        if not rows:
            rows = response.xpath('//*[@id="content"]/div[6]/div[1]/div/div[3]/div/table/tr')

        # Build up game stats table to be turned into scrapy items
        for row in rows:

            game_row = []
            for col in row.xpath('td'):

                opponent = col.xpath('ul/li[3]/a/text()').extract()
                game_result = col.xpath('a/text()').extract()
                other_value = col.xpath('text()').extract()

                if opponent:
                    game_row.append(opponent[0])
                elif game_result:
                    game_row.append(game_result[0])
                else:
                    game_row.append(other_value[0] if other_value else '')

            games_table.append(game_row)

        # print json.dumps(games_table, indent=1)

        # Load game stat rows into scrapy items
        for game_row in games_table:
            print(game_row)
            
            # Initialize WR game item to be loaded
            loader = ItemLoader(item=NFL_WR_Game_2015(), response=response)

            # Set default output process to take first item from selector
            loader.default_output_processor = TakeFirst()

            # Add player name
            loader.add_xpath('player_name', '//*[@id="content"]/div[3]/div[2]/h1/text()', MapCompose(unicode.strip))

            # Handle regular season totals row
            if game_row[0] == 'REGULAR SEASON STATS':
                loader.add_value('is_season_totals', True)
                loader.add_value('date', None)
                loader.add_value('opponent', None)
                loader.add_value('result', None)

                loader.add_value('receptions', int(game_row[1]))
                loader.add_value('targets', int(game_row[2]))
                loader.add_value('rec_yards', int(game_row[3]))
                loader.add_value('avg_yards_per_rec', float(game_row[4]))
                loader.add_value('longest_rec', int(game_row[5]))
                loader.add_value('rec_tds', int(game_row[6]))
                loader.add_value('rush_attempts', int(game_row[7]))
                loader.add_value('rush_yards', int(game_row[8]))
                loader.add_value('avg_yards_per_rush', float(game_row[9]))
                loader.add_value('longest_rush', int(game_row[10]))
                loader.add_value('rush_tds', int(game_row[11]))
                loader.add_value('fumbles', int(game_row[12]))
                loader.add_value('fumbles_lost', int(game_row[13]))

                yield loader.load_item()

            # Handle regular season individual game row (ignores two header rows)
            elif game_row[0] != '2015 REGULAR SEASON GAME LOG' and game_row[0] != 'DATE':

                # Parse date from string containing day of the week and date
                day_and_date = game_row[0]
                date = day_and_date.split()[1] + '/15'

                loader.add_value('is_season_totals', False)
                loader.add_value('date', date)
                loader.add_value('opponent', game_row[1])
                loader.add_value('result', game_row[2])

                loader.add_value('receptions', int(game_row[3]))
                loader.add_value('targets', int(game_row[4]))
                loader.add_value('rec_yards', int(game_row[5]))
                loader.add_value('avg_yards_per_rec', float(game_row[6]))
                loader.add_value('longest_rec', int(game_row[7]))
                loader.add_value('rec_tds', int(game_row[8]))
                loader.add_value('rush_attempts', int(game_row[9]))
                loader.add_value('rush_yards', int(game_row[10]))
                loader.add_value('avg_yards_per_rush', float(game_row[11]))
                loader.add_value('longest_rush', int(game_row[12]))
                loader.add_value('rush_tds', int(game_row[13]))
                loader.add_value('fumbles', int(game_row[14]))
                loader.add_value('fumbles_lost', int(game_row[15]))

                yield loader.load_item()
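Both game-log parsers cast raw table cells with bare int()/float(), which raises ValueError on placeholder cells (for example a dash for a missed game). A small defensive helper one could thread through the add_value calls, offered as a sketch rather than part of the original spiders:

def to_int(value, default=0):
    # Best-effort integer conversion for stat cells like '12'; returns
    # `default` for non-numeric placeholders instead of raising.
    try:
        return int(value)
    except (TypeError, ValueError):
        return default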