Esempio n. 1
0
    def parse_post(self, response, date):
        """Extract title and body from a post page and yield a BbanksaItem."""
        # The title is either inside a nested <p> or a direct text node of
        # the lead div; fall back to the second form when the first is empty.
        title = response.xpath(
            '//div[@class="lead top-margin3-xs"]/p//text()').get()
        title = title or response.xpath(
            '//div[@class="lead top-margin3-xs"]/text()').get()
        fragments = response.xpath(
            '//div[@class="body-copy4 parbase section"]//text()[not (ancestor::script)]'
        ).getall()
        joined = ' '.join(f.strip() for f in fragments if f.strip())
        content = re.sub(pattern, "", joined)

        loader = ItemLoader(item=BbanksaItem(), response=response)
        loader.default_output_processor = TakeFirst()

        loader.add_value('title', title)
        loader.add_value('link', response.url)
        loader.add_value('content', content)
        loader.add_value('date', date)

        yield loader.load_item()
Esempio n. 2
0
    def parse_article(self, response, date):
        """Build an Article item from a news page; PDF URLs are skipped."""
        if 'pdf' in response.url:
            return

        loader = ItemLoader(Article())
        loader.default_output_processor = TakeFirst()

        raw_title = response.xpath('//h2//text()').get()
        title = raw_title.strip() if raw_title else raw_title

        texts = response.xpath('//div[@class="col-sm-9"]//text()').getall()
        content = "\n".join(t for t in texts if t.strip()).strip()

        loader.add_value('title', title)
        loader.add_value('date', date)
        loader.add_value('link', response.url)
        loader.add_value('content', content)

        return loader.load_item()
Esempio n. 3
0
    def parse_article(self, response, date):
        """Parse an article page into an Article item (PDF URLs are ignored)."""
        if 'pdf' in response.url.lower():
            return

        loader = ItemLoader(Article())
        loader.default_output_processor = TakeFirst()

        raw = response.xpath('//title/text()').get()
        # Page titles look like "Headline - Site"; keep the leading part.
        title = raw.split('-')[0].strip() if raw else raw

        texts = response.xpath('//div[@data-emptytext="Text"]//text()').getall()
        # Drop blanks and fragments containing '{' (inline template/JSON noise).
        pieces = [t.strip() for t in texts if t.strip() and '{' not in t]
        content = " ".join(pieces).strip()

        loader.add_value('title', title)
        loader.add_value('date', date)
        loader.add_value('link', response.url)
        loader.add_value('content', content)

        return loader.load_item()
Esempio n. 4
0
    def parse_post(self, response):
        """Parse a post whose <h1> is "<date> – <title>" and yield an NnovoItem.

        Some pages separate the two parts with a plain hyphen instead of an
        en dash, so a '-' split is attempted when the '–' split fails.
        """
        # Fetch the header once; the original evaluated the identical XPath
        # four times (twice per branch).
        header = response.xpath('//h1/text()').get()
        try:
            parts = header.split('–')
            date, title = parts[0].strip(), parts[1].strip()
        except IndexError:
            parts = header.split('-')
            date, title = parts[0].strip(), parts[1].strip()
        content = response.xpath(
            '//div[@class="page-content"]//text()').getall()
        content = [p.strip() for p in content if p.strip()]
        content = re.sub(pattern, "", ' '.join(content))

        item = ItemLoader(item=NnovoItem(), response=response)
        item.default_output_processor = TakeFirst()

        item.add_value('title', title)
        item.add_value('link', response.url)
        item.add_value('content', content)
        item.add_value('date', date)

        yield item.load_item()
Esempio n. 5
0
    def parse_post(self, response):
        """Scrape a story page and yield a CbcItem."""
        raw_date = response.xpath(
            '//span[contains(@class,"story-date")]/text()').get()
        date = re.findall(r'\d+\s\w+\s\d+', raw_date)
        title = response.xpath('//h1/text()').get()
        fragments = response.xpath(
            '//div[@class="story__column story__column--content"]//text()[not (ancestor::h1 or ancestor::p[@class="story-intro"])]'
        ).getall()
        fragments += response.xpath(
            '//section[@class="story__content"]//text()').getall()
        joined = ' '.join(p.strip() for p in fragments if p.strip())
        content = re.sub(pattern, "", joined)

        loader = ItemLoader(item=CbcItem(), response=response)
        loader.default_output_processor = TakeFirst()

        loader.add_value('title', title)
        loader.add_value('link', response.url)
        loader.add_value('content', content)
        loader.add_value('date', date)

        yield loader.load_item()
Esempio n. 6
0
    def parse_post(self, response):
        """Re-fetch the page via requests (custom headers/payload) and parse it."""
        fetched = requests.request("GET",
                                   response.url,
                                   headers=headers,
                                   data=payload)
        selector = scrapy.Selector(text=fetched.text)
        title = selector.xpath('//h3[@class="HeaderTitle"]/span/text()').get()
        paragraphs = selector.xpath(
            '//div[@class="PageContent"]//p//text()[normalize-space() and not(ancestor::i)]'
        ).getall()
        # Fragments containing '{' (template/JSON noise) are discarded.
        description = ' '.join(
            p.strip() for p in paragraphs if '{' not in p).strip()
        date = selector.xpath('//p[@style="text-align:right;"]/i/text()').get()

        loader = ItemLoader(item=AbkeahliItem(), response=response)
        loader.default_output_processor = TakeFirst()
        loader.add_value('title', title)
        loader.add_value('description', description)
        loader.add_value('date', date)

        return loader.load_item()
Esempio n. 7
0
    def parse_post(self, response):
        """Parse a news page (two known layouts) and yield a SalingItem."""
        date = response.xpath(
            '//div[@class="news nyhedsdetaljedato col-md-12"]/text()').get()
        title = response.xpath(
            '//div[@class="news h2 nyhedsdetaljeteaser col-md-12"]/text() | //h1/text()'
        ).get()
        fragments = response.xpath(
            '//div[@class="nyhedsdetaljetekst col-md-6"]//text() | //div[@class="col-md-8 txtbox"]//text()'
        ).getall()
        joined = ' '.join(p.strip() for p in fragments if p.strip())
        content = re.sub(pattern, "", joined)

        loader = ItemLoader(item=SalingItem(), response=response)
        loader.default_output_processor = TakeFirst()

        loader.add_value('title', title)
        loader.add_value('link', response.url)
        loader.add_value('content', content)
        loader.add_value('date', date)

        yield loader.load_item()
Esempio n. 8
0
    def parse_post(self, response):
        """Parse a post page into a CrcentoitItem.

        The page has no dedicated date element, so the date is pulled out of
        the description text with a regex; a missing match yields ''.
        """
        title = response.xpath(
            '//div[@class="FrutigerNeueMedium_18 color_green align_left"]//text()'
        ).get()
        description = response.xpath(
            '//div[@class="proximanova_16_light line_height_20"]//text()[normalize-space()]'
        ).getall()
        description = ' '.join(p.strip() for p in description).strip()
        # Narrowed from a bare `except`: only a missing match (findall returns
        # an empty list, so [0] raises IndexError) should blank the date —
        # a bare except also hid real bugs such as a wrong-typed description.
        try:
            date = re.findall(r'\d{1,2}\s[a-z]{3,}\s\d{4}', description)[0]
        except IndexError:
            date = ''

        item = ItemLoader(item=CrcentoitItem(), response=response)
        item.default_output_processor = TakeFirst()
        item.add_value('title', title)
        item.add_value('description', description)
        item.add_value('date', date)

        return item.load_item()
Esempio n. 9
0
class NewsItems(scrapy.Item):
    """Scraped news record; field processors normalise values at load time."""
    # u2int and StrProcess are project helpers — presumably an int converter
    # and a text cleaner; confirm in their defining module.
    UrlHash = scrapy.Field(input_processor=MapCompose(u2int),output_processor=TakeFirst())
    Link = scrapy.Field( input_processor=MapCompose(str.strip),output_processor=TakeFirst())
    Epic = scrapy.Field( input_processor=MapCompose(str.strip),output_processor=TakeFirst())
    Article = scrapy.Field(input_processor=MapCompose(StrProcess),output_processor=TakeFirst())
    SHash = scrapy.Field( input_processor=MapCompose(u2int),output_processor=TakeFirst())
    Artlen = scrapy.Field(input_processor=MapCompose(u2int),output_processor=TakeFirst())

    def __str__(self):
        # Intentionally empty string: suppresses the default full-item dump
        # when the item is printed/logged (see the commented __repr__ below).
        return ""



    # def __repr__(self):
    #     """only print out attr1 after exiting the Pipeline"""
    #     return repr({"Title": self.Title})
Esempio n. 10
0
    def parse_article(self, response):
        """Parse an Elementor-based article page into an Article item.

        Media URLs (pdf/jpg/mp4) and pages without any h1/h2 text are
        skipped entirely.
        """
        if 'pdf' in response.url or 'jpg' in response.url or 'mp4' in response.url:
            return

        item = ItemLoader(Article())
        item.default_output_processor = TakeFirst()

        title = response.xpath('//h1//text()').getall() or response.xpath(
            '//h2//text()').getall()
        if not title:
            return
        title = [text for text in title if text.strip()]
        title = "\n".join(title).strip()

        date = response.xpath(
            '//div[@class="elementor-widget-container"]/div[@class="elementor-text-editor'
            ' elementor-clearfix"]/p/text()').get()
        if date:
            date = date.strip()
            tokens = date.split()
            # Keep the trailing "<day> <month> <year>" only when the last
            # token is numeric. Guard on empty tokens: a whitespace-only
            # node used to raise IndexError on date.split()[-1] here.
            if tokens and tokens[-1].isnumeric():
                date = " ".join(tokens[-3:])
            else:
                date = ''
        elif not response.xpath('//h1//text()').getall():
            # The title came from <h2>; its first token is used as the date.
            # //h2/text() (direct text only) may still be None/empty even
            # though //h2//text() matched nested nodes — guard instead of
            # crashing with AttributeError/IndexError as before.
            h2_text = response.xpath('//h2/text()').get() or ''
            h2_tokens = h2_text.split()
            date = h2_tokens[0] if h2_tokens else ''
        else:
            date = ''

        content = response.xpath('//div[@class="elementor-inner"]//text()').getall() or \
                  response.xpath('//div[@itemprop="text"]//text()').getall()

        content = [text for text in content if text.strip()]
        content = "\n".join(content).strip()

        item.add_value('title', title)
        item.add_value('date', date)
        item.add_value('link', response.url)
        item.add_value('content', content)

        return item.load_item()
Esempio n. 11
0
    def parse_article(self, response):
        """Parse an article page into an Article item.

        Articles whose publish date cannot be parsed ('%B %d, %Y') are
        dropped, as before.
        """
        item = ItemLoader(Article())
        item.default_output_processor = TakeFirst()

        # .get() returns None when the node is missing; coalesce to '' so
        # .strip() cannot raise AttributeError (the original crashed here).
        title = (response.xpath('//h1/text()').get() or '').strip()
        raw_date = (response.xpath(
            '//span[@itemprop="datePublished"]/text()').get() or '').strip()
        # Narrowed from a bare `except`: only an unparseable/missing date
        # (ValueError) should drop the article.
        try:
            parsed = datetime.strptime(raw_date, '%B %d, %Y')
        except ValueError:
            return
        date = parsed.strftime('%Y/%m/%d')
        content = response.xpath('//div[@class="C06 "]//text()').getall()
        content = [text for text in content if text.strip()]
        content = "\n".join(content).strip()

        item.add_value('title', title)
        item.add_value('date', date)
        item.add_value('link', response.url)
        item.add_value('content', content)

        return item.load_item()
Esempio n. 12
0
    def parse_post(self, response):
        """Scrape a post page and yield a CcdbtItem."""
        raw_date = response.xpath('(//div[@class="Normal"])[1]/p/text()').get()
        try:
            # re.findall raises TypeError when the XPath matched nothing (None).
            date = re.findall(r'\w+\s\d+\,\s\d+', raw_date)
        except TypeError:
            date = ""
        title = response.xpath('//h1/text()').get()
        fragments = response.xpath(
            '//div[@class="Normal"]//text()[not (ancestor::div[@class="sidebarHIDE"] or ancestor::div[@class="container topFooterpadding"])]'
        ).getall()
        joined = ' '.join(p.strip() for p in fragments if p.strip())
        content = re.sub(pattern, "", joined)

        loader = ItemLoader(item=CcdbtItem(), response=response)
        loader.default_output_processor = TakeFirst()

        loader.add_value('title', title)
        loader.add_value('link', response.url)
        loader.add_value('content', content)
        loader.add_value('date', date)

        yield loader.load_item()
Esempio n. 13
0
    def parse_article(self, response):
        """Extract title/date/body from a plain HTML article page."""
        item = ItemLoader(Article())
        item.default_output_processor = TakeFirst()

        raw_title = response.xpath('//h2/text()').get()
        title = raw_title.strip() if raw_title else raw_title

        raw_date = response.xpath('//h4/text()').get()
        date = raw_date.strip() if raw_date else raw_date

        # Prefer justified paragraphs; fall back to all paragraph text.
        texts = (response.xpath('//p[@align="justify"]//text()').getall()
                 or response.xpath('//p//text()').getall())
        content = "\n".join(t for t in texts if t.strip()).strip()

        item.add_value('title', title)
        item.add_value('date', date)
        item.add_value('link', response.url)
        item.add_value('content', content)

        return item.load_item()
Esempio n. 14
0
    def parse_post(self, response):
        """Yield a NordfyItem; the site exposes no publish date (stored as '-')."""
        date = '-'
        # Fall back to a fixed title when neither heading variant matches.
        title = response.xpath(
            '//div[@class="site-main-content"]/h3/strong/text() | //h1/text()'
        ).get() or 'NordfynsBank'
        fragments = response.xpath(
            '//div[@class="site-main-content"]//text()[not (ancestor::strong) and not(ancestor::h1)]'
        ).getall()
        joined = ' '.join(p.strip() for p in fragments if p.strip())
        content = re.sub(pattern, "", joined)

        loader = ItemLoader(item=NordfyItem(), response=response)
        loader.default_output_processor = TakeFirst()

        loader.add_value('title', title)
        loader.add_value('link', response.url)
        loader.add_value('content', content)
        loader.add_value('date', date)

        yield loader.load_item()
Esempio n. 15
0
    def parse_post(self, response):
        """Parse a blog post into a UnicreditItem."""
        date = response.xpath(
            '//ul[@class="blog__inner__layer__header__icons list-inline"]/li/span//text()'
        ).get()
        # Guard the strip: .get() returns None when the headline is missing,
        # and None.strip() raised AttributeError in the original.
        title = response.xpath(
            '//h1[@class="blog__inner__layer__header__title"]/text()').get()
        title = title.strip() if title else title
        content = response.xpath(
            '//div[@class="entry-content"]//text()').getall()
        content = [p.strip() for p in content if p.strip()]
        content = re.sub(pattern, "", ' '.join(content))

        item = ItemLoader(item=UnicreditItem(), response=response)
        item.default_output_processor = TakeFirst()

        item.add_value('title', title)
        item.add_value('link', response.url)
        item.add_value('content', content)
        item.add_value('date', date)

        return item.load_item()
Esempio n. 16
0
    def parse_post(self, response):
        """Parse a blog post and yield a LunarItem."""
        date = response.xpath(
            '//p[@class="index-module--date--2S-d0"]/text()').get()
        title = response.xpath(
            '//h1[@class="index-module--title--mvcCi"]/text()').get()
        # Lead paragraph first, then the section wrapper body.
        fragments = response.xpath(
            '//h4[@class="index-module--manchet--37gxk"]//text()').getall()
        fragments += response.xpath(
            '//div[@class="index-module--sectionsWrapper--1bEQn"]//text()'
        ).getall()
        joined = ' '.join(p.strip() for p in fragments if p.strip())
        content = re.sub(pattern, "", joined)

        loader = ItemLoader(item=LunarItem(), response=response)
        loader.default_output_processor = TakeFirst()

        loader.add_value('title', title)
        loader.add_value('link', response.url)
        loader.add_value('content', content)
        loader.add_value('date', date)

        yield loader.load_item()
Esempio n. 17
0
class _VauvaCommentItem(Item):
    """
    Returned comment fields:
        * author (str): Author of the comment.
        * date (str): Publish time of the comment.
        * quotes (list of str): List of quotes in the comment.
        * content (str): Contents of the comment.
        * upvotes (int): Upvotes of the comment.
        * downvotes (int): Downvotes of the comment.
    """
    # strip_join / strip_elements / drop_empty_elements are project helpers —
    # presumably whitespace/merge normalisers; confirm in their module.
    author = Field(input_processor=strip_join, output_processor=TakeFirst())
    date = Field(input_processor=strip_join,
                 output_processor=Compose(strip_elements, TakeFirst()))
    # quotes keeps every value (Identity), unlike the scalar fields above.
    quotes = Field(input_processor=drop_empty_elements,
                   output_processor=Identity())
    content = Field(input_processor=strip_join, output_processor=TakeFirst())
    # NOTE(review): MapCompose(TakeFirst(), ...) applies TakeFirst per input
    # element, which is unusual — confirm this is intended rather than a
    # plain output-side TakeFirst.
    upvotes = Field(input_processor=MapCompose(TakeFirst(), safe_cast_int),
                    output_processor=TakeFirst())
    downvotes = Field(input_processor=MapCompose(TakeFirst(), safe_cast_int),
                      output_processor=TakeFirst())
Esempio n. 18
0
    def parse_post(self, response, date):
        """Parse a post page and yield a CdbItem (date supplied by caller)."""
        # Each .get() returns None when its node is missing; coalesce to ''
        # so the concatenation cannot raise TypeError (the original crashed
        # whenever either heading was absent).
        section = response.xpath(
            '//div[@class="container-content-inner"]/h3[@class="text-center small_section_title"]/text()'
        ).get() or ''
        headline = response.xpath(
            '//div[@class="container-content-inner"]/h1/text()').get() or ''
        title = [section + headline]
        content = response.xpath(
            '//div[@itemprop="articleBody"]//text()').getall()
        content = [p.strip() for p in content if p.strip()]
        content = re.sub(pattern, "", ' '.join(content))

        item = ItemLoader(item=CdbItem(), response=response)
        item.default_output_processor = TakeFirst()

        item.add_value('title', title)
        item.add_value('link', response.url)
        item.add_value('content', content)
        item.add_value('date', date)

        yield item.load_item()
Esempio n. 19
0
    def parse_post(self, response, date):
        """Parse a press-release page and yield a CibcItem."""
        title = response.xpath(
            '//div[@class="wd_newsfeed_releases-detail"]/div[@class="wd_title wd_language_left"]/text()'
        ).get()
        fragments = response.xpath(
            '//div[@id="wd_printable_content"]//text()[not (ancestor::style) and not(ancestor::div[@class="wd_title wd_language_left"])]'
        ).getall()
        fragments = [p.strip() for p in fragments if p.strip()]
        # Trim the boilerplate footer (the last six fragments) when present.
        if 'About CIBC' in fragments:
            fragments = fragments[:-6]
        content = re.sub(pattern, "", ' '.join(fragments))

        loader = ItemLoader(item=CibcItem(), response=response)
        loader.default_output_processor = TakeFirst()

        loader.add_value('title', title)
        loader.add_value('link', response.url)
        loader.add_value('content', content)
        loader.add_value('date', date)

        yield loader.load_item()
Esempio n. 20
0
	def parse_post(self, response):
		"""Iterate the FAQ entries on the page, yielding one CroatiaItem each."""
		entries = response.xpath('//dl[@class="faq"]/dt')
		# XPath positions are 1-based, hence the shifted range.
		for position in range(1, len(entries) + 1):
			loader = ItemLoader(item=CroatiaItem(), response=response)
			loader.default_output_processor = TakeFirst()

			date = response.xpath(f'(//dl[@class="faq"]/dt)[{position}]//span//text()').get()
			title = response.xpath(f'(//dl[@class="faq"]/dt)[{position}]//text()[2]').get()
			if title:
				title = title.strip()
			body = response.xpath(f'(//dl[@class="faq"]/dd)[{position}]//text()').getall()
			joined = ' '.join(p.strip() for p in body if p.strip())
			content = re.sub(pattern, "", joined)

			loader.add_value('title', title)
			loader.add_value('link', response.url)
			loader.add_value('content', content)
			loader.add_value('date', date)

			yield loader.load_item()
Esempio n. 21
0
    def parse_article(self, response):
        """Build an Article from a content page; leading fragments are skipped."""
        item = ItemLoader(Article())
        item.default_output_processor = TakeFirst()

        raw_title = response.xpath('//h1[@class="content-title"]//text()').get()
        title = raw_title.strip() if raw_title else raw_title

        raw_date = response.xpath('//div[@class="date mb5"]//text()').get()
        date = raw_date.strip() if raw_date else raw_date

        texts = response.xpath('//div[@class="content-body"]//text()').getall()
        texts = [t for t in texts if t.strip()]
        # The first two non-blank fragments are excluded from the body.
        content = "\n".join(texts[2:]).strip()

        item.add_value('title', title)
        item.add_value('date', date)
        item.add_value('link', response.url)
        item.add_value('content', content)

        return item.load_item()
Esempio n. 22
0
	def parse_post(self, response):
		"""Parse a post into an NlbsiItem.

		The date is regex-extracted from the description text and then
		removed from it; a specific statement page (izjava-kfi) is skipped.
		"""
		if response.url[-10:] == 'izjava-kfi':
			return
		title = response.xpath('//h1//text()[normalize-space()]').get()
		description = response.xpath('//div[@class="contents"]//text()[normalize-space()and not(ancestor::a | ancestor::h1)]').getall()
		description = ' '.join(p.strip() for p in description).strip()
		# Narrowed from a bare `except`: only a missing match (findall is
		# empty, so [0] raises IndexError) should blank the date; a bare
		# except also swallowed unrelated bugs.
		try:
			date = re.findall(r'\d+\.?\s*\w+\s\d{4}', description)[0]
		except IndexError:
			print(response.url)
			date = ''

		# Drop the captured date string from the body text.
		description = re.sub(r'\d+\.?\s*\w+\s\d{4}', '', description)

		item = ItemLoader(item=NlbsiItem(), response=response)
		item.default_output_processor = TakeFirst()
		item.add_value('title', title)
		item.add_value('description', description)
		item.add_value('date', date)

		return item.load_item()
Esempio n. 23
0
    def parse_post(self, response):
        """Parse a blog entry and yield an AncdordiaItem."""
        date_parts = response.xpath(
            '//div[@class="blog-item-data"]//text()').getall()
        date = ''.join(p.strip() for p in date_parts if p.strip())
        title = response.xpath(
            '//h2[@class="blog-item-title font-alt"]//text()').get()
        fragments = response.xpath(
            '//div[@class="field field-name-body field-type-text-with-summary field-label-hidden"]//text()'
        ).getall()
        joined = ' '.join(p.strip() for p in fragments if p.strip())
        content = re.sub(pattern, "", joined)

        loader = ItemLoader(item=AncdordiaItem(), response=response)
        loader.default_output_processor = TakeFirst()

        loader.add_value('title', title)
        loader.add_value('link', response.url)
        loader.add_value('content', content)
        loader.add_value('date', date)

        yield loader.load_item()
Esempio n. 24
0
    def parse_article(self, response, date):
        """Build an Article; `date` arrives as 'dd.mm.YYYY' and is re-formatted."""
        item = ItemLoader(Article())
        item.default_output_processor = TakeFirst()

        raw_title = response.xpath('//h3[@class="no-pad-top"]/text()').get()
        title = raw_title.strip() if raw_title else raw_title

        date = datetime.strptime(date.strip(), '%d.%m.%Y').strftime('%Y/%m/%d')

        texts = response.xpath(
            '//div[@class="news-box-wrap no-border-bottom"]//text()').getall()
        texts = [t for t in texts if t.strip()]
        # The leading non-blank fragment is excluded from the body.
        content = "\n".join(texts[1:]).strip()

        item.add_value('title', title)
        item.add_value('date', date)
        item.add_value('link', response.url)
        item.add_value('content', content)

        return item.load_item()
Esempio n. 25
0
    def parse_article(self, response, date):
        """Build an Article; optional `date` is 'dd/mm/YYYY', stored as 'YYYY/mm/dd'."""
        item = ItemLoader(Article())
        item.default_output_processor = TakeFirst()

        title = " ".join(response.xpath('//h1//text()').getall()).strip()

        if date:
            date = datetime.strptime(date.strip(), '%d/%m/%Y').strftime('%Y/%m/%d')

        texts = response.xpath('//div[@itemprop="articleBody"]//text()').getall()
        content = "\n".join(t for t in texts if t.strip()).strip()

        item.add_value('title', title)
        item.add_value('date', date)
        item.add_value('link', response.url)
        item.add_value('content', content)

        return item.load_item()
Esempio n. 26
0
    def parse(self, response):
        """Walk a WordPress JSON API listing; paginate until an error payload."""
        data = json.loads(response.text)
        for idx in range(len(data)):
            entry = data[idx]
            date = entry['date'].split('T')[0]
            title = entry['title']['rendered']
            content = remove_tags(
                entry['content']['rendered'] + entry['excerpt']['rendered'])

            loader = ItemLoader(item=CcaixaItem(), response=response)
            loader.default_output_processor = TakeFirst()

            loader.add_value('title', title)
            loader.add_value('link', response.url)
            loader.add_value('content', content)
            loader.add_value('date', date)

            yield loader.load_item()

        # An error payload carries a 'code' entry: stop paginating then.
        if 'code' not in data:
            self.page += 1
            yield response.follow(base.format(self.page), self.parse)
Esempio n. 27
0
 def loop_on_page(self, response):
     """Schedule one listing request per result page for a city.

     The total page count is read from the inline
     ``window.__SEARCH_RESULT__`` JSON blob embedded in the page.

     :param response: listing page whose body embeds the search-result JSON
     :return: yields one Request per result page
     """
     this_city = response.meta['city']
     loader = QianchengItemLoader()
     # Raw string for the regex: the original non-raw literal relied on
     # Python passing unknown escapes (\s, \<) through unchanged, which
     # emits DeprecationWarning and is slated to become a SyntaxError.
     # The pattern bytes are identical.
     info = json.loads(
         loader.get_value(
             response.text,
             TakeFirst(),
             re=r'window.__SEARCH_RESULT__\s*=\s*(.*?)\<\/script\>'))
     all_pages = info['total_page']
     for page in range(1, int(all_pages) + 1):
         yield Request(
             url=self.BASE_URL.format(page=str(page), city=this_city),
             headers=self.COMMON_HEADER,
             callback=self.parse_item,
             dont_filter=True,
             priority=3,
         )
Esempio n. 28
0
    def parse_post(self, response):
        """Parse an article page and return an NbbItem."""
        raw_date = response.xpath(
            '//div[@class="field field-name-field-article-date field-type-datetime field-label-hidden"]/span//text()'
        ).get()
        date = re.findall(r'\d+\s\w+\s\d+', raw_date)
        title = response.xpath('//h1/text()').get().strip()
        fragments = response.xpath(
            '//div[@class="content"]//text()[not (ancestor::div[@class="group-related-content field-group-div"]) and not (ancestor::sup)]'
        ).getall()
        # Drop blanks, then skip the leading fragment.
        fragments = [p.strip() for p in fragments if p.strip()][1:]
        content = re.sub(pattern, "", ' '.join(fragments))

        loader = ItemLoader(item=NbbItem(), response=response)
        loader.default_output_processor = TakeFirst()

        loader.add_value('title', title)
        loader.add_value('link', response.url)
        loader.add_value('content', content)
        loader.add_value('date', date)

        return loader.load_item()
Esempio n. 29
0
    def parse_article(self, response, date):
        """Build an Article; `date` arrives like 'Jan 01, 2020' ('%b %d, %Y')."""
        item = ItemLoader(Article())
        item.default_output_processor = TakeFirst()

        raw_title = response.xpath('//h1/text()').get()
        title = raw_title.strip() if raw_title else raw_title

        date = datetime.strptime(date.strip(), '%b %d, %Y').strftime('%Y/%m/%d')

        texts = response.xpath(
            '//section[@class="content"]//text()').getall()
        texts = [t for t in texts if t.strip()]
        # The leading non-blank fragment is excluded from the body.
        content = "\n".join(texts[1:]).strip()

        item.add_value('title', title)
        item.add_value('date', date)
        item.add_value('link', response.url)
        item.add_value('content', content)

        return item.load_item()
Esempio n. 30
0
    def parse_page(self, response):
        """@url http://www.telegraph.co.uk/news/2017/02/27/grandmother-has-married-briton-27-years-deported-singapore-just/
        @returns items 1
        @scrapes bodytext bylines fetchtime firstpubtime headline
        @scrapes keywords section source summary url
        @noscrapes modtime
        """
        # NOTE: the docstring above is a Scrapy contract specification
        # (consumed by `scrapy check`), not prose — do not reword it.
        s = response.selector
        # Remove any content from the tree before passing it to the loader.
        # There aren't native scrapy loader/selector methods for this.
        #mutate_selector_del_xpath(s, '//*[@style="display:none"]')

        l = NewsLoader(selector=s)

        # This extracts the (top-level) section from the Navigation headline
        # bar. Probably a bit fragile.
        l.add_xpath(
            'section',
            '//a[contains(@class, "header-breadcrumbs__link")]//text()',
            TakeFirst())

        # No TakeFirst here: all byline values are kept.
        l.add_xpath(
            'bylines',
            '//main//*[@itemprop="author"]//*[@itemprop="name"]//text()')

        # Add a number of items of data that should be standardised across
        # providers. Can override these (for TakeFirst() fields) by making
        # l.add_* calls above this line, or supplement gaps by making them
        # below.
        l.add_fromresponse(response)
        l.add_htmlmeta()
        l.add_schemaorg(response)
        l.add_opengraph()
        l.add_scrapymeta(response)

        # Presence of the paywall div marks premium articles in 'notes'.
        if response.xpath('//div[contains(@class, "premium-paywall")]'):
            l.add_value('notes', 'Premium paywall')

        return l.load_item()