def parse_post(self, response, date):
    title = response.xpath(
        '//div[@class="lead top-margin3-xs"]/p//text()').get()
    if not title:
        title = response.xpath(
            '//div[@class="lead top-margin3-xs"]/text()').get()
    content = response.xpath(
        '//div[@class="body-copy4 parbase section"]//text()[not(ancestor::script)]'
    ).getall()
    content = [p.strip() for p in content if p.strip()]
    content = re.sub(pattern, "", ' '.join(content))

    item = ItemLoader(item=BbanksaItem(), response=response)
    item.default_output_processor = TakeFirst()
    item.add_value('title', title)
    item.add_value('link', response.url)
    item.add_value('content', content)
    item.add_value('date', date)
    yield item.load_item()
def parse_article(self, response, date):
    if 'pdf' in response.url:
        return
    item = ItemLoader(Article())
    item.default_output_processor = TakeFirst()
    title = response.xpath('//h2//text()').get()
    if title:
        title = title.strip()
    content = response.xpath('//div[@class="col-sm-9"]//text()').getall()
    content = [text for text in content if text.strip()]
    content = "\n".join(content).strip()
    item.add_value('title', title)
    item.add_value('date', date)
    item.add_value('link', response.url)
    item.add_value('content', content)
    return item.load_item()
def parse_article(self, response, date):
    if 'pdf' in response.url.lower():
        return
    item = ItemLoader(Article())
    item.default_output_processor = TakeFirst()
    title = response.xpath('//title/text()').get()
    if title:
        title = title.split('-')[0].strip()
    content = response.xpath('//div[@data-emptytext="Text"]//text()').getall()
    # Drop empty fragments and inline CSS/JS (anything containing '{').
    content = [text.strip() for text in content
               if text.strip() and '{' not in text]
    content = " ".join(content).strip()
    item.add_value('title', title)
    item.add_value('date', date)
    item.add_value('link', response.url)
    item.add_value('content', content)
    return item.load_item()
def parse_post(self, response):
    # The <h1> holds "date – title"; try splitting on the en dash first,
    # then fall back to a plain hyphen.
    heading = response.xpath('//h1/text()').get()
    try:
        date = heading.split('–')[0].strip()
        title = heading.split('–')[1].strip()
    except IndexError:
        date = heading.split('-')[0].strip()
        title = heading.split('-')[1].strip()
    content = response.xpath(
        '//div[@class="page-content"]//text()').getall()
    content = [p.strip() for p in content if p.strip()]
    content = re.sub(pattern, "", ' '.join(content))

    item = ItemLoader(item=NnovoItem(), response=response)
    item.default_output_processor = TakeFirst()
    item.add_value('title', title)
    item.add_value('link', response.url)
    item.add_value('content', content)
    item.add_value('date', date)
    yield item.load_item()
def parse_post(self, response):
    date = response.xpath(
        '//span[contains(@class,"story-date")]/text()').get()
    # re.findall returns a list; TakeFirst() below keeps the first match.
    date = re.findall(r'\d+\s\w+\s\d+', date)
    title = response.xpath('//h1/text()').get()
    content = response.xpath(
        '//div[@class="story__column story__column--content"]'
        '//text()[not(ancestor::h1 or ancestor::p[@class="story-intro"])]'
    ).getall() + response.xpath(
        '//section[@class="story__content"]//text()').getall()
    content = [p.strip() for p in content if p.strip()]
    content = re.sub(pattern, "", ' '.join(content))

    item = ItemLoader(item=CbcItem(), response=response)
    item.default_output_processor = TakeFirst()
    item.add_value('title', title)
    item.add_value('link', response.url)
    item.add_value('content', content)
    item.add_value('date', date)
    yield item.load_item()
def parse_post(self, response):
    # Re-fetch the page with requests (using the module-level headers and
    # payload) and wrap the HTML in a scrapy Selector for XPath extraction.
    data = requests.request("GET", response.url, headers=headers, data=payload)
    data = scrapy.Selector(text=data.text)
    title = data.xpath('//h3[@class="HeaderTitle"]/span/text()').get()
    description = data.xpath(
        '//div[@class="PageContent"]//p//text()[normalize-space() and not(ancestor::i)]'
    ).getall()
    description = [p.strip() for p in description if '{' not in p]
    description = ' '.join(description).strip()
    date = data.xpath('//p[@style="text-align:right;"]/i/text()').get()

    item = ItemLoader(item=AbkeahliItem(), response=response)
    item.default_output_processor = TakeFirst()
    item.add_value('title', title)
    item.add_value('description', description)
    item.add_value('date', date)
    return item.load_item()
def parse_post(self, response): date = response.xpath( '//div[@class="news nyhedsdetaljedato col-md-12"]/text()').get() title = response.xpath( '//div[@class="news h2 nyhedsdetaljeteaser col-md-12"]/text() | //h1/text()' ).get() content = response.xpath( '//div[@class="nyhedsdetaljetekst col-md-6"]//text() | //div[@class="col-md-8 txtbox"]//text()' ).getall() content = [p.strip() for p in content if p.strip()] content = re.sub(pattern, "", ' '.join(content)) item = ItemLoader(item=SalingItem(), response=response) item.default_output_processor = TakeFirst() item.add_value('title', title) item.add_value('link', response.url) item.add_value('content', content) item.add_value('date', date) yield item.load_item()
def parse_post(self, response): title = response.xpath( '//div[@class="FrutigerNeueMedium_18 color_green align_left"]//text()' ).get() description = response.xpath( '//div[@class="proximanova_16_light line_height_20"]//text()[normalize-space()]' ).getall() description = [p.strip() for p in description] description = ' '.join(description).strip() try: date = re.findall(r'\d{1,2}\s[a-z]{3,}\s\d{4}', description)[0] except: date = '' item = ItemLoader(item=CrcentoitItem(), response=response) item.default_output_processor = TakeFirst() item.add_value('title', title) item.add_value('description', description) item.add_value('date', date) return item.load_item()
class NewsItems(scrapy.Item):
    UrlHash = scrapy.Field(input_processor=MapCompose(u2int),
                           output_processor=TakeFirst())
    Link = scrapy.Field(input_processor=MapCompose(str.strip),
                        output_processor=TakeFirst())
    Epic = scrapy.Field(input_processor=MapCompose(str.strip),
                        output_processor=TakeFirst())
    Article = scrapy.Field(input_processor=MapCompose(StrProcess),
                           output_processor=TakeFirst())
    SHash = scrapy.Field(input_processor=MapCompose(u2int),
                         output_processor=TakeFirst())
    Artlen = scrapy.Field(input_processor=MapCompose(u2int),
                          output_processor=TakeFirst())

    def __str__(self):
        # Suppress the default item dump in the crawl log.
        return ""

    # def __repr__(self):
    #     """Only print out the Title after exiting the pipeline."""
    #     return repr({"Title": self.Title})
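# `u2int` and `StrProcess` are this project's helper processors, defined
# elsewhere; the bodies below are a minimal sketch of plausible
# implementations (the names are from the source, the behavior is assumed):
def u2int(value):
    """Coerce a scraped value to int, tolerating stray whitespace."""
    try:
        return int(str(value).strip())
    except (TypeError, ValueError):
        return None

def StrProcess(value):
    """Collapse runs of whitespace in article text."""
    return ' '.join(str(value).split())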
def parse_article(self, response):
    if 'pdf' in response.url or 'jpg' in response.url or 'mp4' in response.url:
        return
    item = ItemLoader(Article())
    item.default_output_processor = TakeFirst()
    title = response.xpath('//h1//text()').getall() or response.xpath(
        '//h2//text()').getall()
    if not title:
        return
    title = [text for text in title if text.strip()]
    title = "\n".join(title).strip()
    date = response.xpath(
        '//div[@class="elementor-widget-container"]/div[@class="elementor-text-editor'
        ' elementor-clearfix"]/p/text()').get()
    if date:
        date = date.strip()
        # Keep the trailing "<day> <month> <year>" only if it ends in a year.
        if not date.split()[-1].isnumeric():
            date = ''
        else:
            date = " ".join(date.split()[-3:])
    elif not response.xpath('//h1//text()').getall():
        # No <h1>: the date is the first token of the <h2> heading.
        date = response.xpath('//h2/text()').get().split()[0]
    else:
        date = ''
    content = response.xpath('//div[@class="elementor-inner"]//text()').getall() or \
        response.xpath('//div[@itemprop="text"]//text()').getall()
    content = [text for text in content if text.strip()]
    content = "\n".join(content).strip()
    item.add_value('title', title)
    item.add_value('date', date)
    item.add_value('link', response.url)
    item.add_value('content', content)
    return item.load_item()
def parse_article(self, response):
    item = ItemLoader(Article())
    item.default_output_processor = TakeFirst()
    title = response.xpath('//h1/text()').get().strip()
    date = response.xpath(
        '//span[@itemprop="datePublished"]/text()').get().strip()
    try:
        date = datetime.strptime(date, '%B %d, %Y')
    except ValueError:
        # Skip articles whose date is not in "Month day, year" form.
        return
    date = date.strftime('%Y/%m/%d')
    content = response.xpath('//div[@class="C06 "]//text()').getall()
    content = [text for text in content if text.strip()]
    content = "\n".join(content).strip()
    item.add_value('title', title)
    item.add_value('date', date)
    item.add_value('link', response.url)
    item.add_value('content', content)
    return item.load_item()
def parse_post(self, response): date = response.xpath('(//div[@class="Normal"])[1]/p/text()').get() try: date = re.findall(r'\w+\s\d+\,\s\d+', date) except TypeError: date = "" title = response.xpath('//h1/text()').get() content = response.xpath( '//div[@class="Normal"]//text()[not (ancestor::div[@class="sidebarHIDE"] or ancestor::div[@class="container topFooterpadding"])]' ).getall() content = [p.strip() for p in content if p.strip()] content = re.sub(pattern, "", ' '.join(content)) item = ItemLoader(item=CcdbtItem(), response=response) item.default_output_processor = TakeFirst() item.add_value('title', title) item.add_value('link', response.url) item.add_value('content', content) item.add_value('date', date) yield item.load_item()
def parse_article(self, response):
    item = ItemLoader(Article())
    item.default_output_processor = TakeFirst()
    title = response.xpath('//h2/text()').get()
    if title:
        title = title.strip()
    date = response.xpath('//h4/text()').get()
    if date:
        date = date.strip()
    content = (response.xpath('//p[@align="justify"]//text()').getall()
               or response.xpath('//p//text()').getall())
    content = [text for text in content if text.strip()]
    content = "\n".join(content).strip()
    item.add_value('title', title)
    item.add_value('date', date)
    item.add_value('link', response.url)
    item.add_value('content', content)
    return item.load_item()
def parse_post(self, response):
    date = '-'
    title = response.xpath(
        '//div[@class="site-main-content"]/h3/strong/text() | //h1/text()'
    ).get()
    if not title:
        title = 'NordfynsBank'
    content = response.xpath(
        '//div[@class="site-main-content"]//text()'
        '[not(ancestor::strong) and not(ancestor::h1)]'
    ).getall()
    content = [p.strip() for p in content if p.strip()]
    content = re.sub(pattern, "", ' '.join(content))

    item = ItemLoader(item=NordfyItem(), response=response)
    item.default_output_processor = TakeFirst()
    item.add_value('title', title)
    item.add_value('link', response.url)
    item.add_value('content', content)
    item.add_value('date', date)
    yield item.load_item()
def parse_post(self, response): date = response.xpath( '//ul[@class="blog__inner__layer__header__icons list-inline"]/li/span//text()' ).get() title = response.xpath( '//h1[@class="blog__inner__layer__header__title"]/text()').get( ).strip() content = response.xpath( '//div[@class="entry-content"]//text()').getall() content = [p.strip() for p in content if p.strip()] content = re.sub(pattern, "", ' '.join(content)) item = ItemLoader(item=UnicreditItem(), response=response) item.default_output_processor = TakeFirst() item.add_value('title', title) item.add_value('link', response.url) item.add_value('content', content) item.add_value('date', date) return item.load_item()
def parse_post(self, response): date = response.xpath( '//p[@class="index-module--date--2S-d0"]/text()').get() title = response.xpath( '//h1[@class="index-module--title--mvcCi"]/text()').get() content = response.xpath( '//h4[@class="index-module--manchet--37gxk"]//text()').getall( ) + response.xpath( '//div[@class="index-module--sectionsWrapper--1bEQn"]//text()' ).getall() content = [p.strip() for p in content if p.strip()] content = re.sub(pattern, "", ' '.join(content)) item = ItemLoader(item=LunarItem(), response=response) item.default_output_processor = TakeFirst() item.add_value('title', title) item.add_value('link', response.url) item.add_value('content', content) item.add_value('date', date) yield item.load_item()
class _VauvaCommentItem(Item):
    """
    Returned comment fields:

    * author (str): Author of the comment.
    * date (str): Publish time of the comment.
    * quotes (list of str): List of quotes in the comment.
    * content (str): Contents of the comment.
    * upvotes (int): Upvotes of the comment.
    * downvotes (int): Downvotes of the comment.
    """
    author = Field(input_processor=strip_join, output_processor=TakeFirst())
    date = Field(input_processor=strip_join,
                 output_processor=Compose(strip_elements, TakeFirst()))
    quotes = Field(input_processor=drop_empty_elements,
                   output_processor=Identity())
    content = Field(input_processor=strip_join, output_processor=TakeFirst())
    upvotes = Field(input_processor=MapCompose(TakeFirst(), safe_cast_int),
                    output_processor=TakeFirst())
    downvotes = Field(input_processor=MapCompose(TakeFirst(), safe_cast_int),
                      output_processor=TakeFirst())
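# The processors above (strip_join, strip_elements, drop_empty_elements,
# safe_cast_int) are project helpers defined elsewhere; below is a minimal
# sketch of what two of them might look like (assumptions, not the
# project's actual code):
def strip_join(values):
    """Strip each extracted fragment and join the non-empty ones."""
    return ' '.join(v.strip() for v in values if v.strip())

def safe_cast_int(value):
    """Cast to int, returning None instead of raising on bad input."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return None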
def parse_post(self, response, date):
    # The title is the small section heading followed by the main <h1>.
    title = [
        response.xpath(
            '//div[@class="container-content-inner"]'
            '/h3[@class="text-center small_section_title"]/text()'
        ).get() + response.xpath(
            '//div[@class="container-content-inner"]/h1/text()').get()
    ]
    content = response.xpath(
        '//div[@itemprop="articleBody"]//text()').getall()
    content = [p.strip() for p in content if p.strip()]
    content = re.sub(pattern, "", ' '.join(content))

    item = ItemLoader(item=CdbItem(), response=response)
    item.default_output_processor = TakeFirst()
    item.add_value('title', title)
    item.add_value('link', response.url)
    item.add_value('content', content)
    item.add_value('date', date)
    yield item.load_item()
def parse_post(self, response, date): title = response.xpath( '//div[@class="wd_newsfeed_releases-detail"]/div[@class="wd_title wd_language_left"]/text()' ).get() content = response.xpath( '//div[@id="wd_printable_content"]//text()[not (ancestor::style) and not(ancestor::div[@class="wd_title wd_language_left"])]' ).getall() content = [p.strip() for p in content if p.strip()] if 'About CIBC' in content: content = content[:-6] content = re.sub(pattern, "", ' '.join(content)) item = ItemLoader(item=CibcItem(), response=response) item.default_output_processor = TakeFirst() item.add_value('title', title) item.add_value('link', response.url) item.add_value('content', content) item.add_value('date', date) yield item.load_item()
def parse_post(self, response): articles = response.xpath('//dl[@class="faq"]/dt') length = len(articles) for index in range(length): item = ItemLoader(item=CroatiaItem(), response=response) item.default_output_processor = TakeFirst() date = response.xpath(f'(//dl[@class="faq"]/dt)[{index + 1}]//span//text()').get() title = response.xpath(f'(//dl[@class="faq"]/dt)[{index + 1}]//text()[2]').get() if title: title =title.strip() content = response.xpath(f'(//dl[@class="faq"]/dd)[{index + 1}]//text()').getall() content = [p.strip() for p in content if p.strip()] content = re.sub(pattern, "",' '.join(content)) item.add_value('title', title) item.add_value('link', response.url) item.add_value('content', content) item.add_value('date', date) yield item.load_item()
def parse_article(self, response): item = ItemLoader(Article()) item.default_output_processor = TakeFirst() title = response.xpath('//h1[@class="content-title"]//text()').get() if title: title = title.strip() date = response.xpath('//div[@class="date mb5"]//text()').get() if date: date = date.strip() content = response.xpath('//div[@class="content-body"]//text()').getall() content = [text for text in content if text.strip()] content = "\n".join(content[2:]).strip() item.add_value('title', title) item.add_value('date', date) item.add_value('link', response.url) item.add_value('content', content) return item.load_item()
def parse_post(self, response):
    # The 'izjava-kfi' page is not a news post; skip it.
    if response.url[-10:] == 'izjava-kfi':
        return
    title = response.xpath('//h1//text()[normalize-space()]').get()
    description = response.xpath(
        '//div[@class="contents"]//text()[normalize-space() and not(ancestor::a | ancestor::h1)]'
    ).getall()
    description = [p.strip() for p in description]
    description = ' '.join(description).strip()
    try:
        date = re.findall(r'\d+\.?\s*\w+\s\d{4}', description)[0]
    except IndexError:
        self.logger.warning('No date found in %s', response.url)
        date = ''
    # Remove the date from the body text once extracted.
    description = re.sub(r'\d+\.?\s*\w+\s\d{4}', '', description)

    item = ItemLoader(item=NlbsiItem(), response=response)
    item.default_output_processor = TakeFirst()
    item.add_value('title', title)
    item.add_value('description', description)
    item.add_value('date', date)
    return item.load_item()
def parse_post(self, response): date = response.xpath( '//div[@class="blog-item-data"]//text()').getall() date = ''.join([p.strip() for p in date if p.strip()]) title = response.xpath( '//h2[@class="blog-item-title font-alt"]//text()').get() content = response.xpath( '//div[@class="field field-name-body field-type-text-with-summary field-label-hidden"]//text()' ).getall() content = [p.strip() for p in content if p.strip()] content = re.sub(pattern, "", ' '.join(content)) item = ItemLoader(item=AncdordiaItem(), response=response) item.default_output_processor = TakeFirst() item.add_value('title', title) item.add_value('link', response.url) item.add_value('content', content) item.add_value('date', date) yield item.load_item()
def parse_article(self, response, date): item = ItemLoader(Article()) item.default_output_processor = TakeFirst() title = response.xpath('//h3[@class="no-pad-top"]/text()').get() if title: title = title.strip() date = datetime.strptime(date.strip(), '%d.%m.%Y') date = date.strftime('%Y/%m/%d') content = response.xpath( '//div[@class="news-box-wrap no-border-bottom"]//text()').getall() content = [text for text in content if text.strip()] content = "\n".join(content[1:]).strip() item.add_value('title', title) item.add_value('date', date) item.add_value('link', response.url) item.add_value('content', content) return item.load_item()
def parse_article(self, response, date): item = ItemLoader(Article()) item.default_output_processor = TakeFirst() title = " ".join(response.xpath('//h1//text()').getall()) if title: title = title.strip() if date: date = datetime.strptime(date.strip(), '%d/%m/%Y') date = date.strftime('%Y/%m/%d') content = response.xpath('//div[@itemprop="articleBody"]//text()').getall() content = [text for text in content if text.strip()] content = "\n".join(content).strip() item.add_value('title', title) item.add_value('date', date) item.add_value('link', response.url) item.add_value('content', content) return item.load_item()
def parse(self, response):
    # The endpoint is a WordPress REST API listing; each element is a post.
    data = json.loads(response.text)
    for index in range(len(data)):
        date = data[index]['date'].split('T')[0]
        title = data[index]['title']['rendered']
        content = (data[index]['content']['rendered']
                   + data[index]['excerpt']['rendered'])
        content = remove_tags(content)
        item = ItemLoader(item=CcaixaItem(), response=response)
        item.default_output_processor = TakeFirst()
        item.add_value('title', title)
        item.add_value('link', response.url)
        item.add_value('content', content)
        item.add_value('date', date)
        yield item.load_item()
    # Past the last page the API returns an error object with a 'code' key.
    if 'code' not in data:
        self.page += 1
        yield response.follow(base.format(self.page), self.parse)
def loop_on_page(self, response):
    """Iterate over every result page for the given city."""
    this_city = response.meta['city']
    # The search results are embedded as JSON in a <script> tag; extract
    # them with the loader's regex support.
    loader = QianchengItemLoader()
    info = json.loads(
        loader.get_value(
            response.text,
            TakeFirst(),
            re=r'window\.__SEARCH_RESULT__\s*=\s*(.*?)</script>'))
    all_pages = info['total_page']
    for page in range(1, int(all_pages) + 1):
        yield Request(
            url=self.BASE_URL.format(page=str(page), city=this_city),
            headers=self.COMMON_HEADER,
            callback=self.parse_item,
            dont_filter=True,
            priority=3,
        )
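# For reference: ItemLoader.get_value applies the `re` pattern first and
# only then the processors, so TakeFirst() picks the single captured group.
# The call above is roughly equivalent to this standard-library sketch:
#
#   match = re.search(r'window\.__SEARCH_RESULT__\s*=\s*(.*?)</script>',
#                     response.text)
#   info = json.loads(match.group(1)) if match else {}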
def parse_post(self, response): date = response.xpath( '//div[@class="field field-name-field-article-date field-type-datetime field-label-hidden"]/span//text()' ).get() date = re.findall(r'\d+\s\w+\s\d+', date) title = response.xpath('//h1/text()').get().strip() content = response.xpath( '//div[@class="content"]//text()[not (ancestor::div[@class="group-related-content field-group-div"]) and not (ancestor::sup)]' ).getall() content = [p.strip() for p in content if p.strip()][1:] content = re.sub(pattern, "", ' '.join(content)) item = ItemLoader(item=NbbItem(), response=response) item.default_output_processor = TakeFirst() item.add_value('title', title) item.add_value('link', response.url) item.add_value('content', content) item.add_value('date', date) return item.load_item()
def parse_article(self, response, date):
    item = ItemLoader(Article())
    item.default_output_processor = TakeFirst()
    title = response.xpath('//h1/text()').get()
    if title:
        title = title.strip()
    date = datetime.strptime(date.strip(), '%b %d, %Y')
    date = date.strftime('%Y/%m/%d')
    content = response.xpath(
        '//section[@class="content"]//text()').getall()
    content = [text for text in content if text.strip()]
    content = "\n".join(content[1:]).strip()
    item.add_value('title', title)
    item.add_value('date', date)
    item.add_value('link', response.url)
    item.add_value('content', content)
    return item.load_item()
def parse_page(self, response):
    """
    @url http://www.telegraph.co.uk/news/2017/02/27/grandmother-has-married-briton-27-years-deported-singapore-just/
    @returns items 1
    @scrapes bodytext bylines fetchtime firstpubtime headline
    @scrapes keywords section source summary url
    @noscrapes modtime
    """
    s = response.selector
    # Remove any content from the tree before passing it to the loader.
    # There aren't native scrapy loader/selector methods for this.
    # mutate_selector_del_xpath(s, '//*[@style="display:none"]')

    l = NewsLoader(selector=s)

    # This extracts the (top-level) section from the navigation headline
    # bar. Probably a bit fragile.
    l.add_xpath(
        'section',
        '//a[contains(@class, "header-breadcrumbs__link")]//text()',
        TakeFirst())
    l.add_xpath(
        'bylines',
        '//main//*[@itemprop="author"]//*[@itemprop="name"]//text()')

    # Add a number of items of data that should be standardised across
    # providers. Can override these (for TakeFirst() fields) by making
    # l.add_* calls above this line, or supplement gaps by making them
    # below.
    l.add_fromresponse(response)
    l.add_htmlmeta()
    l.add_schemaorg(response)
    l.add_opengraph()
    l.add_scrapymeta(response)

    if response.xpath('//div[contains(@class, "premium-paywall")]'):
        l.add_value('notes', 'Premium paywall')

    return l.load_item()
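# The docstring above is a Scrapy contract: running `scrapy check` fetches
# @url, invokes this callback, and asserts the @returns/@scrapes conditions.
# @url, @returns, and @scrapes are built-in contracts; @noscrapes is
# presumably a custom contract defined elsewhere in this project.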