def parse_attr(self, response):
    """
    Parse each crawled URL and yield a NewsItem if the page is a news article.
    :param response: the page response to parse
    :return: a generator of NewsItem objects
    """
    soup = BeautifulSoup(response.text, 'lxml')
    article_exists = is_article(soup)
    if article_exists:
        # scraping logic here
        news_item = NewsItem()
        news_item['url'] = response.url
        news_item['text'] = response.text
        anchor_tag = get_news_author(soup)
        time_tag = get_time_tag(soup)
        title_tag = get_news_headline(soup)
        # if the author anchor tag is found
        if anchor_tag:
            news_item['author'] = anchor_tag.text
        # if the time tag is found
        if time_tag:
            posted_date = get_posted_date(time_tag)
            news_item['posted_date'] = posted_date
        # if the headline title tag is found
        if title_tag:
            news_item['headline'] = title_tag.text
        yield news_item
    else:
        self.logger.info(
            'url %s does not contain a news post', response.url)
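The BeautifulSoup helpers called above (is_article, get_news_author, get_time_tag, get_news_headline, get_posted_date) are defined elsewhere in the project. A minimal sketch of plausible implementations, assuming the article pages mark the author with rel="author", the timestamp with a <time datetime="..."> element, and the headline with <h1> (all of these selectors are assumptions, not taken from the source):

def is_article(soup):
    # hypothetical check: treat pages containing an <article> element as news posts
    return soup.find('article') is not None

def get_news_author(soup):
    # hypothetical selector: the author link in the byline
    return soup.find('a', attrs={'rel': 'author'})

def get_time_tag(soup):
    return soup.find('time')

def get_news_headline(soup):
    return soup.find('h1')

def get_posted_date(time_tag):
    # hypothetical: <time datetime="..."> carries the ISO timestamp
    return time_tag.get('datetime')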
def setUp(self):
    self.spider = Spider(name='spider')
    self.news_response = fake_response('data/news.html')
    self.sports_response = fake_response('data/sports.html')
    self.recent_news_response = fake_response('data/recent_news.html')
    self.recent_sports_response = fake_response('data/recent_sports.html')
    self.pipeline = NewsTextPipeline()
    self.item = NewsItem()
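fake_response is a test helper that is not shown here. A common sketch, assuming it wraps a local HTML fixture in a Scrapy HtmlResponse so the tests can run offline (the default URL is a placeholder):

import os
from scrapy.http import HtmlResponse, Request

def fake_response(file_path, url='http://www.example.com'):
    # hypothetical helper: load a local HTML fixture and wrap it in an
    # HtmlResponse so spider/pipeline tests need no network access
    request = Request(url=url)
    path = os.path.join(os.path.dirname(__file__), file_path)
    with open(path, 'rb') as f:
        body = f.read()
    return HtmlResponse(url=url, request=request, body=body, encoding='utf-8')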
def parse_article(self, response):
    logging.debug(" **** RECEIVING %s to save **** ", response)
    item = NewsItem()
    item['date'] = self.get_date(response)
    item['body'] = self.get_body(response)
    item['title'] = self.get_title(response)
    item['link'] = self.get_url_suffix(response.url)
    item['full_link'] = response.url
    item['source_id'] = self.source_id
    item['keywords'] = self.keywords
    return item
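NewsItem is declared elsewhere in the project. A sketch of a scrapy.Item declaration whose fields match the assignments above (the field set is inferred from this method, not taken from the source):

import scrapy

class NewsItem(scrapy.Item):
    # hypothetical field set matching the assignments in parse_article
    date = scrapy.Field()
    body = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
    full_link = scrapy.Field()
    source_id = scrapy.Field()
    keywords = scrapy.Field()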
def get_content(self, response):
    # assumes: from w3lib.html import remove_tags
    item = NewsItem()
    title = response.css('h1::text').get()
    pub_time = response.css('div.shareBar__info--author span::text').get()
    # skip the first <p>, then strip markup from the remaining paragraphs
    content = response.css('p').getall()[1:]
    item['title'] = title
    item['pub_time'] = pub_time
    item['content'] = remove_tags(''.join(content))
    yield item
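remove_tags is assumed to come from w3lib.html, the usual source in Scrapy projects. A quick usage example:

from w3lib.html import remove_tags

remove_tags('<p>Hello <b>world</b></p>')  # -> 'Hello world'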
def parse_news(self, response):
    # Get the news page's main element
    news_div = response.xpath("//div[@class='conteudo-pagina']")
    # Parse the page item's contents; note the leading '.' so each XPath is
    # evaluated relative to news_div rather than the whole document
    date = news_div.xpath(".//span[@class='data']/text()").extract_first()
    title = news_div.xpath(".//h1/text()").extract_first(default='').strip()
    url = response.url
    text = ' '.join(
        string.strip()
        for string in news_div.xpath(
            ".//div[@class='conteudo-materia']//p//text()").getall())
    # Instantiate the item with the parsed content
    news = NewsItem(date=date, title=title, url=url, text=text)
    yield news
def setUp(self):
    self.pipeline = DuplicatesPipeline()
    self.spider = Spider(name='spider')
    self.item = NewsItem()
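A sketch of a test against this fixture, assuming DuplicatesPipeline raises DropItem when it sees the same item twice (the 'url' dedup key is an assumption):

from scrapy.exceptions import DropItem

def test_drops_duplicate_item(self):
    # first pass should succeed, second pass with the same URL should be dropped
    self.item['url'] = 'http://www.example.com/news/1'
    self.pipeline.process_item(self.item, self.spider)
    with self.assertRaises(DropItem):
        self.pipeline.process_item(self.item, self.spider)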
def setUp(self):
    self.spider = Spider(name='spider')
    self.item = NewsItem()
    self.pipeline = DropEmptyRequiredFieldsPipeline()
def setUp(self):
    self.spider = Spider(name='spider')
    self.pipeline = NewsPlaceMentionedPipeline()
    self.item = NewsItem()
def setUp(self):
    self.pipeline = MongoPipeline('mongo_uri', 'mongo_db')
    self.spider = Spider(name='spider')
    self.item = NewsItem()
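A sketch of a MongoPipeline compatible with the constructor call above, following the standard pattern from the Scrapy documentation; the 'news' collection name and the MONGO_URI / MONGO_DATABASE settings keys are assumptions:

import pymongo

class MongoPipeline:
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # read connection settings from the project configuration
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE'),
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # persist each scraped item as a plain dict
        self.db['news'].insert_one(dict(item))
        return item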