Code Example #1
File: ary_spider.py Project: Mustufain/news-crawler
 def parse_attr(self, response):
     """
     parses each url
     :param response:
     :return:
     """
     soup = BeautifulSoup(response.text, 'lxml')
     article_exists = is_article(soup)
     if article_exists:
         # scraping logic here
         news_item = NewsItem()
         news_item['url'] = response.url
         news_item['text'] = response.text
         anchor_tag = get_news_author(soup)
         time_tag = get_time_tag(soup)
         title_tag = get_news_headline(soup)
         # if anchor tag is found
         if anchor_tag:
             news_item['author'] = anchor_tag.text
         # if time tag is found
         if time_tag:
             posted_date = get_posted_date(time_tag)
             news_item['posted_date'] = posted_date
         # if title tag is found
         if title_tag:
             news_item['headline'] = title_tag.text
         yield news_item
     else:
         self.logger.info(
             'url %s does not contain news post',
             response.url)
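
The NewsItem class referenced throughout these examples is defined elsewhere in each project. A minimal sketch of what its Scrapy Item definition might look like, assuming only the fields used in the snippet above (field sets vary between projects):

 import scrapy

 class NewsItem(scrapy.Item):
     # Fields inferred from the usage above; real projects may define more
     url = scrapy.Field()
     text = scrapy.Field()
     author = scrapy.Field()
     posted_date = scrapy.Field()
     headline = scrapy.Field()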
Code Example #2
 def setUp(self):
     self.spider = Spider(name='spider')
     self.news_response = fake_response('data/news.html')
     self.sports_response = fake_response('data/sports.html')
     self.recent_news_response = fake_response('data/recent_news.html')
     self.recent_sports_response = fake_response('data/recent_sports.html')
     self.pipeline = NewsTextPipeline()
     self.item = NewsItem()
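
fake_response is not part of Scrapy; it is a common test helper that wraps a local HTML fixture in an HtmlResponse so spider callbacks can be tested offline. A minimal sketch, assuming the fixture paths are relative to the test module:

 import os
 from scrapy.http import HtmlResponse, Request

 def fake_response(file_name, url='http://www.example.com'):
     """Build an HtmlResponse from a local HTML fixture file."""
     path = os.path.join(os.path.dirname(__file__), file_name)
     with open(path, 'rb') as f:
         body = f.read()
     return HtmlResponse(url=url, request=Request(url=url),
                         body=body, encoding='utf-8')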
Code Example #3
 def parse_article(self, response):
     logging.debug(" **** RECEIVING %s to save **** ", response)
     item = NewsItem()
     item['date'] = self.get_date(response)
     item['body'] = self.get_body(response)
     item['title'] = self.get_title(response)
     item['link'] = self.get_url_suffix(response.url)
     item['full_link'] = response.url
     item['source_id'] = self.source_id
     item['keywords'] = self.keywords
     return item
Code Example #4
File: news_spider.py Project: phacus/nicetomeetyou
    def get_content(self, response):
        item = NewsItem()

        title = response.css('h1::text').get()
        pub_time = response.css('div.shareBar__info--author span::text').get()
        # All <p> elements except the first, as raw HTML strings
        content = response.css('p').getall()[1:]

        item['title'] = title
        item['pub_time'] = pub_time
        # Strip the markup so only the paragraph text remains
        item['content'] = remove_tags(''.join(content))

        yield item
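
The remove_tags helper is presumably imported from w3lib (from w3lib.html import remove_tags), the HTML utility library that Scrapy depends on; it strips markup from the joined paragraph strings while keeping their text.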
Code Example #5
    def parse_news(self, response):

        # Get the main element of the news page
        news_div = response.xpath("//div[@class='conteudo-pagina']")

        # Parse the page content; the leading dots make the queries
        # relative to news_div instead of the whole document
        date = news_div.xpath(".//span[@class='data']/text()").get()
        title = news_div.xpath(".//h1/text()").get().strip()
        url = response.url
        text = ' '.join(
            string.strip()
            for string in news_div.xpath(
                ".//div[@class='conteudo-materia']//p//text()").getall())

        # Instantiate the item with the parsed content
        news = NewsItem(date=date, title=title, url=url, text=text)

        yield news
Code Example #6
 def setUp(self):
     self.pipeline = DuplicatesPipeline()
     self.spider = Spider(name='spider')
     self.item = NewsItem()
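
DuplicatesPipeline is project code rather than a Scrapy built-in. A typical implementation, following the duplicates-filter pattern from the Scrapy documentation, drops any item whose URL has already been seen (using 'url' as the deduplication key is an assumption here):

 from scrapy.exceptions import DropItem

 class DuplicatesPipeline:
     def __init__(self):
         self.seen_urls = set()

     def process_item(self, item, spider):
         # Assumes 'url' uniquely identifies a news post
         if item['url'] in self.seen_urls:
             raise DropItem('Duplicate item found: %s' % item['url'])
         self.seen_urls.add(item['url'])
         return item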
Code Example #7
 def setUp(self):
     self.spider = Spider(name='spider')
     self.item = NewsItem()
     self.pipeline = DropEmptyRequiredFieldsPipeline()
Code Example #8
 def setUp(self):
     self.spider = Spider(name='spider')
     self.pipeline = NewsPlaceMentionedPipeline()
     self.item = NewsItem()
Code Example #9
 def setUp(self):
     self.pipeline = MongoPipeline('mongo_uri', 'mongo_db')
     self.spider = Spider(name='spider')
     self.item = NewsItem()
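
The MongoPipeline constructed here takes a connection URI and a database name, matching the item-storage example in the Scrapy documentation. A minimal sketch along those lines (the collection name and settings keys are assumptions):

 import pymongo

 class MongoPipeline:
     collection_name = 'news_items'

     def __init__(self, mongo_uri, mongo_db):
         self.mongo_uri = mongo_uri
         self.mongo_db = mongo_db

     @classmethod
     def from_crawler(cls, crawler):
         return cls(
             mongo_uri=crawler.settings.get('MONGO_URI'),
             mongo_db=crawler.settings.get('MONGO_DATABASE'),
         )

     def open_spider(self, spider):
         self.client = pymongo.MongoClient(self.mongo_uri)
         self.db = self.client[self.mongo_db]

     def close_spider(self, spider):
         self.client.close()

     def process_item(self, item, spider):
         # Store a plain dict copy of the item
         self.db[self.collection_name].insert_one(dict(item))
         return item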