Beispiel #1
0
    def get_article(self, url):
        '''Implementation for getting an article from the CBC.

        Args:
          url: A URL in the cbc.ca/news/* domain.

        Returns:
          The Article representing the article at that url, or None if
          unable to scrape the article.
        '''
        html = helpers.get_content(url)
        if not html:
            return None

        soup = BeautifulSoup(html)

        try:
            headline = soup.h1.string
        except AttributeError:
            log.error('Exception trying to scrape CBC headline from %s' %
                      (url))
            return None

        article = soup.find('div', attrs={'class': 'story-content'})
        # Bug fix: when the story-content div is missing, the old code let
        # an AttributeError escape instead of returning None as documented.
        if article is None:
            log.error('Could not find CBC article body at %s' % (url))
            return None
        paragraphs = article.find_all('p', attrs={'class': None})
        body = ' '.join([p.get_text() for p in paragraphs])
        log.info(headline)
        log.info(body)
        return news_interface.Article(headline, body, url, news_orgs.CBC)
Beispiel #2
0
    def get_article(self, url):
        '''Implementation for getting an article from CNN.

        Args:
          url: A URL in the www.cnn.* domain.

        Returns:
          The Article representing the article at that url, or None if the
          page could not be fetched.
        '''
        html = helpers.get_content(url)
        if not html:
            return None

        soup = BeautifulSoup(html)
        title_tag = soup.find("title")
        # Page titles look like "<headline> - CNN.com".  Split on the first
        # dash only, so hyphens inside the headline survive; this also fixes
        # the IndexError the old code (unused `date = k[1]`) hit on titles
        # containing no dash at all.
        headline = title_tag.text.split("-", 1)[0]
        paragraph_tags = soup.findAll("p", attrs={'class': 'zn-body__paragraph'})
        texts = []
        for paragraph in paragraph_tags:
            try:
                # NOTE(review): .decode on an already-unicode .text implies
                # Python 2, where it ascii-encodes first — that is what can
                # raise UnicodeEncodeError below; kept as-is.
                texts.append(paragraph.text.decode("utf-8").replace("\"", "'"))
            except UnicodeEncodeError:
                # Best-effort: skip paragraphs that cannot be represented.
                pass
        # str.join replaces the old quadratic += loop and drops the stray
        # trailing space it appended to the body.
        body = " ".join(texts)
        log.info(headline)
        log.info(body)
        return news_interface.Article(headline, body, url, news_orgs.CNN)
Beispiel #3
0
    def get_article(self, url):
        '''Implementation for getting an article from the Guardian.

        url: A URL in the guardian.com domain.

        Returns: The Article representing the article at that url, or None
        if the page could not be fetched or its body div was not found.
        '''
        html = helpers.get_content(url)
        if not html:
            return None

        soup = BeautifulSoup(html)
        headline = soup.h1.string.strip('\n')

        # theguardian.* paths and regular guardian.com pages mark the
        # article body with different class names.
        if url.split('.com/')[1].startswith('theguardian'):
            article = soup.find('div',
                                attrs={'class': 'flexible-content-body'})
        else:
            article = soup.find('div',
                                attrs={'class': 'content__article-body'})
        # Bug fix: previously an AttributeError escaped here when neither
        # body div was present; fail soft like the other scrapers instead.
        if article is None:
            log.error('Could not find Guardian article body at %s' % (url))
            return None
        paragraphs = article.find_all('p', attrs={'class': None})
        body = ' '.join([p.get_text() for p in paragraphs])

        log.info(headline)
        log.info(body)
        return news_interface.Article(headline, body, url, news_orgs.GUARDIAN)
Beispiel #4
0
    def get_article(self, url):
        '''Implementation for getting an article from the Russia Today.

        url: A URL in the russia_today.com domain.

        Returns: The Article representing the article at that url.
        '''
        html = helpers.get_content(url)
        if not html:
            return None

        soup = BeautifulSoup(html)
        headline = helpers.decode(soup.h1.string)

        story = soup.find('div', attrs={'class': 'cont-wp'})
        decoded = [helpers.decode(par.get_text())
                   for par in story.find_all('p', attrs={'class': None})]

        def _is_content(text):
            # Drop the 'Tags'/'Trends' headers and 'READ MORE' links that
            # appear inline with the article paragraphs.
            return not (text.startswith('\nREAD')
                        or text in ('Tags', 'Trends'))

        body = ' '.join(filter(_is_content, decoded))

        log.info(headline)
        log.info(body)
        return news_interface.Article(headline, body, url,
                                      news_orgs.RUSSIA_TODAY)
  def get_article(self, url):
    '''Implementation for getting an article from the Globe and Mail.

    url: A URL in the theglobeandmail.com/* domain.

    Returns: The Article representing the article at that url, or None if
    the page could not be fetched.
    '''
    html = helpers.get_content(url)
    if not html:
      return None

    soup = BeautifulSoup(html)

    # The h1 contains a link whose text is not part of the headline;
    # remove it before extracting the headline text.
    soup.h1.a.extract()
    headline = soup.h1.get_text().encode('ascii', 'ignore').strip('\n')
    article = soup.find('div', attrs={'class': 'entry-content'})

    # Remove other content that is inline with the article text.
    # (Idiom fix: the old code abused list comprehensions purely for their
    # side effects; plain loops express the intent.)
    for div in article.find_all('div', attrs={'class': 'entry-related'}):
      div.extract()
    for aside in article.find_all('aside'):
      aside.extract()

    paragraphs = article.find_all('p', attrs={'class': None})
    body = ' '.join(p.get_text().encode('ascii', 'ignore')
                    for p in paragraphs)

    log.info(headline)
    log.info(body)
    return news_interface.Article(headline, body, url, news_orgs.GLOBE_AND_MAIL)
Beispiel #6
0
    def get_article(self, url):
        '''Implementation for getting an article from JPost.

        Args:
          url: A URL in the www.jpost.com/* domain.

        Returns:
          The Article representing the article at that url, or None on
          fetch/scrape failure.
        '''
        html = helpers.get_content(url)
        if not html:
            return None

        soup = BeautifulSoup(html)

        try:
            title_tag = soup.find('h1', attrs={'class': 'article-title'})
            headline = title_tag.text.strip().strip('\r\n')
            text_div = soup.find("div", {"class": "article-text"})
            article = text_div.find("p")
        except Exception as e:
            log.error('Error scraping JPost article at %s: %s' % (url, e))
            # Bug fix: without this return, execution fell through and
            # `body = article.text` raised a NameError that masked the
            # original scraping error.
            return None

        body = article.text

        log.info(headline)
        log.info(body)
        return news_interface.Article(headline, body, url, news_orgs.JPOST)
Beispiel #7
0
    def get_article(self, url):
        '''Returns an Article representing the article at url.'''
        # Scrape the page; any failure in fetch or extraction is logged
        # with the offending line number and turned into a None return.
        try:
            html = helpers.get_content(url)
            if not html:
                return None

            soup = BeautifulSoup(html)
            headline = self.get_headline(soup)
            body = self.get_body(soup)
            date = self.get_date(soup)
        except Exception as e:
            logger.log.error(
                "Hit exception on line number %s getting article for %s:"
                " %s" % (sys.exc_info()[-1].tb_lineno, url, e))
            return None

        # Decode the scraped fields; a decoding failure also yields None.
        try:
            headline, body, date = (helpers.decode(headline),
                                    helpers.decode(body),
                                    helpers.decode(date))
        except Exception as e:
            logger.log.error('Error on line %s decoding url %s: %s' %
                             (sys.exc_info()[-1].tb_lineno, url, e))
            return None

        for label, value in (('URL', url), ('headline', headline),
                             ('Body', body)):
            logger.log.info('%s: %s' % (label, value))

        return news_interface.Article(headline, body, url, self.news_org, date)
Beispiel #8
0
    def get_article(self, url):
        '''Implementation for getting an article from the NYTimes.

        url: A URL in the ny_times.com domain.

        Returns: The Article representing the article at that url, or None
        if the page could not be fetched.
        '''
        html = helpers.get_content(url)
        if not html:
            return None

        soup = BeautifulSoup(html)
        headline = helpers.decode(soup.h1.string)

        try:
            article = soup.find('div', attrs={'class': 'articleBody'})
            paragraphs = article.find_all('p',
                                          attrs={'itemprop': 'articleBody'})
        except AttributeError:
            # This article's html uses different attributes... sigh...
            # Hopefully there are only 2 versions.
            article = soup.find('div', attrs={'class': 'story-body'})
            paragraphs = article.find_all('p',
                                          attrs={'class': 'story-content'})

        # Idiom fix: the old code wrapped the decoded list in a redundant
        # identity comprehension before joining.
        body = ' '.join(helpers.decode(p.get_text()) for p in paragraphs)

        log.info(headline)
        log.info(body)
        return news_interface.Article(headline, body, url, news_orgs.NY_TIMES)
Beispiel #9
0
  def get_article(self, url):
    '''Implementation for getting an article from USA Today.

    url: A URL in the http://www.usatoday.com/story/* domain.

    Returns: The Article representing the article at that url.
    '''
    html = helpers.get_content(url)
    if not html:
      return None

    page = BeautifulSoup(html)
    story = page.article
    headline = helpers.decode(story.h1.string)
    # Unclassed <p> tags inside the <article> element hold the body text.
    texts = [helpers.decode(par.get_text())
             for par in story.find_all('p', attrs={'class': None})]
    body = ' '.join(texts)
    return news_interface.Article(headline, body, url, news_orgs.USA_TODAY)
Beispiel #10
0
    def get_article(self, url):
        '''Implementation for getting an article from the New York Post.

        url: A URL in the nypost.com domain.

        Returns: The Article representing the article at that url.
        '''
        html = helpers.get_content(url)
        if not html:
            return None

        soup = BeautifulSoup(html)
        # The headline is the link text inside the page's h1.
        headline = helpers.decode(soup.h1.a.string)
        content_div = soup.find('div', attrs={'class': 'entry-content'})
        texts = []
        for par in content_div.find_all('p', attrs={'class': None}):
            texts.append(helpers.decode(par.get_text()))
        body = ' '.join(texts)
        log.info(headline)
        log.info(body)
        return news_interface.Article(headline, body, url, news_orgs.NY_POST)
Beispiel #11
0
    def get_article(self, url):
        '''Implementation for getting an article from BBC.

        url: A URL in the www.bbc.* domain.

        Returns: The Article representing the article at that url.
        '''
        html = helpers.get_content(url)
        if not html:
            return None

        soup = BeautifulSoup(html)
        headline = soup.h1.string
        story_div = soup.find('div', attrs={'class': 'story-body'})
        # Unclassed <p> tags inside the story-body div are the article text.
        par_texts = [par.get_text()
                     for par in story_div.find_all('p',
                                                   attrs={'class': None})]
        body = ' '.join(par_texts)
        log.info(headline)
        log.info(body)
        return news_interface.Article(headline, body, url, news_orgs.BBC)
Beispiel #12
0
    def get_article(self, url):
        '''Implementation for getting an article from Al Jazeera.

        Args:
          url: A URL in the aljazeera.* domain.

        Returns:
          The Article representing the article at that url, or None if
          unable to get the Article.
        '''
        html = helpers.get_content(url)
        if not html:
            return None

        soup = BeautifulSoup(html)

        # The headline h1 class differs between regular and opinion pages;
        # probe each candidate class until one matches.
        headline = None
        for h1_class in ("heading-story", "articleOpinion-title"):
            h1_tag = soup.find("h1", {"class": h1_class})
            if h1_tag is not None:
                headline = h1_tag.string
                break
        if not headline:
            log.error(
                'Exception trying to scrape Al Jazeera headline from %s' %
                (url))
            return None

        headline = helpers.decode(headline)

        # The body div class also varies; fall back to the second form.
        body_div = soup.find("div", {"class": "article-body"})
        if body_div is None:
            body_div = soup.find("div", {"class": "text"})
        par_tags = body_div.findAll("p")
        body = ' '.join(helpers.decode(par.text) for par in par_tags)
        return news_interface.Article(headline, body, url, news_orgs.ALJAZEERA)
Beispiel #13
0
  def get_article(self, url):
    '''Implementation for getting an article from Todays Zaman.

    Args:
      url: A URL in the www.todayszaman.com/* domain.

    Returns:
      The Article representing the article at that url.
    '''
    html = helpers.get_content(url)
    if not html:
      return None

    soup = BeautifulSoup(html)
    # The page <title> doubles as the headline.
    title_tag = soup.find("title")
    headline = helpers.decode(title_tag.text)
    news_div = soup.find("div", {"id": "newsText"})
    texts = []
    for par in news_div.findAll("p"):
      texts.append(helpers.decode(par.text))
    body = ' '.join(texts)
    log.info(headline)
    log.info(body)
    return news_interface.Article(headline, body, url, news_orgs.TODAYS_ZAMAN)
Beispiel #14
0
    def get_article(self, url):
        '''Implementation for getting an article from REUTERS.

        url: A URL in the www.reuters.com* domain.

        Returns: The Article representing the article at that url, or None
        if the page could not be fetched.
        '''
        html = helpers.get_content(url)
        if not html:
            return None

        soup = BeautifulSoup(html)
        # Headline and body both live in the same content column; look the
        # div up once (the old code duplicated the identical soup.find).
        content_div = soup.find('div',
                                attrs={'class': 'column1 gridPanel grid8'})
        headline = helpers.decode(content_div.h1.string)
        paragraphs = content_div.findAll("p")
        body = ' '.join([helpers.decode(p.text) for p in paragraphs])
        log.info(headline)
        log.info(body)
        return news_interface.Article(headline, body, url, news_orgs.REUTERS)
Beispiel #15
0
    def get_article(self, url):
        '''Implementation for getting an article from Times of Israel.

        Args:
          url: A URL in the www.timesofisrael.com/* domain.

        Returns:
          The Article representing the article at that url.
        '''
        html = helpers.get_content(url)
        if not html:
            return None

        soup = BeautifulSoup(html)

        headline_tag = soup.find('h1', attrs={'class': 'headline'})
        headline = helpers.decode(headline_tag.text)
        # Body paragraphs are tagged with itemprop="articleBody".
        par_tags = soup.findAll("p", {"itemprop": "articleBody"})
        body = ' '.join(helpers.decode(par.text) for par in par_tags)

        log.info(headline)
        log.info(body)
        return news_interface.Article(headline, body, url,
                                      news_orgs.TIMES_OF_ISRAEL)