Example #1
0
 def test_article_build(self):
     """Build an Article from a URL and run the full download/parse/nlp pipeline."""
     url = "http://abcnews.go.com/blogs/politics/2013/12/states-cite-surge-in-obamacare-sign-ups-ahead-of-first-deadline/"
     article = newspaper.build_article(url)
     # Assert the truthy value directly instead of comparing to True.
     assert isinstance(article, Article)
     article.download()
     article.parse()
     article.nlp()
Example #2
0
def article_extractor(newspaper_url, title_topic=None):
    """Pull every article the newspaper package can discover on a site, without caching.

    :param newspaper_url: URL of the publication's homepage.
    :param title_topic: substring used to filter article URLs; if None, keep all.
    :return: DataFrame with 'title' and 'text' columns, nothing else.
    TODO: parse extra fields and include in the data frame; currently that
          breaks pickle serialization.
    """
    dd = defaultdict(list)
    # memoize_articles=False forces a fresh crawl instead of reusing the cache.
    source = newspaper.build(newspaper_url, memoize_articles=False)
    article_urls = [a.url for a in source.articles]
    print("Articles: %d" % len(article_urls))
    if title_topic is None:
        relevant_urls = list(article_urls)
    else:
        relevant_urls = [u for u in article_urls if title_topic in u]

    for url in relevant_urls:
        art = newspaper.build_article(url)
        try:
            art.download()
            art.parse()
        except newspaper.article.ArticleException:
            # Best-effort scrape: skip articles that fail to download or parse.
            continue
        dd['title'].append(art.title)
        dd['text'].append(art.text)
    return pd.DataFrame.from_dict(dd)
Example #3
0
    def get(self):
        """Download, parse, and NLP-process the article named by the request's
        'url' argument, and return its extracted fields as a dict."""
        args = article_parser.parse_args()
        url = args['url']
        # `config` is a module-level newspaper configuration defined elsewhere.
        article = newspaper.build_article(url, config)
        article.download()
        article.parse()
        article.nlp()

        fields = {
            'url': article.url,
            'title': article.title,
            'top_image': article.top_img,
            'images': list(article.imgs),
            'text': article.text,
            'html': article.article_html,
            'keywords': article.keywords,
            'authors': article.authors,
            'summary': article.summary,
            'meta_description': article.meta_description,
            'meta_lang': article.meta_lang,
            'meta_favicon': article.meta_favicon,
            'meta_keywords': article.meta_keywords,
            'canonical_link': article.canonical_link,
            'tags': [unicode(tag) for tag in article.tags],
            'movies': article.movies,
            'additional_data': article.additional_data,
        }
        return fields
Example #4
0
 def test_article_build(self):
     """Build an Article from a URL and run the full download/parse/nlp pipeline."""
     url = 'http://abcnews.go.com/blogs/politics/2013/12/states-cite-surge-in-obamacare-sign-ups-ahead-of-first-deadline/'
     article = newspaper.build_article(url)
     # Assert the truthy value directly instead of comparing to True.
     assert isinstance(article, Article)
     article.download()
     article.parse()
     article.nlp()
Example #5
0
 def test_article_build(self):
     """Build an Article from a URL and run it through build() and nlp()."""
     url = ('http://abcnews.go.com/blogs/politics/2013/12/'
            'states-cite-surge-in-obamacare-sign-ups-ahead'
            '-of-first-deadline/')
     article = newspaper.build_article(url)
     # Assert the boolean result directly rather than comparing with `is True`.
     assert isinstance(article, Article)
     article.build()
     article.nlp()
Example #6
0
 def test_article_build(self):
     """Build an Article from a URL and run it through build() and nlp()."""
     url = ('http://abcnews.go.com/blogs/politics/2013/12/'
            'states-cite-surge-in-obamacare-sign-ups-ahead'
            '-of-first-deadline/')
     article = newspaper.build_article(url)
     # Assert the boolean result directly rather than comparing with `is True`.
     assert isinstance(article, Article)
     article.build()
     article.nlp()
Example #7
0
def _parse_newspaper(article):
    """Backfill empty fields on *article* by re-parsing its HTML with newspaper.

    Existing (truthy) values on *article* always win; newspaper's extraction
    is used only as a fallback.
    """
    newspaper_article = newspaper.build_article(article.url)
    # Reuse the HTML we already fetched instead of downloading again.
    newspaper_article.set_html(article.html)
    newspaper_article.parse()
    article.text = article.text or newspaper_article.text
    article.title = article.title or newspaper_article.title
    article.authors = article.authors or newspaper_article.authors
    if not article.keywords:
        # Merge body keywords with <meta> keywords, de-duplicated.
        keywords = newspaper_article.keywords or []
        other_keywords = newspaper_article.meta_keywords or []
        article.keywords = list(set(keywords + other_keywords))
    article.images = article.images or newspaper_article.images
    # NOTE(review): newspaper populates .summary only after .nlp(); after
    # .parse() alone it is empty -- confirm this fallback ever fires.
    article.summary = article.summary or newspaper_article.summary
    article.meta_favicon = article.meta_favicon or newspaper_article.meta_favicon
    article.meta_lang = article.meta_lang or newspaper_article.meta_lang
    # Bug fix: the newspaper Article attribute is `publish_date`;
    # `published_date` does not exist and raised AttributeError.
    article.pub_date = article.pub_date or newspaper_article.publish_date
Example #8
0
def _parse_newspaper(article, doc):
    """Backfill fields on *article* (that fail the good() check) from a
    newspaper re-parse of its already-downloaded HTML."""
    newspaper_article = newspaper.build_article(article.url)
    newspaper_article.set_html(article.html)
    newspaper_article.parse()
    article.text = good(article.text) or newspaper_article.text
    article.title = good(article.title) or newspaper_article.title
    article.authors = good(article.authors) or newspaper_article.authors
    if not good(article.keywords):
        # Merge body keywords with <meta> keywords, de-duplicated.
        keywords = newspaper_article.keywords or []
        other_keywords = newspaper_article.meta_keywords or []
        article.keywords = list(set(keywords + other_keywords))
    article.images = good(article.images) or list(newspaper_article.images)
    article.summary = good(article.summary) or newspaper_article.summary
    if not good(article.summary):
        # Bug fix: guard against empty extracted text, where sent_tokenize
        # returns [] and the previous unconditional [0] raised IndexError.
        sentences = nltk.sent_tokenize(newspaper_article.text)
        if sentences:
            article.summary = sentences[0]
    article.meta_favicon = good(article.meta_favicon) or newspaper_article.meta_favicon
    article.meta_lang = good(article.meta_lang) or newspaper_article.meta_lang
    article.pub_date = good(article.pub_date) or newspaper_article.publish_date
Example #9
0
def _parse_newspaper(article, doc):
    """Backfill fields on *article* (that fail the good() check) from a
    newspaper re-parse of its already-downloaded HTML."""
    parsed = newspaper.build_article(article.url)
    parsed.set_html(article.html)
    parsed.parse()

    def pick(current, fallback):
        # Keep the existing value when it passes good(); otherwise fall back.
        return good(current) or fallback

    article.text = pick(article.text, parsed.text)
    article.title = pick(article.title, parsed.title)
    article.authors = pick(article.authors, parsed.authors)
    if not good(article.keywords):
        # Merge body keywords with <meta> keywords, de-duplicated.
        keywords = parsed.keywords or []
        other_keywords = parsed.meta_keywords or []
        article.keywords = list(set(keywords + other_keywords))
    article.images = good(article.images) or list(parsed.images)
    article.summary = pick(article.summary, parsed.summary)
    if not good(article.summary):
        # Last resort: first sentence of the extracted text, if any.
        sentences = nltk.sent_tokenize(parsed.text)
        if len(sentences) > 0:
            article.summary = sentences[0]
    article.meta_favicon = pick(article.meta_favicon, parsed.meta_favicon)
    article.meta_lang = pick(article.meta_lang, parsed.meta_lang)
    article.pub_date = pick(article.pub_date, parsed.publish_date)
Example #10
0
def get_article(url, lang):
    """Download and parse the article at *url* in the given language."""
    # NOTE(review): newspaper's Article normally takes `language=`; confirm
    # the `lang=` keyword is actually honored by this build_article wrapper.
    article = newspaper.build_article(url, lang=lang)
    article.download()
    article.parse()
    return article
Example #11
0
    def download_and_parse(self):
        """Download the article at self.url, parse it with newspaper, and copy
        the extracted fields onto this object; empty fields become None."""
        article = newspaper.build_article(self.url)
        article.download()
        article.parse()
        self.text = article.text
        self.title = article.title
        self.download_date = time.localtime()

        # Optional parameters: None when extraction found nothing.
        self.authors = article.authors or None

        self.source_domain = None

        # Not sure how to implement right now
        self.category = None

        # Merge keywords and meta_keywords into the same param
        self.keywords = []
        if article.keywords:
            self.keywords = article.keywords
        if article.meta_keywords:
            self.keywords = list(set(self.keywords + article.meta_keywords))

        # Bug fix: the original assigned a local `keywords` here, so the
        # meaningless [''] keyword list was never cleared on the instance.
        if self.keywords == ['']:
            self.keywords = None

        self.images = article.images or None

        # Will implement later
        self.location = None

        self.summary = article.summary or None

        # Not sure how to implement
        self.suggested_articles = None

        self.meta_favicon = article.meta_favicon or None

        self.meta_lang = article.meta_lang or None

        # NOTE(review): newspaper's Article usually exposes `publish_date`;
        # confirm this library version really provides `published_date`.
        self.pub_date = article.published_date or None

        # Give a caller-supplied hook a chance to post-process this object.
        if self._parser:
            self._parser(self)

        self.parsed = True
Example #12
0
    def download_and_parse(self):
        """Fetch self.url with newspaper, parse it, and populate this object's
        fields from the extraction; absent values are stored as None."""
        article = newspaper.build_article(self.url)
        article.download()
        article.parse()
        self.text = article.text
        self.title = article.title
        self.download_date = time.localtime()

        # Optional parameters: None when extraction found nothing.
        self.authors = article.authors or None

        self.source_domain = None

        # Not sure how to implement right now
        self.category = None

        # Merge keywords and meta_keywords into the same param
        self.keywords = []
        if article.keywords:
            self.keywords = article.keywords
        if article.meta_keywords:
            self.keywords = list(set(self.keywords + article.meta_keywords))

        # Bug fix: previously a *local* `keywords` was assigned here, leaving
        # a useless [''] keyword list on the instance; clear it for real.
        if self.keywords == ['']:
            self.keywords = None

        self.images = article.images or None

        # Will implement later
        self.location = None

        self.summary = article.summary or None

        # Not sure how to implement
        self.suggested_articles = None

        self.meta_favicon = article.meta_favicon or None

        self.meta_lang = article.meta_lang or None

        # NOTE(review): newspaper's Article usually exposes `publish_date`;
        # confirm this library version really provides `published_date`.
        self.pub_date = article.published_date or None

        # Give a caller-supplied hook a chance to post-process this object.
        if self._parser:
            self._parser(self)

        self.parsed = True
Example #13
0
    def get(self):
        """Handle GET: download, parse, and NLP-process the article at
        args['url'], then return it as JSON or as a markdown-ish text
        document depending on args['format'].

        Python 2 handler; relies on module-level helpers defined elsewhere
        in the file: article_parser, config, output_json, output_text,
        make_response, redirect, HTML2Text, quote.
        """
        args = article_parser.parse_args()
        url = args['url']
        output_format = args['format']
        # `config` is a module-level newspaper configuration object.
        article = newspaper.build_article(url, config)
        article.download()
        article.parse()
        article.nlp()  # populates keywords and summary

        # Optionally convert the extracted article HTML to markdown-ish text.
        markdownify = bool(args['markdownify'])
        text = article.text
        if markdownify:
            h = HTML2Text(baseurl=args['url'])
            h.body_width = 0  # disable hard line wrapping
            text = h.handle(article.article_html)

        data = {
            'url': article.url,
            'title': article.title,
            'top_image': article.top_img,
            'images': [x for x in article.imgs],
            'text': text,
            'html': article.article_html,
            'keywords': article.keywords,
            'authors': article.authors,
            'summary': article.summary,
            'meta_description': article.meta_description,
            'meta_lang': article.meta_lang,
            'meta_favicon': article.meta_favicon,
            'meta_keywords': article.meta_keywords,
            'canonical_link': article.canonical_link,
            'tags': [unicode(x) for x in article.tags],
            'movies': article.movies,
            'additional_data': article.additional_data,
        }

        if output_format == 'json':
            return output_json(data, 200, {})

        if output_format == 'text':
            # Build a front-matter header followed by the article body.
            output = u'---\n'
            output += u'link: %s\n' % (article.url)
            output += u'title: %s\n' % (article.title)
            output += u'authors: %s\n' % (u', '.join(article.authors))
            output += u'keywords: %s\n' % (u', '.join(article.keywords))
            output += u'---\n\n'
            if args['include_summary']:
                output += u'# Summary\n\n%s\n' % (article.summary)

            output += text

            # Optional x-callback-style redirect into a note-taking app.
            r = args.get('redirect')
            if r and r in ['nvalt', 'notsey']:
                title = u'%s - %s' % (article.title, u', '.join(article.authors))
                title = title.encode('utf-8')
                output = output.encode('utf-8')

                if r == 'nvalt':
                    opts = {
                        'txt': output,
                        'title': title,
                    }
                    opts = '&'.join(['%s=%s' % (key, quote(val)) for key, val in opts.items()])
                    url = 'nvalt://make/?' + opts

                if r == 'notsey':
                    opts = {
                        'text': output,
                        'name': title,
                    }
                    opts = '&'.join(['%s=%s' % (key, quote(val)) for key, val in opts.items()])
                    url = 'notesy://x-callback-url/append?' + opts

                return make_response(redirect(url))

            return output_text(output, 200, {'Content-Type': 'text'})
Example #14
0
    def get(self):
        """Handle GET: download the article at args['url'], 404 when nothing
        was fetched, otherwise parse/NLP it and return it as JSON or as a
        markdown-ish text document depending on args['format'].

        Python 2 handler (bare `print` statements, `unicode`); relies on
        module-level helpers defined elsewhere in the file: article_parser,
        config, abort, output_json, output_text, make_response, redirect,
        HTML2Text, quote.
        """
        args = article_parser.parse_args()
        url = args['url']
        output_format = args['format']
        # `config` is a module-level newspaper configuration object.
        article = newspaper.build_article(url, config)
        article.download()
        if article.html == "":
            # Download produced nothing; treat as not found.
            abort(404)
        else:
            article.parse()
            # Force title/text to ASCII, dropping non-ASCII characters,
            # before running NLP.
            if isinstance(article.title, unicode):
                print "Rencoding article title"
                article.set_title(article.title.encode('ascii', 'ignore'))

            if isinstance(article.text, unicode):
                print "Rencoding article text"
                article.set_text(article.text.encode('ascii', 'ignore'))

            article.nlp()  # populates keywords and summary

            # Optionally convert the extracted article HTML to markdown-ish text.
            markdownify = bool(args['markdownify'])
            text = article.text
            if markdownify:
                h = HTML2Text(baseurl=args['url'])
                h.body_width = 0  # disable hard line wrapping
                text = h.handle(article.article_html)

            data = {
                'url': article.url,
                'title': article.title,
                'top_image': article.top_img,
                # 'images': [x for x in article.imgs],
                'text': text,
                # 'html': article.article_html,
                'keywords': article.keywords,
                'authors': article.authors,
                'summary': article.summary,
                'meta_description': article.meta_description,
                'meta_lang': article.meta_lang,
                # 'meta_favicon': article.meta_favicon,
                'meta_keywords': article.meta_keywords,
                'canonical_link': article.canonical_link,
                'tags': [unicode(x) for x in article.tags],
                'movies': article.movies,
                'additional_data': article.additional_data,
            }

            # Optionally surface the OpenGraph image from the parsed DOM.
            if args['og_image']:
                og = article.doc.xpath("//meta[@property='og:image']/@content")
                if len(og) > 0:
                    data['opengraph_image'] = og[0]

            if output_format == 'json':
                return output_json(data, 200, { 'Content-Type' : 'application/json' })

            if output_format == 'text':
                # Build a front-matter header followed by the article body.
                output = u'---\n'
                output += u'link: %s\n' % (article.url)
                output += u'title: %s\n' % (article.title)
                output += u'authors: %s\n' % (u', '.join(article.authors))
                output += u'keywords: %s\n' % (u', '.join(article.keywords))
                output += u'---\n\n'
                if args['include_summary']:
                    output += u'# Summary\n\n%s\n' % (article.summary)

                output += text

                # Optional x-callback-style redirect into a note-taking app.
                r = args.get('redirect')
                if r and r in ['nvalt', 'notsey']:
                    title = u'%s - %s' % (article.title, u', '.join(article.authors))
                    title = title.encode('utf-8')
                    output = output.encode('utf-8')

                    if r == 'nvalt':
                        opts = {
                            'txt': output,
                            'title': title,
                        }
                        opts = '&'.join(['%s=%s' % (key, quote(val)) for key, val in opts.items()])
                        url = 'nvalt://make/?' + opts

                    if r == 'notsey':
                        opts = {
                            'text': output,
                            'name': title,
                        }
                        opts = '&'.join(['%s=%s' % (key, quote(val)) for key, val in opts.items()])
                        url = 'notesy://x-callback-url/append?' + opts

                    return make_response(redirect(url))

                return output_text(output, 200, {'Content-Type': 'text'})