def test_article_build(self):
    """Smoke-test the full article pipeline against a live ABC News URL.

    Checks that newspaper.build_article returns an Article and that
    download/parse/nlp all complete without raising.
    """
    # Implicit string concatenation: same URL value, readable line length.
    url = ("http://abcnews.go.com/blogs/politics/2013/12/"
           "states-cite-surge-in-obamacare-sign-ups-ahead-of-first-deadline/")
    article = newspaper.build_article(url)
    # Assert the boolean directly instead of comparing `== True` (PEP 8).
    assert isinstance(article, Article)
    article.download()
    article.parse()
    article.nlp()
def article_extractor(newspaper_url, title_topic=None):
    """Pull all articles the newspaper package can reach from a root URL.

    Builds the source without caching, optionally filters article URLs by a
    topic substring, then downloads and parses each article.

    :param newspaper_url: url of homepage
    :param title_topic: substring for filtering article URLs; if None get all
    :return: dataframe with article title and text, nothing else
    TODO: parse extra fields and include in data frame, curr issue is this
          breaks pickle serialization
    """
    dd = defaultdict(list)
    source = newspaper.build(newspaper_url, memoize_articles=False)
    arts = [i.url for i in source.articles]
    print("Articles: " + str(len(arts)))
    if title_topic is None:
        # No filter requested: keep every discovered URL.
        relevant_arts = list(arts)
    else:
        relevant_arts = [i for i in arts if title_topic in i]
    for art_url in relevant_arts:
        art = newspaper.build_article(art_url)
        try:
            art.download()
            art.parse()
            dd['title'].append(art.title)
            dd['text'].append(art.text)
        except newspaper.article.ArticleException:
            # Best-effort crawl: skip articles that fail to download/parse.
            continue
    return pd.DataFrame.from_dict(dd)
def get(self):
    """GET handler: download, parse and NLP-process the requested article.

    Reads the target URL from the parsed request arguments and returns a
    JSON-serializable dict of every extracted article field.
    """
    request_args = article_parser.parse_args()
    target_url = request_args['url']

    article = newspaper.build_article(target_url, config)
    article.download()
    article.parse()
    article.nlp()

    return {
        'url': article.url,
        'title': article.title,
        'top_image': article.top_img,
        'images': list(article.imgs),
        'text': article.text,
        'html': article.article_html,
        'keywords': article.keywords,
        'authors': article.authors,
        'summary': article.summary,
        'meta_description': article.meta_description,
        'meta_lang': article.meta_lang,
        'meta_favicon': article.meta_favicon,
        'meta_keywords': article.meta_keywords,
        'canonical_link': article.canonical_link,
        'tags': [unicode(tag) for tag in article.tags],
        'movies': article.movies,
        'additional_data': article.additional_data,
    }
def test_article_build(self):
    """Smoke-test building an Article from a live URL and running the
    download/parse/nlp pipeline end to end."""
    url = ('http://abcnews.go.com/blogs/politics/2013/12/'
           'states-cite-surge-in-obamacare-sign-ups-ahead-of-first-deadline/')
    article = newspaper.build_article(url)
    # Assert the boolean directly instead of comparing `== True` (PEP 8).
    assert isinstance(article, Article)
    article.download()
    article.parse()
    article.nlp()
def test_article_build(self):
    """Smoke-test that build_article yields an Article and that
    build()/nlp() run without raising."""
    url_parts = (
        'http://abcnews.go.com/blogs/politics/2013/12/',
        'states-cite-surge-in-obamacare-sign-ups-ahead',
        '-of-first-deadline/',
    )
    url = ''.join(url_parts)
    article = newspaper.build_article(url)
    assert isinstance(article, Article) is True
    # build() covers download + parse in this newspaper API.
    article.build()
    article.nlp()
def _parse_newspaper(article):
    """Backfill missing fields on *article* from a newspaper parse of its
    already-fetched HTML.

    Truthy values already on *article* always win; newspaper output is only
    used as a fallback.
    """
    newspaper_article = newspaper.build_article(article.url)
    # Reuse the HTML we already have instead of downloading again.
    newspaper_article.set_html(article.html)
    newspaper_article.parse()
    article.text = article.text or newspaper_article.text
    article.title = article.title or newspaper_article.title
    article.authors = article.authors or newspaper_article.authors
    if not article.keywords:
        # Merge body keywords with <meta> keywords, de-duplicated.
        keywords = newspaper_article.keywords or []
        other_keywords = newspaper_article.meta_keywords or []
        article.keywords = list(set(keywords + other_keywords))
    article.images = article.images or newspaper_article.images
    article.summary = article.summary or newspaper_article.summary
    article.meta_favicon = article.meta_favicon or newspaper_article.meta_favicon
    article.meta_lang = article.meta_lang or newspaper_article.meta_lang
    # Bug fix: newspaper's Article exposes `publish_date`, not
    # `published_date` — the original attribute read could never yield a date.
    article.pub_date = article.pub_date or newspaper_article.publish_date
def _parse_newspaper(article, doc):
    """Backfill fields of *article* that fail the `good` check from a
    newspaper parse of its stored HTML.

    Fields that are already `good` are left untouched.
    """
    newspaper_article = newspaper.build_article(article.url)
    # Reuse the HTML we already have instead of downloading again.
    newspaper_article.set_html(article.html)
    newspaper_article.parse()
    article.text = good(article.text) or newspaper_article.text
    article.title = good(article.title) or newspaper_article.title
    article.authors = good(article.authors) or newspaper_article.authors
    if not good(article.keywords):
        # Merge body keywords with <meta> keywords, de-duplicated.
        keywords = newspaper_article.keywords or []
        other_keywords = newspaper_article.meta_keywords or []
        article.keywords = list(set(keywords + other_keywords))
    article.images = good(article.images) or list(newspaper_article.images)
    article.summary = good(article.summary) or newspaper_article.summary
    if not good(article.summary):
        # Bug fix: the original indexed sent_tokenize(...)[0] unconditionally,
        # raising IndexError when the parsed text contains no sentences.
        sentences = nltk.sent_tokenize(newspaper_article.text)
        if len(sentences) > 0:
            article.summary = sentences[0]
    article.meta_favicon = good(article.meta_favicon) or newspaper_article.meta_favicon
    article.meta_lang = good(article.meta_lang) or newspaper_article.meta_lang
    article.pub_date = good(article.pub_date) or newspaper_article.publish_date
def _parse_newspaper(article, doc):
    """Fill any field of *article* that is not `good` from a fresh
    newspaper parse of the HTML it already holds.

    Fields that pass the `good` check keep their existing values.
    """
    np_article = newspaper.build_article(article.url)
    np_article.set_html(article.html)
    np_article.parse()

    article.text = good(article.text) or np_article.text
    article.title = good(article.title) or np_article.title
    article.authors = good(article.authors) or np_article.authors

    if not good(article.keywords):
        # Union of body keywords and <meta> keywords, duplicates dropped.
        merged = set(np_article.keywords or []) | set(np_article.meta_keywords or [])
        article.keywords = list(merged)

    article.images = good(article.images) or list(np_article.images)
    article.summary = good(article.summary) or np_article.summary
    if not good(article.summary):
        # Fall back to the first sentence of the parsed text, if any exists.
        sentences = nltk.sent_tokenize(np_article.text)
        if sentences:
            article.summary = sentences[0]

    article.meta_favicon = good(article.meta_favicon) or np_article.meta_favicon
    article.meta_lang = good(article.meta_lang) or np_article.meta_lang
    article.pub_date = good(article.pub_date) or np_article.publish_date
def get_article(url, lang):
    """Download and parse the article at *url* using language *lang*,
    returning the parsed newspaper article object."""
    parsed_article = newspaper.build_article(url, lang=lang)
    parsed_article.download()
    parsed_article.parse()
    return parsed_article
def download_and_parse(self):
    """Download the article at self.url, parse it, and copy the extracted
    fields onto this object.

    Fields newspaper could not extract are set to None. Runs the optional
    self._parser hook, then marks the object as parsed.
    """
    article = newspaper.build_article(self.url)
    article.download()
    article.parse()
    self.text = article.text
    self.title = article.title
    self.download_date = time.localtime()
    # Optional parameters: fall back to None when newspaper found nothing.
    self.authors = article.authors if article.authors else None
    self.source_domain = None  # Not sure how to implement right now
    self.category = None
    # Merge keywords and meta_keywords into the same param.
    self.keywords = []
    if article.keywords:
        self.keywords = article.keywords
    if article.meta_keywords:
        self.keywords = list(set(self.keywords + article.meta_keywords))
    if self.keywords == ['']:
        # Bug fix: the original assigned a dead local `keywords = None` here,
        # leaving self.keywords stuck at [''] instead of clearing it.
        self.keywords = None
    self.images = article.images if article.images else None
    self.location = None  # Will implement later
    self.summary = article.summary if article.summary else None
    self.suggested_articles = None  # Not sure how to implement
    self.meta_favicon = article.meta_favicon if article.meta_favicon else None
    self.meta_lang = article.meta_lang if article.meta_lang else None
    # NOTE(review): newspaper3k names this attribute `publish_date`;
    # `published_date` may be version-specific — verify against the installed
    # newspaper version.
    self.pub_date = article.published_date if article.published_date else None
    if self._parser:
        # Optional post-processing hook supplied by the caller.
        self._parser(self)
    self.parsed = True
def get(self):
    """GET handler: fetch, parse, and NLP-process the article at the
    requested URL.

    Request args (via article_parser): 'url', 'format' ('json' or 'text'),
    'markdownify', 'include_summary', and optionally 'redirect'
    ('nvalt' or 'notsey').  Returns either a JSON payload of extracted
    fields, a plain-text document, or a redirect to a note-taking app's
    x-callback URL.  Python 2 code (uses `unicode`).
    """
    args = article_parser.parse_args()
    url = args['url']
    output_format = args['format']
    article = newspaper.build_article(url, config)
    article.download()
    article.parse()
    article.nlp()
    markdownify = bool(args['markdownify'])
    text = article.text
    if markdownify:
        # Convert the article's extracted HTML to markdown instead of using
        # newspaper's plain-text extraction.
        h = HTML2Text(baseurl=args['url'])
        h.body_width = 0  # disable hard line wrapping
        text = h.handle(article.article_html)
    # All extracted fields collected for the JSON response.
    data = {
        'url': article.url,
        'title': article.title,
        'top_image': article.top_img,
        'images': [x for x in article.imgs],
        'text': text,
        'html': article.article_html,
        'keywords': article.keywords,
        'authors': article.authors,
        'summary': article.summary,
        'meta_description': article.meta_description,
        'meta_lang': article.meta_lang,
        'meta_favicon': article.meta_favicon,
        'meta_keywords': article.meta_keywords,
        'canonical_link': article.canonical_link,
        'tags': [unicode(x) for x in article.tags],
        'movies': article.movies,
        'additional_data': article.additional_data,
    }
    if output_format == 'json':
        return output_json(data, 200, {})
    if output_format == 'text':
        # Build a YAML-front-matter-style text document.
        output = u'---\n'
        output += u'link: %s\n' % (article.url)
        output += u'title: %s\n' % (article.title)
        output += u'authors: %s\n' % (u', '.join(article.authors))
        output += u'keywords: %s\n' % (u', '.join(article.keywords))
        output += u'---\n\n'
        if args['include_summary']:
            output += u'# Summary\n\n%s\n' % (article.summary)
        output += text
        r = args.get('redirect')
        if r and r in ['nvalt', 'notsey']:
            # Hand the document off to a note-taking app via its
            # x-callback URL scheme; payload must be UTF-8 bytes.
            title = u'%s - %s' % (article.title, u', '.join(article.authors))
            title = title.encode('utf-8')
            output = output.encode('utf-8')
            if r == 'nvalt':
                opts = {
                    'txt': output,
                    'title': title,
                }
                # URL-encode each value into a query string.
                opts = '&'.join(['%s=%s' % (key, quote(val)) for key, val in opts.items()])
                url = 'nvalt://make/?' + opts
            if r == 'notsey':
                opts = {
                    'text': output,
                    'name': title,
                }
                opts = '&'.join(['%s=%s' % (key, quote(val)) for key, val in opts.items()])
                url = 'notesy://x-callback-url/append?' + opts
            return make_response(redirect(url))
        return output_text(output, 200, {'Content-Type': 'text'})
def get(self):
    """GET handler: fetch, parse, and NLP-process the article at the
    requested URL, with a 404 on empty downloads.

    Request args (via article_parser): 'url', 'format' ('json' or 'text'),
    'markdownify', 'include_summary', 'og_image', and optionally
    'redirect' ('nvalt' or 'notsey').  Returns a JSON payload, a
    plain-text document, or a redirect to a note-taking app's x-callback
    URL.  Python 2 code (`print` statement, `unicode`).
    """
    args = article_parser.parse_args()
    url = args['url']
    output_format = args['format']
    article = newspaper.build_article(url, config)
    article.download()
    if article.html == "":
        # Download produced nothing; surface as HTTP 404.
        abort(404)
    else:
        article.parse()
    # Force title/text to ASCII, dropping unencodable characters,
    # presumably to avoid downstream encoding errors — TODO confirm need.
    if isinstance(article.title, unicode):
        print "Rencoding article title"
        article.set_title(article.title.encode('ascii', 'ignore'))
    if isinstance(article.text, unicode):
        print "Rencoding article text"
        article.set_text(article.text.encode('ascii', 'ignore'))
    article.nlp()
    markdownify = bool(args['markdownify'])
    text = article.text
    if markdownify:
        # Convert the article's extracted HTML to markdown instead of using
        # newspaper's plain-text extraction.
        h = HTML2Text(baseurl=args['url'])
        h.body_width = 0  # disable hard line wrapping
        text = h.handle(article.article_html)
    # Extracted fields for the JSON response; some fields are deliberately
    # disabled here (commented out) relative to the other handler variant.
    data = {
        'url': article.url,
        'title': article.title,
        'top_image': article.top_img,
        # 'images': [x for x in article.imgs],
        'text': text,
        # 'html': article.article_html,
        'keywords': article.keywords,
        'authors': article.authors,
        'summary': article.summary,
        'meta_description': article.meta_description,
        'meta_lang': article.meta_lang,
        # 'meta_favicon': article.meta_favicon,
        'meta_keywords': article.meta_keywords,
        'canonical_link': article.canonical_link,
        'tags': [unicode(x) for x in article.tags],
        'movies': article.movies,
        'additional_data': article.additional_data,
    }
    if args['og_image']:
        # Pull the OpenGraph image straight out of the parsed DOM.
        og = article.doc.xpath("//meta[@property='og:image']/@content")
        if len(og) > 0:
            data['opengraph_image'] = og[0]
    if output_format == 'json':
        return output_json(data, 200, { 'Content-Type' : 'application/json' })
    if output_format == 'text':
        # Build a YAML-front-matter-style text document.
        output = u'---\n'
        output += u'link: %s\n' % (article.url)
        output += u'title: %s\n' % (article.title)
        output += u'authors: %s\n' % (u', '.join(article.authors))
        output += u'keywords: %s\n' % (u', '.join(article.keywords))
        output += u'---\n\n'
        if args['include_summary']:
            output += u'# Summary\n\n%s\n' % (article.summary)
        output += text
        r = args.get('redirect')
        if r and r in ['nvalt', 'notsey']:
            # Hand the document off to a note-taking app via its
            # x-callback URL scheme; payload must be UTF-8 bytes.
            title = u'%s - %s' % (article.title, u', '.join(article.authors))
            title = title.encode('utf-8')
            output = output.encode('utf-8')
            if r == 'nvalt':
                opts = {
                    'txt': output,
                    'title': title,
                }
                # URL-encode each value into a query string.
                opts = '&'.join(['%s=%s' % (key, quote(val)) for key, val in opts.items()])
                url = 'nvalt://make/?' + opts
            if r == 'notsey':
                opts = {
                    'text': output,
                    'name': title,
                }
                opts = '&'.join(['%s=%s' % (key, quote(val)) for key, val in opts.items()])
                url = 'notesy://x-callback-url/append?' + opts
            return make_response(redirect(url))
        return output_text(output, 200, {'Content-Type': 'text'})