from newspaper import Article

def scrapeArticle(self, url):
    article = Article(url)
    article.download()
    article.parse()
    # Keep only the first detected author, or fall back to an empty string
    if article.authors:
        article.authors = article.authors[0]
    else:
        article.authors = ""
    return {
        "title": article.title[:500],  # cap the stored title at 500 characters
        "content": article.text,
        "author": article.authors,
    }
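# The first-author normalization above can be factored into a small helper.
# This is a sketch, not code from the source:
def first_author(authors):
    """Return the first detected author, or an empty string if none were found."""
    return authors[0] if authors else ""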
from faker import Faker
from newspaper import Article

def test_convert_to_dict_most_fields_works():
    # `extractor` is the module under test; its import lives elsewhere in the file.
    faker = Faker()
    source = Article(url=faker.url())
    source.authors = [faker.name(), faker.name()]
    source.top_image = faker.image_url()
    source.article_html = faker.text()
    source.images = [faker.image_url(), faker.image_url()]
    source.meta_data = [faker.city(), faker.state(), faker.country()]

    result = extractor.to_dict(
        source,
        "article_html", "authors", "images", "keywords", "meta_data",
        "source_url", "summary", "top_image", "url", "tags", "meta_favicon",
    )

    assert result
    # Only the seven fields that were actually populated should survive.
    assert len(result) == 7
    assert "article_html" in result
    assert "authors" in result
    assert "images" in result
    assert "keywords" not in result
    assert "meta_data" in result
    assert "source_url" in result
    assert "summary" not in result
    assert "top_image" in result
    assert "url" in result
    assert "tags" not in result
    assert "meta_favicon" not in result
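# One plausible to_dict consistent with the assertions above (a sketch, not the
# real extractor module): include a requested field only when the attribute
# holds a truthy value on the article, so empty defaults such as keywords=[]
# and summary='' are dropped.
def to_dict(article, *fields):
    return {
        f: getattr(article, f)
        for f in fields
        if getattr(article, f, None)
    }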
import copy

from newspaper import Article

def process_html(url, html):
    log(f'Processing {url}')
    article = Article(url, KEYWORD_COUNT=25)
    # Parse the HTML we already fetched instead of downloading it again
    article.download(input_html=html)
    article.parse()
    # Flatten the author list into a single display string
    article.authors = '; '.join(article.authors)
    log(f'Parsed {len(article.text)} characters of natural text')
    article.nlp()
    # Copy the keyword list before replacing the attribute with a display string
    keywords = copy.deepcopy(article.keywords)
    article.keywords = ', '.join(keywords)
    return article, keywords
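# Hedged usage sketch for process_html: fetch the raw HTML yourself (requests
# is a common choice) and hand it to the function. Assumes a log() helper is
# defined elsewhere in the module, as the function above does.
import requests

url = "https://example.com/story"
html = requests.get(url, timeout=10).text
article, keywords = process_html(url, html)
print(article.title, keywords)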
from newspaper import Article
from rest_framework.response import Response

def get(self, request):
    url = request.GET.get('url')
    article = Article(url)
    article.download()
    article.parse()
    # Keep only the first detected author, or fall back to an empty string
    if article.authors:
        article.authors = article.authors[0]
    else:
        article.authors = ""
    data = {
        "url": url,
        "title": article.title,
        "content": article.text,
        "author": article.authors,
        "date": article.publish_date,
    }
    results = ScrapedArticleSerializer(data, many=False).data
    return Response(results)
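# A hypothetical ScrapedArticleSerializer matching the fields in the data dict
# above; the project's real serializer may differ.
from rest_framework import serializers

class ScrapedArticleSerializer(serializers.Serializer):
    url = serializers.URLField()
    title = serializers.CharField()
    content = serializers.CharField(allow_blank=True)
    author = serializers.CharField(allow_blank=True)
    date = serializers.DateTimeField(allow_null=True)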
import newspaper
from newspaper import Article

def parse_article(url):
    '''Responsible for parsing a single article.'''
    article = Article(url)
    print("Downloading data from URL: {}".format(url))
    article.download()
    # Fallback, otherwise the program would exit on the first invalid URL
    try:
        article.parse()
    except newspaper.article.ArticleException:
        print("Oops! The URL '{}' seems inaccessible!".format(url))
        article.authors = ['<UNK>']
        article.text = '<UNK>'
    return article
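# Hedged usage sketch: because parse_article never raises on a bad URL, it can
# be mapped over a list of URLs directly; the '<UNK>' sentinel marks failures.
urls = ["https://example.com/a", "https://example.com/b"]
articles = [parse_article(u) for u in urls]
failed = [a.url for a in articles if a.text == '<UNK>']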