Example #1
    def scrapeArticle(self, url):
        # Assumes: from newspaper import Article
        article = Article(url)
        article.download()
        article.parse()

        # Keep only the first detected author, or an empty string if none.
        author = article.authors[0] if article.authors else ""

        return {
            "title": article.title[:500],  # truncate overly long titles
            "content": article.text,
            "author": author
        }
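For context, the same download/parse flow works as a standalone script; a minimal sketch (the URL is a placeholder):

from newspaper import Article

article = Article("https://example.com/some-article")  # placeholder URL
article.download()   # fetch the HTML
article.parse()      # extract title, text, authors, etc.

print(article.title)
print(article.authors)  # a list of detected author names (possibly empty)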
Example #2
from faker import Faker
from newspaper import Article

def test_convert_to_dict_most_fields_works():
    faker = Faker()

    source = Article(url=faker.url())

    # Populate a subset of the fields with fake data.
    source.authors = [faker.name(), faker.name()]
    source.top_image = faker.image_url()
    source.article_html = faker.text()
    source.images = [faker.image_url(), faker.image_url()]
    source.meta_data = [faker.city(), faker.state(), faker.country()]

    # "extractor" is the project module under test.
    result = extractor.to_dict(source, "article_html", "authors", "images",
                               "keywords", "meta_data", "source_url",
                               "summary", "top_image", "url", "tags",
                               "meta_favicon")

    assert result
    assert len(result) == 7

    assert "article_html" in result
    assert "authors" in result
    assert "images" in result
    assert "keywords" not in result
    assert "meta_data" in result
    assert "source_url" in result
    assert "summary" not in result
    assert "top_image" in result
    assert "url" in result
    assert "tags" not in result
    assert "meta_favicon" not in result
Example #3

import copy

from newspaper import Article

def process_html(url, html):
    # "log" is the snippet's own logging helper.
    log(f'Processing {url}')
    article = Article(url, KEYWORD_COUNT=25)
    # Parse the pre-fetched HTML instead of downloading it again.
    article.download(input_html=html)
    article.parse()
    article.authors = '; '.join(article.authors)
    log(f'Parsed {len(article.text)} characters of text')
    # nlp() populates article.keywords and article.summary.
    article.nlp()
    keywords = copy.deepcopy(article.keywords)
    article.keywords = ', '.join(keywords)
    return article, keywords
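Note that article.nlp() requires NLTK's punkt tokenizer data to be available. A short usage sketch for this function, assuming the HTML is fetched with requests (which the snippet itself does not show; the URL is a placeholder):

import requests

url = "https://example.com/some-article"  # placeholder URL
html = requests.get(url, timeout=10).text
article, keywords = process_html(url, html)
print(keywords)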
Example #4
    def get(self, request):
        url = request.GET.get('url')

        # Assumes: from newspaper import Article
        article = Article(url)
        article.download()
        article.parse()

        # Keep only the first detected author, or an empty string if none.
        author = article.authors[0] if article.authors else ""

        data = {
            "url": url,
            "title": article.title,
            "content": article.text,
            "author": author,
            "date": article.publish_date
        }

        results = ScrapedArticleSerializer(data, many=False).data
        return Response(results)
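ScrapedArticleSerializer is defined elsewhere in that project; a plausible Django REST Framework definition mirroring the dict built in the view might be the following sketch (the field types are assumptions):

from rest_framework import serializers

class ScrapedArticleSerializer(serializers.Serializer):
    # Hypothetical serializer matching the keys of the dict above.
    url = serializers.URLField()
    title = serializers.CharField()
    content = serializers.CharField()
    author = serializers.CharField(allow_blank=True)
    date = serializers.DateTimeField(allow_null=True)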
Example #5
import newspaper
from newspaper import Article

def parse_article(url):
    '''
    Responsible for parsing a single article.
    '''
    article = Article(url)

    print("Downloading data of URL: {}".format(url))

    article.download()

    # Fallback, otherwise the program would exit on the first invalid URL
    try:
        article.parse()
    except newspaper.article.ArticleException:
        print("Oops! The URL '{}' seems inaccessible!".format(url))

        # Mark the failed article with placeholder values.
        article.authors = ['<UNK>']
        article.text = '<UNK>'

    return article
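Because failures are marked with '<UNK>' placeholders rather than raising, callers can map this function over a batch of URLs and filter afterwards; a short usage sketch (the URLs are placeholders):

urls = [
    "https://example.com/a",  # placeholder URLs
    "https://example.com/b",
]

articles = [parse_article(u) for u in urls]
# Drop the placeholder-marked failures before further processing.
good = [a for a in articles if a.text != '<UNK>']
print("Parsed {} of {} articles".format(len(good), len(urls)))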