def download_article(url):
    """ Download the html content of a news page

    :param url: news page's url
    :type url: string
    :return: news page's content
    :rtype: requests.models.Response
    """

    article = { 'link': url, 'source': 'crawler_estadao' }
    logger.info("Downloading article: {0}".format(url))

    try:
        response = requests.get(url, timeout=30)
    except Exception:
        logger.exception("Failed to fetch {0}".format(url))
        return None

    # Goose re-fetches the URL itself; the response from the first request
    # is parsed separately with BeautifulSoup.
    extractor = Goose({'use_meta_language': False, 'target_language': 'pt'})
    news = extractor.extract(url=url)
    soup = BeautifulSoup(response.text, 'html.parser')

    article['link_content'] = compress_content(response.text)
    article['compressed'] = True
    article['language'] = detect_language(response.text)
    article['title'] = extract_title(news)
    article['body_content'] = extract_content(news)
    article['published_time'] = extract_published_time(url, soup)

    return article
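
All of the examples on this page assume a shared module context: a configured logger, the requests, Goose, and BeautifulSoup imports, and storage helpers such as compress_content and detect_language (the site-specific extract_* helpers are omitted). Below is a minimal sketch of that context, assuming zlib-based compression and the langdetect package; neither choice is confirmed by the snippets themselves.

import logging
import zlib

import requests
from bs4 import BeautifulSoup
from goose3 import Goose  # older code bases used `from goose import Goose`
from langdetect import detect  # assumption: any language detector would do

logger = logging.getLogger(__name__)

def compress_content(html):
    """Compress raw HTML before storage (zlib is an assumption here)."""
    return zlib.compress(html.encode('utf-8'))

def detect_language(text):
    """Guess the page language, e.g. 'pt' (langdetect is an assumption)."""
    return detect(text)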
Example no. 2
def download_article(url):
    """Download and extract an O Globo news page; returns a dict or None."""
    article = {'link': url, 'source': 'crawler_oglobo'}
    logger.info("Downloading article: {0}".format(url))

    try:
        response = requests.get(url, timeout=30)
    except Exception as ex:
        logger.exception("Failed to fetch {0}. Exception: {1}".format(url, ex))
        return None

    extractor = Goose({'use_meta_language': False, 'target_language': 'pt'})
    news = extractor.extract(url=url)
    soup = BeautifulSoup(response.text, 'html.parser')

    article['link_content'] = compress_content(response.text)
    article['compressed'] = True
    article['language'] = detect_language(response.text)
    article['title'] = extract_title(news)
    article['published_time'] = extract_published_time(soup)
    article['body_content'] = extract_content(news)

    return article
Example no. 3
def download_article(url):
    """ Download the html content of a news page

    :param url: news page's url
    :type url: string
    :return: news page's content
    :rtype: requests.models.Response
    """

    article = {"link": url, "source": "crawler_folha_sao_paulo"}
    logger.info("Downloading article: {0}".format(url))

    try:
        response = requests.get(url, timeout=30)
    except Exception:
        logger.exception("Failed to fetch {0}".format(url))
        return None

    extractor = Goose({"use_meta_language": False, "target_language": "pt"})
    news = extractor.extract(url=url)
    soup = BeautifulSoup(response.content)

    article["link_content"] = compress_content(response.text)
    article["compressed"] = True
    article["language"] = detect_language(response.text)
    article["title"] = extract_title(news)
    article["category"] = extract_category(url)
    article["published_time"] = extract_published_time(soup)

    content = extract_content(news, soup)

    # extract_content may return a (link, body) pair (e.g. when the page
    # reports a different canonical URL); compare lengths with ==, since
    # `is` against an int literal only works by CPython implementation
    # accident.
    if len(content) == 2:
        article["link"], article["body_content"] = content
    else:
        article["body_content"] = content

    return article
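
For reference, a minimal usage sketch of any of the download_article variants above; the URL is illustrative and not taken from the examples themselves.

if __name__ == '__main__':
    urls = ['https://www1.folha.uol.com.br/some-article']  # hypothetical URL
    for url in urls:
        article = download_article(url)
        if article is None:
            continue  # download failed and was logged; skip this URL
        print(article['title'], article['published_time'])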