def download_article(url): """ Download the html content of a news page :param url: news page's url :type url: string :return: news page's content :rtype: requests.models.Response """ article = { 'link': url, 'source': 'crawler_estadao' } logger.info("Downloading article: {0}".format(url)) try: response = requests.get(url, timeout=30) except Exception as ex: logger.exception("Failed to fetch {0}".format(url)) return None extractor = Goose({'use_meta_language': False, 'target_language':'pt'}) news = extractor.extract(url=url) soup = BeautifulSoup(response.text) article['link_content'] = compress_content(response.text) article['compressed'] = True article['language'] = detect_language(response.text) article['title'] = extract_title(news) article['body_content'] = extract_content(news) article['published_time'] = extract_published_time(url, soup) return article
def download_article(url):
    """
    Download the HTML content of a news page.

    :param url: news page's url
    :type url: string
    :return: the extracted article data, or None if the fetch failed
    :rtype: dict or None
    """
    article = {'link': url, 'source': 'crawler_oglobo'}
    logger.info("Downloading article: {0}".format(url))
    try:
        response = requests.get(url, timeout=30)
    except Exception as ex:
        logger.exception("Failed to fetch {0}. Exception: {1}".format(url, ex))
        return None

    extractor = Goose({'use_meta_language': False, 'target_language': 'pt'})
    news = extractor.extract(url=url)
    soup = BeautifulSoup(response.text, 'html.parser')

    article['link_content'] = compress_content(response.text)
    article['compressed'] = True
    article['language'] = detect_language(response.text)
    article['title'] = extract_title(news)
    article['published_time'] = extract_published_time(soup)
    article['body_content'] = extract_content(news)
    return article
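# A hedged usage sketch: how a crawl loop might drive one of these
# downloaders. The seed URL and store_article are hypothetical stand-ins for
# the project's real URL frontier and persistence layer.
if __name__ == '__main__':
    seed_urls = ['https://oglobo.globo.com/exemplo']  # hypothetical seed URL
    for url in seed_urls:
        article = download_article(url)
        if article is None:  # None signals a failed fetch
            continue
        store_article(article)  # hypothetical persistence helper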
def download_article(url): """ Download the html content of a news page :param url: news page's url :type url: string :return: news page's content :rtype: requests.models.Response """ article = {"link": url, "source": "crawler_folha_sao_paulo"} logger.info("Downloading article: {0}".format(url)) try: response = requests.get(url, timeout=30) except Exception as ex: logger.exception("Failed to fetch {0}".format(url)) return None extractor = Goose({"use_meta_language": False, "target_language": "pt"}) news = extractor.extract(url=url) soup = BeautifulSoup(response.content) article["link_content"] = compress_content(response.text) article["compressed"] = True article["language"] = detect_language(response.text) article["title"] = extract_title(news) article["category"] = extract_category(url) article["published_time"] = extract_published_time(soup) content = extract_content(news, soup) if len(content) is 2: article["link"], article["body_content"] = content else: article["body_content"] = content return article