def get_search_result(soup):
    """Extract articles from a parsed BBC search-results page.

    Expects `soup` to contain an <ol class="search-results"> whose <li>
    children each wrap one result. Returns a list of NewsArticle objects
    with title, url, date (raw string), and source set.
    """
    result_items = soup.find("ol", {"class": "search-results"}).find_all("li")
    articles = []
    for item in result_items:
        container = item.find("div")
        headline = container.find("h1", {"itemprop": "headline"})
        # date under tags: footer -> dl -> dd -> time
        time_tag = (
            container.find("footer")
            .find("dl", {"class": "flags btm"})
            .find("dd")
            .find("time", {"class": "display-date"})
        )
        link = headline.find("a")
        article = NewsArticle()
        article.title = link.string.strip()
        article.url = link['href']
        # TODO: put date in correct format
        article.date = time_tag.string.strip()
        article.source = "BBC"
        articles.append(article)
    return articles
def retrieve_homepage_articles(soup):
    """Collect every story link from a parsed HackerNews front page.

    Each <a class="storylink"> anchor becomes one NewsArticle with
    title, url, and source populated.
    """
    anchors = soup.find_all("a", {"class": "storylink"})
    collected = []
    for anchor in anchors:
        entry = NewsArticle()
        entry.title = anchor.string.strip()
        entry.url = anchor['href']
        entry.source = "HackerNews"
        collected.append(entry)
    return collected
def retrieve_homepage_articles(soup, max_articles=20):
    """Collect up to `max_articles` headline articles from a parsed BBC homepage.

    Each <a class="block-link__overlay-link"> anchor becomes one
    NewsArticle with title, url, and source populated.

    BUG FIX: the original indexed `headlines[i]` for i in range(20),
    which raised IndexError whenever the page yielded fewer than 20
    headlines. A slice is safe for any result count, and the hard-coded
    20 is now a backward-compatible parameter.
    """
    headlines = soup.find_all("a", {"class": "block-link__overlay-link"})
    articles = []
    # Slicing never raises, even when fewer than max_articles anchors exist.
    for result in headlines[:max_articles]:
        news_article = NewsArticle()
        news_article.title = result.string.strip()
        news_article.url = result['href']
        news_article.source = "BBC"
        articles.append(news_article)
    return articles
def convert_to_class(item):
    """Rebuild a NewsArticle from its stored dict representation.

    The three date fields are parsed back into date objects via
    ExtractedInformationStorage.datestring_to_date; every other field is
    copied across verbatim. Raises KeyError if `item` lacks a field.
    """
    news_article = NewsArticle()
    news_article.authors = item['authors']
    # Date strings come back as real date objects.
    for date_field in ('date_download', 'date_modify', 'date_publish'):
        setattr(news_article, date_field,
                ExtractedInformationStorage.datestring_to_date(item[date_field]))
    # Remaining fields transfer unchanged, in the original storage order.
    for field in ('description', 'filename', 'image_url', 'language',
                  'localpath', 'title', 'title_page', 'title_rss',
                  'source_domain', 'text', 'url'):
        setattr(news_article, field, item[field])
    return news_article
def convert_to_class(item):
    """Hydrate a NewsArticle from a storage dict.

    Date fields pass through ExtractedInformationStorage.datestring_to_date;
    all other fields are assigned as-is. Raises KeyError on a missing field.
    """
    article = NewsArticle()
    parse_date = ExtractedInformationStorage.datestring_to_date
    article.authors = item['authors']
    article.date_download = parse_date(item['date_download'])
    article.date_modify = parse_date(item['date_modify'])
    article.date_publish = parse_date(item['date_publish'])
    article.description = item['description']
    article.filename = item['filename']
    article.image_url = item['image_url']
    article.language = item['language']
    article.localpath = item['localpath']
    article.title = item['title']
    article.title_page = item['title_page']
    article.title_rss = item['title_rss']
    article.source_domain = item['source_domain']
    article.text = item['text']
    article.url = item['url']
    return article
def get_headlines(num_headlines=None, browser=None):
    """Scrape headline articles from the CNN homepage.

    Fetches https://www.cnn.com/ via get_url_soup and builds one
    NewsArticle per <h3 class="cd__headline">, stopping after
    `num_headlines` items when that limit is given (None = no limit).
    Returns the list of NewsArticle objects.

    BUG FIX: the original computed each article's absolute URL and
    appended it to a local `urls` list that was never returned or used —
    the URLs were silently discarded. The URL is now stored on the
    article itself (`article.url`), matching the other scrapers in this
    file; the signature and return value are unchanged.
    """
    base_url = "https://www.cnn.com/"
    soup = get_url_soup(base_url, browser=browser)
    headlines = []
    for count, h3_soup in enumerate(
            soup.find_all("h3", {"class": "cd__headline"}), start=1):
        headline = h3_soup.find("span", {"class": "cd__headline-text"}).get_text()
        article = NewsArticle()
        article.title = headline
        print(article.title)
        link = h3_soup.find("a")["href"]
        # Relative links get the site prefix so every stored URL is absolute.
        if "https://www.cnn.com" not in link:
            link = "https://www.cnn.com" + link
        article.url = link
        headlines.append(article)
        if num_headlines is not None and count >= num_headlines:
            break
    return headlines