Example #1
def get_search_result(soup):
    """Parse a BBC search-results page into a list of NewsArticle objects."""
    search_result_class_tag = "search-results"
    headlines_class_tag = "headline"
    footer_date_tag = "flags btm"
    date_class_tag = "display-date"

    search_results = soup.find("ol", {
        "class": search_result_class_tag
    }).find_all("li")

    articles = list()

    for result in search_results:
        news_article = NewsArticle()

        result = result.find("div")
        result_headline = result.find("h1", {"itemprop": headlines_class_tag})
        # date under tags: footer -> dl -> dd -> time
        result_date = result.find("footer").find("dl", {
            "class": footer_date_tag
        }).find("dd").find("time", {"class": date_class_tag})

        news_article.title = result_headline.find("a").string.strip()
        news_article.url = result_headline.find("a")['href']
        #TODO: put date in correct format
        news_article.date = result_date.string.strip()
        news_article.source = "BBC"

        articles.append(news_article)

    return articles
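Every example on this page assumes a NewsArticle container class and an already-built BeautifulSoup soup; neither is shown here. The sketch below is an assumption, not the project's real class: a bare attribute container with only the fields Example #1 sets (the real class clearly carries more, as Example #4 shows), driven against a hypothetical BBC search URL fetched with requests.

# Sketch only: stand-in NewsArticle and a hypothetical driver for Example #1.
import requests
from bs4 import BeautifulSoup


class NewsArticle:
    def __init__(self):
        self.title = None
        self.url = None
        self.date = None
        self.source = None


# Hypothetical search URL; the BBC markup these selectors target may have changed.
search_url = "https://www.bbc.co.uk/search?q=climate"
soup = BeautifulSoup(requests.get(search_url).text, "html.parser")

for article in get_search_result(soup):
    print(article.source, article.title, article.url)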
Example #2
def retrieve_homepage_articles(soup):
    """Collect a NewsArticle for every headline link on the Hacker News front page."""
    homepage_headline_class_tag = "storylink"

    headlines = soup.find_all("a", {"class": homepage_headline_class_tag})

    articles = list()

    for result in headlines:
        news_article = NewsArticle()

        news_article.title = result.string.strip()
        news_article.url = result['href']
        news_article.source = "HackerNews"
        articles.append(news_article)
    return articles
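One possible way to drive Example #2, assuming the front page is fetched with plain requests; note that the "storylink" class matches Hacker News's older markup, so the selector may need updating for the current site.

# Sketch only: fetch the Hacker News front page and print the scraped headlines.
import requests
from bs4 import BeautifulSoup

hn_soup = BeautifulSoup(requests.get("https://news.ycombinator.com/").text,
                        "html.parser")
for article in retrieve_homepage_articles(hn_soup):
    print(article.source, article.title, article.url)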
Example #3
def retrieve_homepage_articles(soup):
    """Collect NewsArticle objects for the first 20 headline links on the BBC homepage."""
    homepage_headline_class_tag = "block-link__overlay-link"

    headlines = soup.find_all("a", {"class": homepage_headline_class_tag})

    articles = list()

    # Slice rather than index so that fewer than 20 matches does not raise IndexError.
    for result in headlines[:20]:
        news_article = NewsArticle()

        news_article.title = result.string.strip()
        news_article.url = result['href']
        news_article.source = "BBC"
        articles.append(news_article)
    return articles
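Example #3 is the BBC counterpart of Example #2: the selector changes to "block-link__overlay-link" and the output is capped at the first 20 homepage links rather than taking every match, since the front page exposes far more anchors with that class than a short headline feed needs.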
Example #4
def convert_to_class(item):
    """Rebuild a NewsArticle from a stored dict of extracted article fields."""
    news_article = NewsArticle()
    news_article.authors = item['authors']
    news_article.date_download = ExtractedInformationStorage.datestring_to_date(item['date_download'])
    news_article.date_modify = ExtractedInformationStorage.datestring_to_date(item['date_modify'])
    news_article.date_publish = ExtractedInformationStorage.datestring_to_date(item['date_publish'])
    news_article.description = item['description']
    news_article.filename = item['filename']
    news_article.image_url = item['image_url']
    news_article.language = item['language']
    news_article.localpath = item['localpath']
    news_article.title = item['title']
    news_article.title_page = item['title_page']
    news_article.title_rss = item['title_rss']
    news_article.source_domain = item['source_domain']
    news_article.text = item['text']
    news_article.url = item['url']
    return news_article
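convert_to_class leans on ExtractedInformationStorage.datestring_to_date, which is not reproduced on this page. One plausible shape for that helper is sketched below, purely as an assumption about how the stored date strings might be parsed.

# Sketch only: the real ExtractedInformationStorage.datestring_to_date may use a
# different format string or None-handling.
from datetime import datetime


def datestring_to_date(text):
    # Assumes stored dates look like "2019-04-25 13:37:00"; returns None for empty input.
    return datetime.strptime(text, "%Y-%m-%d %H:%M:%S") if text else None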
Example #6
def get_headlines(num_headlines=None, browser=None):
    """Scrape headline NewsArticle objects from the CNN homepage."""
    url = "https://www.cnn.com/"
    soup = get_url_soup(url, browser=browser)
    headlines = list()
    for h3_soup in soup.find_all("h3", {"class": "cd__headline"}):
        headline = h3_soup.find("span", {
            "class": "cd__headline-text"
        }).get_text()

        article = NewsArticle()
        article.title = headline

        # Headline links are usually site-relative, so prefix the domain when needed.
        article_url = h3_soup.find("a")["href"]
        if "https://www.cnn.com" not in article_url:
            article_url = "https://www.cnn.com" + article_url
        article.url = article_url

        headlines.append(article)
        if num_headlines is not None and len(headlines) >= num_headlines:
            break

    return headlines
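Example #6 relies on a get_url_soup helper that this page does not show. A minimal sketch follows, assuming plain requests is enough when no browser is supplied; the real helper presumably uses the browser argument to render JavaScript-heavy pages.

# Sketch only: a plain-requests stand-in for the get_url_soup helper.
import requests
from bs4 import BeautifulSoup


def get_url_soup(url, browser=None):
    # browser is accepted but unused here; the original likely drives Selenium with it.
    html = requests.get(url, timeout=10).text
    return BeautifulSoup(html, "html.parser")


# Hypothetical usage: grab the first five CNN headlines.
for article in get_headlines(num_headlines=5):
    print(article.title, article.url)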