Python remove_duplicate_entries Exemples, newscrape_common.remove_duplicate_entries Python Exemples

Exemple #1

0

Afficher le fichier

def get_trending_headlines(url):
    """Scrape the headline links from the page at ``url``.

    Fetches the page, drops every block that contains a
    ``span.video_icon_ss`` marker, then collects the anchors found under
    ``div#left`` -> ``div.flex-box`` and passes them through
    ``get_headline_details`` and ``remove_duplicate_entries`` (both defined
    elsewhere; presumably the latter de-duplicates on the "link" key --
    confirm against its definition).

    Args:
        url: Address of the page to scrape.

    Returns:
        Whatever ``remove_duplicate_entries`` returns for the collected
        anchors, or ``None`` when the response status is not 200 or the
        expected container elements are missing from the page.
    """
    # NOTE(review): no timeout is passed, so this request can block
    # indefinitely on an unresponsive server -- consider adding one.
    response = requests.get(url)
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.text, "html.parser")

    for tag in soup.find_all("span", {"class": "video_icon_ss"}):
        # Remove the marker's grandparent element; skip markers that have
        # no grandparent instead of failing on ``None``.
        parent = tag.parent
        grandparent = parent.parent if parent is not None else None
        if grandparent is not None:
            grandparent.decompose()

    # ``find`` returns None when nothing matches, so each level of the
    # lookup is checked before descending further.
    left_column = soup.find("div", id="left")
    if left_column is None:
        return None
    flex_box = left_column.find("div", {"class": "flex-box"})
    if flex_box is None:
        return None

    a_tags = flex_box.find_all("a")
    return remove_duplicate_entries(map(get_headline_details, a_tags),
                                    "link")

Exemple #2

0

Afficher le fichier

def get_trending_headlines(url):
    """Scrape the headline links from the page at ``url``.

    Fetches the page, removes the ``div.opinion_opt`` block when present,
    then collects the ``a.item-title`` anchors under ``div.hmpage_lhs`` and
    passes them through ``get_headline_details`` and
    ``remove_duplicate_entries`` (both defined elsewhere; presumably the
    latter de-duplicates on the "link" key -- confirm against its
    definition).

    Args:
        url: Address of the page to scrape.

    Returns:
        Whatever ``remove_duplicate_entries`` returns for the collected
        anchors, or ``None`` when the response status is not 200 or the
        ``div.hmpage_lhs`` container is missing from the page.
    """
    # NOTE(review): no timeout is passed, so this request can block
    # indefinitely on an unresponsive server -- consider adding one.
    response = requests.get(url)
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.text, "html.parser")

    # ``find`` returns None when nothing matches; a page without this block
    # simply has nothing to strip.
    opinion_block = soup.find("div", {"class": "opinion_opt"})
    if opinion_block is not None:
        opinion_block.decompose()

    # Some anchor tags in div[class="lhs_col_two"] are not parsed by the following
    container = soup.find("div", {"class": "hmpage_lhs"})
    if container is None:
        return None

    a_tags = container.find_all("a", {"class": "item-title"})
    return remove_duplicate_entries(map(get_headline_details, a_tags),
                                    "link")

Exemple #3

0

Afficher le fichier

Fichier : hindustan-times.py Projet : RomitKumar/Newscrape

def get_trending_headlines(url):
    """Scrape the headline links from the page at ``url``.

    Fetches the page, strips three unwanted blocks when they are present,
    then collects every anchor under the
    ``div.news-area.newtop-block.mb-5.mt-10`` container and passes them
    through ``get_headline_details`` and ``remove_duplicate_entries`` (both
    defined elsewhere; presumably the latter de-duplicates on the "link" and
    "title" keys -- confirm against its definition).

    Args:
        url: Address of the page to scrape.

    Returns:
        Whatever ``remove_duplicate_entries`` returns for the collected
        anchors, or ``None`` when the response status is not 200 or the
        headline container is missing from the page.
    """
    # NOTE(review): no timeout is passed, so this request can block
    # indefinitely on an unresponsive server -- consider adding one.
    response = requests.get(url)
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.text, "html.parser")

    # Blocks to strip, in the original order. The last one targets
    # sponsored content and is not guaranteed to match on every page.
    unwanted_classes = (
        "latestnews-left",
        "advertisement-250",
        "top-thumb mt-20",
    )
    for css_class in unwanted_classes:
        # ``find`` returns None when nothing matches; an absent block just
        # means there is nothing to remove.
        block = soup.find("div", {"class": css_class})
        if block is not None:
            block.decompose()

    container = soup.find("div", {
        "class": "news-area newtop-block mb-5 mt-10"
    })
    if container is None:
        return None

    a_tags = container.find_all("a")
    return remove_duplicate_entries(map(get_headline_details, a_tags),
                                    "link", "title")