# Example no. 1
def tvn24():
    """Scrape headline texts from tvn24.pl, keyed by random 8-char ids."""
    collected = dict()
    soup = bs(urllib.request.urlopen("https://www.tvn24.pl/"),
              features="html.parser")

    # Main news: every anchor found inside an <h1>.
    for heading in soup.find_all("h1"):
        for anchor in heading.find_all("a"):
            collected[randomStringDigits(8)] = anchor.get_text()

    # Side news: <h2 class="decorate-heading"> whose link text is long enough.
    for heading in soup.find_all("h2"):
        if not heading.has_attr('class'):
            continue
        link = heading.find("a")
        if link is None:
            continue
        if heading['class'][0] == "decorate-heading" and len(str(link.string)) > 30:
            collected[randomStringDigits(8)] = str(link.get_text()).strip()
    return collected
# Example no. 2
def tvpinfo():
    """Scrape headline texts from tvp.info, keyed by random 8-char ids.

    Returns:
        dict: random 8-character keys -> stripped headline strings.
    """
    tHeadlines = dict()
    tvp = "https://www.tvp.info/"
    page = urllib.request.urlopen(tvp)
    soup = bs(page, features="html.parser")

    def _collect(tag, css_class):
        # All five sections followed the same copy-pasted loop; the helper
        # removes the duplication without changing what gets collected.
        for headline in soup.find_all(tag, class_=css_class):
            tHeadlines[randomStringDigits(8)] = str(headline.string).strip()

    _collect("h1", "title")               # main news
    _collect("h2", "news__title")         # major news
    _collect("h3", "news__title")         # minor news
    _collect("h3", "information__text")   # info
    _collect("h3", "business__subtitle")  # business
    return tHeadlines
def wpolityce():
    """Scrape long-title headlines from wpolityce.pl, keyed by random ids."""
    collected = dict()
    soup = bs(urllib.request.urlopen("https://wpolityce.pl"),
              features="html.parser")
    for span in soup.find_all("span", class_="long-title"):
        collected[randomStringDigits(8)] = str(span.string)
    return collected
# Example no. 4
def onet():
    """Scrape <h3> headlines from wiadomosci.onet.pl, keyed by random ids."""
    collected = dict()
    soup = bs(urllib.request.urlopen("https://wiadomosci.onet.pl/"),
              features="html.parser")
    for heading in soup.find_all("h3"):
        collected[randomStringDigits(8)] = str(heading.string)
    return collected
# Example no. 5
def rmfswiat():
    """Scrape world-news headlines from rmf24.pl (read from image alt texts).

    Returns:
        dict: random 8-character keys -> cleaned headline strings.
    """
    rsHeadlines = dict()
    page = urllib.request.urlopen("https://www.rmf24.pl/fakty/swiat")
    soup = bs(page, features="html.parser")
    for box in soup.find_all("div", class_="boxBody"):
        for img in box.find_all("img"):
            # Bug fix: img['alt'] raised KeyError for images without an
            # alt attribute — skip those images instead of crashing.
            alt = img.get('alt')
            if alt is None:
                continue
            # Strip whitespace and zero-width spaces present in the alts.
            rsHeadlines[randomStringDigits(8)] = str(alt).strip().replace(u'\u200b', '')
    return rsHeadlines
# Example no. 6
def wp():
    """Scrape headlines from the wp.pl front page, keyed by random ids."""
    collected = dict()
    soup = bs(urllib.request.urlopen("https://www.wp.pl/"),
              features="html.parser")

    # News type 1 — headlines live under this generated CSS class pair.
    for tile in soup.find_all("div", class_="sc-1bp8799-1 gqsna"):
        collected[randomStringDigits(8)] = str(tile.get_text())

    # News type 2 — second generated class pair; text needs stripping.
    for tile in soup.find_all("div", class_="lclzf3-0 egPcYF"):
        collected[randomStringDigits(8)] = str(tile.get_text()).strip()
    return collected
# Example no. 7
def polityka():
    """Scrape non-empty <h3> side headlines from polityka.pl."""
    collected = dict()
    soup = bs(urllib.request.urlopen("https://www.polityka.pl/TygodnikPolityka"),
              features="html.parser")
    for heading in soup.find_all("h3"):
        text = heading.string
        if text is None:
            continue
        collected[randomStringDigits(8)] = str(text).strip()
    return collected
def rmfpolska():
    """Scrape Poland-news headlines from rmf24.pl (read from image alt texts).

    Returns:
        dict: random 8-character keys -> cleaned headline strings.
    """
    rpHeadlines = dict()
    page = urllib.request.urlopen("https://www.rmf24.pl/fakty/polska")
    soup = bs(page, features="html.parser")
    for box in soup.find_all("div", class_="boxBody"):
        for img in box.find_all("img"):
            # Bug fix: img['alt'] raised KeyError for images without an
            # alt attribute — skip those images instead of crashing.
            alt = img.get('alt')
            if alt is None:
                continue
            # Also drop zero-width spaces, matching rmfswiat()'s cleanup.
            rpHeadlines[randomStringDigits(8)] = str(alt).strip().replace(u'\u200b', '')
    return rpHeadlines
# Example no. 9
def newsweek():
    """Scrape article titles (h2.artTitle) from newsweek.pl."""
    collected = dict()
    page = urllib.request.urlopen("https://www.newsweek.pl/")
    soup = bs(page, features="html.parser")
    for heading in soup.find_all("h2", class_="artTitle"):
        if heading.string is not None:
            collected[randomStringDigits(8)] = str(heading.string)
    return collected
# Example no. 10
def fronda():
    """Scrape <h4> headlines from fronda.pl (requires a browser User-Agent)."""
    collected = dict()
    # The site rejects the default urllib user agent, so spoof a browser.
    req = Request("http://www.fronda.pl/c/wiadomosci,1.html",
                  headers={'User-Agent': 'Mozilla/5.0'})
    soup = bs(urlopen(req), features="html.parser")
    for heading in soup.find_all("h4"):
        collected[randomStringDigits(8)] = str(heading.string)
    return collected
# Example no. 11
def nczas():
    """Scrape module titles from nczas.com (requires a browser User-Agent)."""
    collected = dict()
    # The site rejects the default urllib user agent, so spoof a browser.
    req = Request("https://nczas.com/", headers={'User-Agent': 'Mozilla/5.0'})
    soup = bs(urlopen(req), features="html.parser")
    for heading in soup.find_all("h3", class_="entry-title td-module-title"):
        for anchor in heading.find_all("a"):
            collected[randomStringDigits(8)] = str(anchor.string)
    return collected
# Example no. 12
def gazeta():
    """Scrape entry-link headlines from wiadomosci.gazeta.pl."""
    collected = dict()
    page = urllib.request.urlopen("http://wiadomosci.gazeta.pl/wiadomosci/0,0.html")
    soup = bs(page, features="html.parser")
    for entry in soup.find_all("li", class_="entry"):
        for anchor in entry.find_all("a"):
            if anchor.string is None:
                continue
            collected[randomStringDigits(8)] = str(anchor.string)
    return collected
# Example no. 13
def wprost():
    """Scrape long <span> texts (over 30 chars) from wprost.pl."""
    collected = dict()
    soup = bs(urllib.request.urlopen("https://www.wprost.pl/wiadomosci"),
              features="html.parser")
    for span in soup.find_all("span"):
        text = span.string
        if text is not None and len(str(text)) > 30:
            # Replace non-breaking spaces with regular spaces.
            collected[randomStringDigits(8)] = str(text).replace(u'\xa0', ' ')
    return collected


# print(wprost())
# Example no. 14
def interia():
    """Scrape magazine-tile headlines from fakty.interia.pl.

    Returns:
        dict: random 8-character keys -> headline strings with zero-width
        spaces removed.
    """
    iHeadlines = dict()
    page = urllib.request.urlopen("https://fakty.interia.pl/")
    soup = bs(page, features="html.parser")
    for li in soup.find_all("li"):
        for magazine in li.find_all("div", class_="tile-magazine"):
            for header in magazine.find_all("div", class_="tile-magazine-header"):
                for title in header.find_all("h2", class_="tile-magazine-title"):
                    link = title.find("a")
                    # Bug fix: title.find("a") can return None, which
                    # crashed on .string — skip titles without a link.
                    if link is None:
                        continue
                    iHeadlines[randomStringDigits(8)] = \
                        str(link.string).replace(u'\u200b', '')
    return iHeadlines