def tvn24():
    """Scrape headline texts from tvn24.pl.

    Returns a dict mapping random 8-character keys to headline strings.
    """
    headlines = dict()
    page = urllib.request.urlopen("https://www.tvn24.pl/")
    soup = bs(page, features="html.parser")
    # Main news: every <a> inside an <h1>.
    for h1 in soup.find_all("h1"):
        for anchor in h1.find_all("a"):
            headlines[randomStringDigits(8)] = anchor.get_text()
    # Side news: <h2 class="decorate-heading"> whose link text is long
    # enough (> 30 chars) to look like a real headline.
    for h2 in soup.find_all("h2"):
        if not h2.has_attr('class'):
            continue
        anchor = h2.find("a")
        if anchor is None:
            continue
        if h2['class'][0] == "decorate-heading" and len(str(anchor.string)) > 30:
            headlines[randomStringDigits(8)] = str(anchor.get_text()).strip()
    return headlines
def tvpinfo():
    """Scrape headline texts from tvp.info.

    Collects main (h1.title), major (h2.news__title), minor
    (h3.news__title), information (h3.information__text) and business
    (h3.business__subtitle) headlines.

    Returns:
        dict: random 8-character keys mapped to stripped headline strings.
    """
    tHeadlines = dict()
    page = urllib.request.urlopen("https://www.tvp.info/")
    soup = bs(page, features="html.parser")

    def _collect(tag, cls):
        # Shared collector: the original repeated this loop five times.
        for headline in soup.find_all(tag, class_=cls):
            # Fix: .string is None for tags with nested markup; the old
            # code stored the literal text "None" in that case.
            if headline.string is not None:
                tHeadlines[randomStringDigits(8)] = str(headline.string).strip()

    _collect("h1", "title")               # main news
    _collect("h2", "news__title")         # major news
    _collect("h3", "news__title")         # minor news
    _collect("h3", "information__text")   # info strip
    _collect("h3", "business__subtitle")  # business section
    return tHeadlines
def wpolityce():
    """Scrape headlines (span.long-title) from wpolityce.pl.

    Returns:
        dict: random 8-character keys mapped to headline strings.
    """
    wpolHeadlines = dict()
    page = urllib.request.urlopen("https://wpolityce.pl")
    soup = bs(page, features="html.parser")
    for headline in soup.find_all("span", class_="long-title"):
        # Fix: skip tags whose .string is None (nested markup) instead of
        # storing the literal text "None", matching the sibling scrapers.
        if headline.string is not None:
            wpolHeadlines[randomStringDigits(8)] = str(headline.string)
    return wpolHeadlines
def onet():
    """Scrape headlines (h3) from wiadomosci.onet.pl.

    Returns:
        dict: random 8-character keys mapped to headline strings.
    """
    oHeadlines = dict()
    page = urllib.request.urlopen("https://wiadomosci.onet.pl/")
    soup = bs(page, features="html.parser")
    for headline in soup.find_all("h3"):
        # Fix: skip tags whose .string is None (nested markup) instead of
        # storing the literal text "None", matching the sibling scrapers.
        if headline.string is not None:
            oHeadlines[randomStringDigits(8)] = str(headline.string)
    return oHeadlines
def rmfswiat():
    """Scrape world-news headlines from rmf24.pl (taken from image alt texts).

    Returns:
        dict: random 8-character keys mapped to cleaned headline strings.
    """
    rsHeadlines = dict()
    page = urllib.request.urlopen("https://www.rmf24.pl/fakty/swiat")
    soup = bs(page, features="html.parser")
    for box in soup.find_all("div", class_="boxBody"):
        for img in box.find_all("img"):
            # Fix: img['alt'] raised KeyError for images without an alt
            # attribute; skip those instead of crashing.
            alt = img.get("alt")
            if alt is None:
                continue
            # Strip zero-width spaces the site embeds in alt texts.
            rsHeadlines[randomStringDigits(8)] = str(alt).strip().replace(u'\u200b', '')
    return rsHeadlines
def wp():
    """Scrape headlines from wp.pl.

    Returns a dict mapping random 8-character keys to headline strings.
    """
    results = dict()
    page = urllib.request.urlopen("https://www.wp.pl/")
    soup = bs(page, features="html.parser")
    # News type 1: the site uses obfuscated CSS-module class names,
    # hence the odd-looking selectors below.
    for node in soup.find_all("div", class_="sc-1bp8799-1 gqsna"):
        results[randomStringDigits(8)] = str(node.get_text())
    # News type 2.
    for node in soup.find_all("div", class_="lclzf3-0 egPcYF"):
        results[randomStringDigits(8)] = str(node.get_text()).strip()
    return results
def polityka():
    """Scrape side headlines (h3) from polityka.pl.

    Returns a dict mapping random 8-character keys to stripped headlines.
    """
    collected = dict()
    page = urllib.request.urlopen("https://www.polityka.pl/TygodnikPolityka")
    soup = bs(page, features="html.parser")
    for h3 in soup.find_all("h3"):
        text = h3.string
        # .string is None for tags with nested markup — skip those.
        if text is None:
            continue
        collected[randomStringDigits(8)] = str(text).strip()
    return collected
def rmfpolska():
    """Scrape Poland-news headlines from rmf24.pl (taken from image alt texts).

    Returns:
        dict: random 8-character keys mapped to cleaned headline strings.
    """
    rpHeadlines = dict()
    page = urllib.request.urlopen("https://www.rmf24.pl/fakty/polska")
    soup = bs(page, features="html.parser")
    for box in soup.find_all("div", class_="boxBody"):
        for img in box.find_all("img"):
            # Fix: img['alt'] raised KeyError for images without an alt
            # attribute; skip those instead of crashing.
            alt = img.get("alt")
            if alt is None:
                continue
            # Strip zero-width spaces, matching the sibling rmfswiat() scraper.
            rpHeadlines[randomStringDigits(8)] = str(alt).strip().replace(u'\u200b', '')
    return rpHeadlines
def newsweek():
    """Scrape headlines (h2.artTitle) from newsweek.pl.

    Returns a dict mapping random 8-character keys to headline strings.
    """
    found = dict()
    page = urllib.request.urlopen("https://www.newsweek.pl/")
    soup = bs(page, features="html.parser")
    for article in soup.find_all("h2", class_="artTitle"):
        # .string is None for tags with nested markup — skip those.
        if article.string is None:
            continue
        found[randomStringDigits(8)] = str(article.string)
    return found
def fronda():
    """Scrape headlines (h4) from fronda.pl.

    Sends a browser-like User-Agent header because the site rejects the
    default urllib one.

    Returns:
        dict: random 8-character keys mapped to headline strings.
    """
    fHeadlines = dict()
    hdr = {'User-Agent': 'Mozilla/5.0'}
    req = Request("http://www.fronda.pl/c/wiadomosci,1.html", headers=hdr)
    page = urlopen(req)
    soup = bs(page, features="html.parser")
    for headline in soup.find_all("h4"):
        # Fix: skip tags whose .string is None (nested markup) instead of
        # storing the literal text "None", matching the sibling scrapers.
        if headline.string is not None:
            fHeadlines[randomStringDigits(8)] = str(headline.string)
    return fHeadlines
def nczas():
    """Scrape headlines from nczas.com.

    Sends a browser-like User-Agent header because the site rejects the
    default urllib one.

    Returns:
        dict: random 8-character keys mapped to headline strings.
    """
    ncHeadlines = dict()
    hdr = {'User-Agent': 'Mozilla/5.0'}
    req = Request("https://nczas.com/", headers=hdr)
    page = urlopen(req)
    soup = bs(page, features="html.parser")
    for headline in soup.find_all("h3", class_="entry-title td-module-title"):
        for title in headline.find_all("a"):
            # Fix: skip anchors whose .string is None (nested markup)
            # instead of storing the literal text "None".
            if title.string is not None:
                ncHeadlines[randomStringDigits(8)] = str(title.string)
    return ncHeadlines
def gazeta():
    """Scrape headlines from the gazeta.pl news listing.

    Returns a dict mapping random 8-character keys to headline strings.
    """
    scraped = dict()
    page = urllib.request.urlopen("http://wiadomosci.gazeta.pl/wiadomosci/0,0.html")
    soup = bs(page, features="html.parser")
    for entry in soup.find_all("li", class_="entry"):
        for link in entry.find_all("a"):
            # .string is None for anchors with nested markup — skip those.
            if link.string is None:
                continue
            scraped[randomStringDigits(8)] = str(link.string)
    return scraped
def wprost():
    """Scrape headlines from wprost.pl.

    Returns a dict mapping random 8-character keys to headline strings.
    """
    wprHeadlines = dict()
    page = urllib.request.urlopen("https://www.wprost.pl/wiadomosci")
    soup = bs(page, features="html.parser")
    for span in soup.find_all("span"):
        text = span.string
        # Headlines carry no distinctive class here, so filter by length:
        # only spans longer than 30 characters are kept.
        if text is None or len(str(text)) <= 30:
            continue
        # Normalize non-breaking spaces to plain spaces.
        wprHeadlines[randomStringDigits(8)] = str(text).replace(u'\xa0', ' ')
    return wprHeadlines
def interia():
    """Scrape headlines from fakty.interia.pl.

    Walks li > div.tile-magazine > div.tile-magazine-header >
    h2.tile-magazine-title and takes the text of the anchor inside.

    Returns:
        dict: random 8-character keys mapped to cleaned headline strings.
    """
    iHeadlines = dict()
    page = urllib.request.urlopen("https://fakty.interia.pl/")
    soup = bs(page, features="html.parser")
    for item in soup.find_all("li"):
        for magazine in item.find_all("div", class_="tile-magazine"):
            for header in magazine.find_all("div", class_="tile-magazine-header"):
                for title in header.find_all("h2", class_="tile-magazine-title"):
                    # Fix: title.find("a") can return None, which made the
                    # original crash with AttributeError on .string; also
                    # skip anchors whose .string is None (nested markup)
                    # rather than storing the literal text "None".
                    anchor = title.find("a")
                    if anchor is None or anchor.string is None:
                        continue
                    # Strip zero-width spaces embedded in the titles.
                    value = str(anchor.string).replace(u'\u200b', '')
                    iHeadlines[randomStringDigits(8)] = value
    return iHeadlines