def get_text(stran, SOURCE_ID):
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    soup = BeautifulSoup(simple_get("http://www.mkgp.gov.si/si/medijsko_sredisce/sporocila_za_javnost/page/" + str(stran)), "html.parser")
    all_links = soup.find("div", {"id": "mainContainer2"}).find_all("a")
    for links in all_links:
        if links.get("href") is None:
            continue
        if re.match(r"http://www.mkgp.gov.si/si/medijsko_sredisce/novica+", links.get("href")):
            soup = BeautifulSoup(simple_get(links.get("href")), "html.parser")
            naslov = soup.find("div", {"class": "article"}).find("h2").text
            datum = soup.find("time").text.split()
            datum = uniformDateStr("".join((datum[0], datum[1], datum[2])))
            vse = soup.find("div", {"class": "article"}).find_all("p")
            vsebina = ""
            for obj in vse:
                vsebina += str(obj.text).strip() + "\n"
            link = links.get("href")
            hashStr = makeHash(naslov, datum)  # creates article hash from title and dateStr (HASH_VREDNOST)
            date_downloaded = todayDateStr  # date when the article was downloaded
            if sqlBase.getByHash(hashStr) is None:
                description = vsebina
                entry = (datum, naslov, description, date_downloaded, hashStr, link, SOURCE_ID)
                sqlBase.insertOne(entry)  # insert the article in the database
                print("Inserted successfully")
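# All of the scrapers in this file lean on the same small helper set
# (simple_get, uniformDateStr, makeHash, dbExecutor) defined elsewhere in the
# repository. A minimal, hypothetical sketch of the first three, inferred only
# from how they are called here (names, defaults, and hash choice are assumptions):
import datetime
import hashlib

import requests


def simple_get(url):
    # fetch a page and return its html text, or None when the request fails
    try:
        resp = requests.get(url, timeout=30)
        return resp.text if resp.status_code == 200 else None
    except requests.RequestException:
        return None


def uniformDateStr(dateStr, dateFormat="%d.%m.%Y"):
    # normalize a scraped date string into the uniform "%Y-%m-%d" format
    return datetime.datetime.strptime(dateStr.strip(), dateFormat).strftime("%Y-%m-%d")


def makeHash(title, dateStr=""):
    # article identity used for deduplication (HASH_VREDNOST)
    return hashlib.sha1((title + dateStr).encode("utf-8")).hexdigest()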
def get_text(stran, SOURCE_ID):
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    soup = BeautifulSoup(simple_get("http://raz.um.si/novice-in-dogodki/Strani/default.aspx#"), "html.parser")
    all_links = soup.find_all("a")
    for links in all_links:
        if links.get("href") is None:
            continue
        if re.match(r"/novice-in-dogodki/novica/Strani/+", links.get("href")):
            soup = BeautifulSoup(simple_get(parent_link + links.get("href")), "html.parser")
            naslov = soup.find("div", {"class": "naslov_novice"}).text.strip()
            datum = soup.find("div", {"class": "year"}).text.strip().split()
            vsebina = soup.find("div", {"class": "teloNovice"}).text.strip()
            datum = uniformDateStr(datum[1])
            link = links.get("href")
            hashStr = makeHash(naslov, datum)  # creates article hash from title and dateStr (HASH_VREDNOST)
            date_downloaded = todayDateStr  # date when the article was downloaded
            if sqlBase.getByHash(hashStr) is None:
                description = vsebina
                entry = (datum, naslov, description, date_downloaded, hashStr, link, SOURCE_ID)
                sqlBase.insertOne(entry)  # insert the article in the database
                print("Inserted successfully")
def get_text(stran, SOURCE_ID, startID):
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    soup = BeautifulSoup(simple_get(startID), "html.parser")
    all_links = soup.find_all("a")
    for links in all_links:
        if links.get("href") is None:
            continue
        if (re.match(r"http://www.zgs.si/aktualno/sporocila_za_javnost/news_article/+", links.get("href"))
                or re.match(r"http://www.zgs.si/aktualno/novice/news_article/+", links.get("href"))):
            soup = BeautifulSoup(simple_get(links.get("href")), "html.parser").find("div", {"class": "news news-single"})
            naslov = soup.find("h3").text
            datum = soup.find("time").text.split()
            datum = uniformDateStr(datum[0])
            vse = soup.find_all("p")
            vsebina = ""
            for obj in vse:
                vsebina += str(obj.text) + "\n"
            link = links.get("href")
            hashStr = makeHash(naslov, datum)  # creates article hash from title and dateStr (HASH_VREDNOST)
            date_downloaded = todayDateStr  # date when the article was downloaded
            if sqlBase.getByHash(hashStr) is None:
                # get article description/content
                description = vsebina
                # (date_created, caption, contents, date, hash, url, source)
                entry = (datum, naslov, description, date_downloaded, hashStr, link, SOURCE_ID)
                sqlBase.insertOne(entry)  # insert the article in the database
                print("Inserted successfully")
def get_text(stran, SOURCE_ID):
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    soup = BeautifulSoup(
        simple_get("https://www.cerknica.si/GetPosts?page=" + str(stran) +
                   "&keywords=&region_id=&municipality_id=31&category_id=8&subcategory_id=&highlighted=False&datum_objave="),
        "html.parser")
    all_links = soup.find_all("a")
    tmp = 0  # cycle 0..3 so only one of the repeated anchors per post is followed
    for links in all_links:
        if links.get("href") is None:
            continue
        if re.match(r"/objava/+", links.get("href")) and tmp == 0:
            try:
                soup = BeautifulSoup(simple_get(parent_link + links.get("href")), "html.parser")
                naslov = soup.find("h1", {"id": "main_title"}).text.strip()
                short_info = soup.find("div", {"id": "short_info"})
                if short_info is None or short_info == '':
                    datum = todayDateStr
                else:
                    try:
                        datum = uniformDateStr(short_info.text.strip())
                    except ValueError:
                        datum = todayDateStr
                vse = soup.find("div", {"class": "tab-content"}).find_all("p")
                vsebina = ""
                for obj in vse:
                    vsebina += str(obj.text).strip() + "\n"
                link = parent_link + links.get("href")
                hashStr = makeHash(naslov, datum)  # creates article hash from title and dateStr (HASH_VREDNOST)
                date_downloaded = todayDateStr  # date when the article was downloaded
                if sqlBase.getByHash(hashStr) is None:
                    description = vsebina
                    entry = (datum, naslov, description, date_downloaded, hashStr, link, SOURCE_ID)
                    sqlBase.insertOne(entry)  # insert the article in the database
                    print("Inserted successfully")
            except TypeError as e:
                print(e)
        if tmp < 3:
            tmp += 1
        else:
            tmp = 0
        print(tmp)
def main():
    articlesChecked = 0  # number of checked articles
    articlesDownloaded = 0  # number of downloaded articles
    # optionally set headers for the http request
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    with requests.Session() as s:
        # retry every http/https request up to MAX_HTTP_RETRIES times before raising an error
        s.mount("http://", requests.adapters.HTTPAdapter(max_retries=MAX_HTTP_RETRIES))
        s.mount("https://", requests.adapters.HTTPAdapter(max_retries=MAX_HTTP_RETRIES))
        s.headers.update(HEADERS)  # set headers of the session
        resp = s.get(POSTS_URL)
        soup = bs.BeautifulSoup(resp.text, "html.parser")
        articlesList = list()
        articlesList = getCurrentNews(s, soup, articlesList)
        articlesList = getArchivedNews(s, soup, articlesList)
        for article in articlesList:
            articlesChecked += 1
            try:
                title = article[0]
                link = article[1]
                shortDate = article[2]
                hashStr = makeHash(title, shortDate)
                # if the article is not yet saved in the database, add it
                if sqlBase.getByHash(hashStr) is None:
                    # get article description/content
                    description, dateStr = getArticleDescrAndDate(s, link)
                    date_created = uniformDateStr(dateStr, "%d.%m.%Y")
                    date_downloaded = todayDateStr  # date when the article was downloaded
                    # (date_created, caption, contents, date, hash, url, source)
                    entry = (date_created, title, description, date_downloaded, hashStr, link, SOURCE_ID)
                    sqlBase.insertOne(entry, True)  # insert the article in the database
                    articlesDownloaded += 1
                if articlesChecked % 5 == 0:
                    logger.info("Checked: {} articles. Downloaded: {} new articles.".format(articlesChecked, articlesDownloaded))
                if not firstRunBool and articlesChecked >= NUMBER_ARTICLES_TO_CHECK:
                    break
            except Exception:
                logger.error("Url on which the error occurred: {}".format(resp.url))
                logger.exception("")
                sys.exit()
    logger.info("Downloaded {} new articles.".format(articlesDownloaded))
def get_text(stran, SOURCE_ID):
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    soup = BeautifulSoup(simple_get("https://ptujinfo.com/lokalno?page=0%2C0%2C" + str(stran)), "html.parser")
    all_links = soup.find("div", {"class": "view__content"}).find_all("a")
    tmp = 0  # cycle 0..2 so only one of the repeated anchors per article is followed
    for links in all_links:
        if links.get("href") is None:
            continue
        if re.match(r"/novica/lokalno+", links.get("href")) and tmp == 2:
            soup = BeautifulSoup(simple_get(parent_link + links.get("href")), "html.parser")
            naslov = soup.find("div", {"class": "before-main__left"}).find("h1").text
            datum = soup.find("div", {"class": "before-main__left"}).find("span", {"class": "date"}).text.split()
            datum = uniformDateStr("".join((datum[0], meseci[datum[1].lower() + ","], datum[2])))
            podnaslov = soup.find("div", {"class": "field field--name-field-podnaslov"})
            podnaslov = podnaslov.text.strip() if podnaslov is not None else ""
            besedilo = soup.find("div", {"class": "field field--name-field-besedilo"})
            besedilo = besedilo.text.strip() if besedilo is not None else ""
            vsebina = podnaslov + "\n" + besedilo
            link = links.get("href")
            hashStr = makeHash(naslov, datum)  # creates article hash from title and dateStr (HASH_VREDNOST)
            date_downloaded = todayDateStr  # date when the article was downloaded
            if sqlBase.getByHash(hashStr) is None:
                entry = (datum, naslov, vsebina, date_downloaded, hashStr, link, SOURCE_ID)
                sqlBase.insertOne(entry)  # insert the article in the database
                print("Inserted successfully")
        if tmp < 2:
            tmp += 1
        else:
            tmp = 0
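# Several scrapers above and below index into a module-level meseci dictionary
# to turn month names into numbers before date parsing. The real table lives
# elsewhere in the repository; a plausible sketch consistent with the lookups
# here (values chosen so the joined string parses as "%d.%m.%Y"). Some call
# sites append a trailing comma to the scraped month name, and the English-
# language sources suggest the real table likely also carries English names.
meseci = {
    "januar": "1.", "februar": "2.", "marec": "3.", "april": "4.",
    "maj": "5.", "junij": "6.", "julij": "7.", "avgust": "8.",
    "september": "9.", "oktober": "10.", "november": "11.", "december": "12.",
}
meseci.update({k + ",": v for k, v in meseci.items()})  # comma-suffixed variants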
def get_text(stran, SOURCE_ID):
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    soup = BeautifulSoup(simple_get("https://innorenew.eu/news/#"), "html.parser")
    all_links = soup.find("div").find_all("a")
    tmp = 0  # cycle 0..2 so only one of the repeated anchors per article is followed
    for links in all_links:
        if links.get("href") is None:
            continue
        if re.match(r"https://innorenew.eu/[0-9][0-9][0-9][0-9/]+", links.get("href")) and tmp == 0:
            soup = BeautifulSoup(simple_get(links.get("href")), "html.parser")
            n_soup = soup.find("div", {"class": "col-md-7 col-md-push-5"})
            naslov = n_soup.find("h2").text.strip()
            d_soup = soup.find("aside", {"class": "col-md-3 col-md-pull-7"})
            soup = soup.find("article")
            dat = d_soup.find_all("p")
            datum = dat[1].text.split()
            datum = uniformDateStr("".join((datum[2], meseci[str(datum[1]).lower() + ","], datum[3])), "%d,%m.%Y")
            vse = soup.find_all("p")
            vsebina = ""
            tmp_bool = True  # skip the first paragraph of the article
            for obj in vse:
                if tmp_bool:
                    tmp_bool = False
                    continue
                vsebina += str(obj.text).strip() + "\n"
            link = links.get("href")
            hashStr = makeHash(naslov, datum)  # creates article hash from title and dateStr (HASH_VREDNOST)
            date_downloaded = todayDateStr  # date when the article was downloaded
            if sqlBase.getByHash(hashStr) is None:
                # get article description/content
                description = vsebina
                # (date_created, caption, contents, date, hash, url, source)
                entry = (datum, naslov, description, date_downloaded, hashStr, link, SOURCE_ID)
                sqlBase.insertOne(entry)  # insert the article in the database
                print("Inserted successfully")
        if tmp < 2:
            tmp += 1
        else:
            tmp = 0
def get_first_text(SOURCE_ID):
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    soup = BeautifulSoup(simple_get("https://www.vestnik.si/aktualno"), "html.parser")
    news = soup.find_all("div", {"class": "row w-clearfix"})
    for n in news:
        all_links = n.find_all("a")
        tmp = 0  # cycle 0..1 so only one of the paired anchors per article is followed
        for links in all_links:
            if links.get("href") is None:
                continue
            if re.match(r".*-[0-9]+", links.get("href")) and tmp == 0:
                try:
                    soup = BeautifulSoup(simple_get(parent_link + links.get("href")), "html.parser")
                    naslov = soup.find("div", {"class": "main-column"}).find("h1").text.strip()
                    datum = soup.find_all("div", {"class": "avtordatum"})[1].text
                    datum = uniformDateStr(datum)
                    vse = soup.find("div", {"class": "main-col w-clearfix"}).find_all("p")
                    vsebina = ""
                    for obj in vse:
                        vsebina += str(obj.text).strip() + "\n"
                    link = parent_link + links.get("href")
                    hashStr = makeHash(naslov, datum)  # creates article hash from title and dateStr (HASH_VREDNOST)
                    date_downloaded = todayDateStr  # date when the article was downloaded
                    if sqlBase.getByHash(hashStr) is None:
                        description = vsebina
                        entry = (datum, naslov, description, date_downloaded, hashStr, link, SOURCE_ID)
                        sqlBase.insertOne(entry)  # insert the article in the database
                        print("Inserted successfully")
                except TypeError as e:
                    print(e)
            if tmp < 1:
                tmp += 1
            else:
                tmp = 0
def get_text(stran, SOURCE_ID):
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    tmp = 0  # cycle 0..2 so only one of the repeated anchors per article is followed
    soup = BeautifulSoup(simple_get("https://www.tednik.si/tednik?limit=9&start=" + str(stran)), "html.parser")
    all_links = soup.find("div", {"class": "t3-content"}).find_all("a")
    for links in all_links:
        if links.get("href") is None:
            continue
        if re.match(r"/tednik/+", links.get("href")) and tmp == 0:
            soup = BeautifulSoup(simple_get(parent_link + links.get("href")), "html.parser")
            naslov = soup.find("h1", {"class": "article-title"}).text.strip()
            datum = soup.find("div", {"class": "article-main"}).find("time").text.strip().split()
            vse = soup.find("div", {"class": "article-content-main"}).find_all("p")
            vsebina = ""
            for obj in vse:
                vsebina += str(obj.text).strip() + "\n"
            datum = uniformDateStr("".join((datum[1] + ".", meseci[datum[2].lower()], datum[3])))
            link = links.get("href")
            hashStr = makeHash(naslov, datum)  # creates article hash from title and dateStr (HASH_VREDNOST)
            date_downloaded = todayDateStr  # date when the article was downloaded
            if sqlBase.getByHash(hashStr) is None:
                description = vsebina
                entry = (datum, naslov, description, date_downloaded, hashStr, link, SOURCE_ID)
                sqlBase.insertOne(entry)  # insert the article in the database
                print("Inserted successfully")
        if tmp < 2:
            tmp += 1
        else:
            tmp = 0
def get_text(stran, SOURCE_ID):
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    soup = BeautifulSoup(simple_get("https://www.pzs.si/novice.php?stran=" + str(stran) + "&limit=20&iskanec=&tip=1"), "html.parser")
    all_links = soup.find_all("table", {"class": "round7"})[4].find_all("a")
    tmp = 0  # cycle 0..1 so only one of the paired anchors per article is followed
    for links in all_links:
        if links.get("href") is None:
            continue
        if re.match(r"novice\.php\?pid=[0-9]+", links.get("href")) and tmp == 0:
            soup = BeautifulSoup(simple_get(parent_link + links.get("href")), "html.parser")
            soup = soup.find_all("table", {"class": "round7"})[4].find("div", {"class": "poravnava"})
            naslov = soup.find("h2").text
            datum = soup.find("b").text.split(",")[1].split()
            datum = uniformDateStr("".join((datum[0], meseci[datum[1] + ","], datum[2])))
            vse = soup.find("table").find_all("tr")
            tm = 0
            vsebina = ""
            for t in vse:
                if t.find("td", {"class": "poravnava"}):
                    vsebina += vse[tm - 1].text.strip() + "\n"
                    vsebina += t.find("td", {"class": "poravnava"}).text.strip()
                tm += 1
            link = links.get("href")
            hashStr = makeHash(naslov, datum)  # creates article hash from title and dateStr (HASH_VREDNOST)
            date_downloaded = todayDateStr  # date when the article was downloaded
            if sqlBase.getByHash(hashStr) is None:
                # get article description/content
                description = vsebina
                # (date_created, caption, contents, date, hash, url, source)
                entry = (datum, naslov, description, date_downloaded, hashStr, link, SOURCE_ID)
                sqlBase.insertOne(entry)  # insert the article in the database
                print("Inserted successfully")
        if tmp < 1:
            tmp += 1
        else:
            tmp = 0
def get_text(stran, SOURCE_ID):
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    soup = BeautifulSoup(simple_get("https://www.notranjski-park.si/novice-dogodki?cur_page=" + str(stran)), "html.parser")
    all_links = soup.find_all("a")
    for links in all_links:
        if links.get("href") is None:
            continue
        if re.match(r"https://www.notranjski-park.si/novice-dogodki/+", links.get("href")):
            soup = BeautifulSoup(simple_get(links.get("href")), "html.parser")
            naslov = soup.find("h1").text
            datum = soup.find("li", {"class": "date"}).text.split()
            if len(datum) == 0:
                datum = "2018-01-01"  # fallback when the page carries no date
            elif len(datum) == 3 and datum[1] == "ob":
                datum = uniformDateStr(datum[0])
            else:
                datum = uniformDateStr("".join((datum[0], meseci[datum[1]], datum[2])))
            vse = soup.find("div", {"class": "container small"}).find_all("p")
            vsebina = ""
            for obj in vse:
                vsebina += str(obj.text).strip() + "\n"
            link = links.get("href")
            hashStr = makeHash(naslov, datum)  # creates article hash from title and dateStr (HASH_VREDNOST)
            date_downloaded = todayDateStr  # date when the article was downloaded
            if sqlBase.getByHash(hashStr) is None:
                # get article description/content
                description = vsebina
                # (date_created, caption, contents, date, hash, url, source)
                entry = (datum, naslov, description, date_downloaded, hashStr, link, SOURCE_ID)
                sqlBase.insertOne(entry)  # insert the article in the database
                print("Inserted successfully")
def get_text(stran, SOURCE_ID):
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    all_links = ""
    try:
        soup = BeautifulSoup(simple_get("https://www.vestnik.si/aktualno?page=" + str(stran)), "html.parser")
        all_links = soup.find("div", {"class": "content-container"}).find_all("a")
    except TypeError as e:
        print(e)
    for links in all_links:
        try:
            soup = BeautifulSoup(simple_get(parent_link + links.get("href")), "html.parser")
            try:
                naslov = soup.find("div", {"class": "main-column"}).find("h1").text.strip()
                datum = soup.find_all("div", {"class": "avtordatum"})[1].text
                datum = uniformDateStr(datum)
                vse = soup.find("div", {"class": "main-col w-clearfix"}).find_all("p")
                vsebina = ""
                for obj in vse:
                    vsebina += str(obj.text).strip() + "\n"
                link = parent_link + links.get("href")
                hashStr = makeHash(naslov, datum)  # creates article hash from title and dateStr (HASH_VREDNOST)
                date_downloaded = todayDateStr  # date when the article was downloaded
            except AttributeError as e:
                print(e)
                continue  # skip this article if parsing failed, otherwise the names below are unbound
            if sqlBase.getByHash(hashStr) is None:
                description = vsebina
                entry = (datum, naslov, description, date_downloaded, hashStr, link, SOURCE_ID)
                sqlBase.insertOne(entry)  # insert the article in the database
                print("Inserted successfully")
        except TypeError as e:
            print(e)
def get_text(stran, SOURCE_ID):
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    soup = BeautifulSoup(simple_get("http://notranjskoprimorske.si/kategorije/vse-novice/page/" + str(stran)), "html.parser")
    all_links = soup.find_all("a")
    tmp = 0  # cycle 0..1 so only one of the paired anchors per article is followed
    for links in all_links:
        if links.get("href") is None:
            continue
        if re.match(r"http://notranjskoprimorske.si/[0-9][0-9][0-9][0-9]/+", links.get("href")) and tmp == 0:
            soup = BeautifulSoup(simple_get(links.get("href")), "html.parser")
            naslov = soup.find("h1", {"class": "entry-title"}).text
            datum = soup.find("span", {"class": "td-post-date"}).find("time").text.split()
            datum = uniformDateStr("".join((datum[1], meseci[datum[2]], datum[3])))
            vse = soup.find("div", {"class": "td-post-content"}).find_all("p")
            vsebina = ""
            for obj in vse:
                vsebina += str(obj.text).strip() + "\n"
            link = links.get("href")
            hashStr = makeHash(naslov, datum)  # creates article hash from title and dateStr (HASH_VREDNOST)
            date_downloaded = todayDateStr  # date when the article was downloaded
            if sqlBase.getByHash(hashStr) is None:
                description = vsebina
                entry = (datum, naslov, description, date_downloaded, hashStr, link, SOURCE_ID)
                sqlBase.insertOne(entry)  # insert the article in the database
                print("Inserted successfully")
        if tmp < 1:
            tmp += 1
        else:
            tmp = 0
def get_text(stran, SOURCE_ID, leto):
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    soup = BeautifulSoup(simple_get("https://www.pomurec.com/go/203/arhiv/" + str(leto) + "/" + str(stran)), "html.parser")
    all_links = soup.find("div", {"id": "vsebina"}).find_all("a")
    tmp = 0  # cycle 0..3 so only one of the repeated anchors per article is followed
    for links in all_links:
        print(links.get("href"))
        if links.get("href") is None:
            continue
        if re.match(r"/vsebina/+", links.get("href")) and tmp == 0:
            soup = BeautifulSoup(simple_get(parent_link + links.get("href")), "html.parser")
            naslov = soup.find("div", {"id": "vsebina"}).find("h3").text.strip()
            datum = soup.find("div", {"id": "vsebina"}).find("p", {"class": "datum"}).text.strip().split(",")
            if len(datum) == 1:
                if datum[0] == '':
                    # no date on the page; keep the tmp cycle consistent and skip
                    if tmp < 3:
                        tmp += 1
                    else:
                        tmp = 0
                    continue
                datum = datum[0].split()
                datum = uniformDateStr(datum[0])
            elif len(datum) == 2:
                datum = datum[1].split()
                datum = uniformDateStr(datum[0])
            vse = soup.find("div", {"id": "vsebina"}).find("ul", {"id": "clanek"}).find_all("p")
            vsebina = ""
            for obj in vse:
                vsebina += str(obj.text).strip() + "\n"
            link = links.get("href")
            hashStr = makeHash(naslov, datum)  # creates article hash from title and dateStr (HASH_VREDNOST)
            date_downloaded = todayDateStr  # date when the article was downloaded
            if sqlBase.getByHash(hashStr) is None:
                description = vsebina
                entry = (datum, naslov, description, date_downloaded, hashStr, link, SOURCE_ID)
                sqlBase.insertOne(entry)  # insert the article in the database
                print("Inserted successfully")
        if tmp < 3:
            tmp += 1
        else:
            tmp = 0
def get_text(stran, SOURCE_ID):
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    soup = BeautifulSoup(simple_get("http://www.danube-region.eu/communication/news?start=" + str(stran)), "html.parser")
    all_links = soup.find("main", {"id": "content"}).find_all("a")
    for links in all_links:
        if links.get("href") is None:
            continue
        if re.match(r"/communication/news/+", links.get("href")):
            soup = BeautifulSoup(simple_get(parent_link + links.get("href")), "html.parser")
            naslov = soup.find("div", {"class": "page-header"}).find("h2").text.strip().split("\n")[0]
            datum = soup.find("dd", {"class": "published"}).text.split(":")[1].split()
            datum = uniformDateStr("".join((datum[0] + ".", meseci[datum[1] + ","], datum[2])))
            vse = soup.find("div", {"class": "item-page"}).find_all("p")
            vsebina = ""
            for p in vse:
                vsebina += p.text.strip()
            link = links.get("href")
            hashStr = makeHash(naslov, datum)  # creates article hash from title and dateStr (HASH_VREDNOST)
            date_downloaded = todayDateStr  # date when the article was downloaded
            if sqlBase.getByHash(hashStr) is None:
                # get article description/content
                description = vsebina
                # (date_created, caption, contents, date, hash, url, source)
                entry = (datum, naslov, description, date_downloaded, hashStr, link, SOURCE_ID)
                sqlBase.insertOne(entry)  # insert the article in the database
                print("Inserted successfully")
def get_text(stran, SOURCE_ID):
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    soup = BeautifulSoup(simple_get("http://www.bistra.si/aktualne-novice/novice?start=" + str(stran)), "html.parser")
    all_links = soup.find_all("a")
    for links in all_links:
        if links.get("href") is None:
            continue
        if re.match(r"/aktualne-novice/novice/+", links.get("href")):
            soup = BeautifulSoup(simple_get(parent_link + links.get("href")), "html.parser")
            content = str(soup.find("div", {"class": "articleBody"}).text)
            title = str(soup.find("div", {"class": "entry-header"}).text)
            datestr = str(soup.find("dd", {"class": "create"}).text)
            title = re.sub(r'\t+', '', title)
            title = re.sub(r'\n+', '', title)
            datestr = re.sub(r'\t+', '', datestr)
            datestr = re.sub(r'\n+', '', datestr)
            datestr = datestr.split()
            datestr = ''.join([datestr[1], meseci[datestr[2]], datestr[3]])
            datestr = uniformDateStr(datestr, "%d.%m.%Y")
            link = parent_link + links.get("href")
            hashStr = makeHash(title, datestr)  # creates article hash from title and dateStr (HASH_VREDNOST)
            date_downloaded = todayDateStr  # date when the article was downloaded
            if sqlBase.getByHash(hashStr) is None:
                # get article description/content
                description = content
                entry = (datestr, title, description, date_downloaded, hashStr, link, SOURCE_ID)
                sqlBase.insertOne(entry)  # insert the article in the database
                print("Inserted successfully")
def get_text(stran, SOURCE_ID):
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    soup = BeautifulSoup(simple_get("https://www.rtvslo.si/lokalne-novice/ljubljana/arhiv/?&page=" + str(stran)), "html.parser")
    all_links = soup.find_all("a", {"class": "title"})
    prev_link = ""
    for links in all_links:
        if prev_link == links:  # skip consecutive duplicate anchors
            continue
        prev_link = links
        if links.get("href") is None:
            continue
        if not re.match(r"http://+", links.get("href")):
            print(parent_link + links.get("href"))
            soup = BeautifulSoup(simple_get(parent_link + links.get("href")), "html.parser")
            tmp = soup.find("div", {"id": "newsbody"})
            if tmp is None:
                continue
            naslov = str(tmp.find("h1").text)
            vse = tmp.find_all("p")
            vsebina = ""
            for obj in vse:
                vsebina += str(obj.text) + "\n"
            dat = str(tmp.find("div", {"class": "info"}).text).split()
            datum = uniformDateStr("".join((dat[0], meseci[dat[1]], dat[2])))
            link = parent_link + links.get("href")
            hashStr = makeHash(naslov, datum)  # creates article hash from title and dateStr (HASH_VREDNOST)
            date_downloaded = todayDateStr  # date when the article was downloaded
            if sqlBase.getByHash(hashStr) is None:
                # get article description/content
                description = vsebina
                # (date_created, caption, contents, date, hash, url, source)
                entry = (datum, naslov, description, date_downloaded, hashStr, link, SOURCE_ID)
                sqlBase.insertOne(entry)  # insert the article in the database
                print("Inserted successfully")
def get_text(stran, SOURCE_ID):
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    soup = BeautifulSoup(simple_get("https://www.izvoznookno.si/aktualno?p=" + str(stran) + "&t1=1&c=&s=&pn=&t4=4&q="), "html.parser")
    all_links = soup.find_all("a")
    for links in all_links:
        if links.get("href") is None:
            continue
        if re.match(r"/Aktualno/+", links.get("href")):
            soup = BeautifulSoup(simple_get(parent_link + links.get("href")), "html.parser")
            naslov = soup.find("article", {"class": "col-md-12"}).find("h1").text
            datum = soup.find("article", {"class": "col-md-12"}).find("date").text.split()
            datum = uniformDateStr("".join((datum[0], meseci[datum[1]], datum[2])))
            vsebina = soup.find("div", {"class": "user-html"}).text
            link = parent_link + links.get("href")
            hashStr = makeHash(naslov, datum)  # creates article hash from title and dateStr (HASH_VREDNOST)
            date_downloaded = todayDateStr  # date when the article was downloaded
            if sqlBase.getByHash(hashStr) is None:
                description = vsebina
                entry = (datum, naslov, description, date_downloaded, hashStr, link, SOURCE_ID)
                sqlBase.insertOne(entry)  # insert the article in the database
                print("Inserted successfully")
def get_text(stran, SOURCE_ID):
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    soup = BeautifulSoup(simple_get("http://p-tech.si/category/novice/page/" + str(stran)), "html.parser")
    all_links = soup.find_all("div")
    for links in all_links:
        if links.get("id") is None:
            continue
        if re.match(r"post+", links.get("id")):
            naslov = links.find("h2").text
            datum = links.find("span", {"class": "published"}).text.split()
            datum = uniformDateStr("".join((datum[0], meseci[datum[1]], datum[2])))
            hashStr = makeHash(naslov, datum)  # creates article hash from title and dateStr (HASH_VREDNOST)
            if sqlBase.getByHash(hashStr) is None:
                vsebina = links.find("div", {"class": "page-content"}).text.strip()
                link = links.find("a").get("href")
                date_downloaded = todayDateStr  # date when the article was downloaded
                # get article description/content
                description = vsebina
                # (date_created, caption, contents, date, hash, url, source)
                entry = (datum, naslov, description, date_downloaded, hashStr, link, SOURCE_ID)
                sqlBase.insertOne(entry)  # insert the article in the database
                print("Inserted successfully")
def main():
    articlesChecked = 0  # number of checked articles
    articlesDownloaded = 0  # number of downloaded articles
    # optionally set headers for the http request
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    with requests.Session() as s:
        # retry every http/https request up to MAX_HTTP_RETRIES times before raising an error
        s.mount("http://", requests.adapters.HTTPAdapter(max_retries=MAX_HTTP_RETRIES))
        s.mount("https://", requests.adapters.HTTPAdapter(max_retries=MAX_HTTP_RETRIES))
        s.headers.update(HEADERS)  # set headers of the session
        pages = [BASE_URL + "/sl/informacija.asp?id_meta_type=54&type_informacij=0",
                 BASE_URL + "/sl/informacija.asp?id_meta_type=68&type_informacij=0"]
        resp = s.get(pages[0])
        soup = bs.BeautifulSoup(resp.text, "html.parser")
        allNewsLinksHtml = soup.find_all("span", class_="vec")
        resp = s.get(pages[1])
        soup = bs.BeautifulSoup(resp.text, "html.parser")
        allNewsLinksHtml2 = soup.find_all("span", class_="vec")
        # interleave the two lists into one:
        # list1: [1,2,3,4]
        # list2: [5,6,7,8]
        # output list: [5,1,6,2,7,3,8,4]
        for num, pg in enumerate(allNewsLinksHtml2):
            allNewsLinksHtml.insert(num * 2, pg)
        for newsLink in allNewsLinksHtml:
            try:
                articlesChecked += 1
                link = BASE_URL + "/sl/" + newsLink.find("a")["href"]
                resp = s.get(link)
                soup = bs.BeautifulSoup(resp.text, "html.parser")
                subPage = soup.find("div", class_="Vsebina")
                title = subPage.find("h2").text
                dateStr = parseDate(subPage.find_all("p"))
                hashStr = makeHash(title, dateStr)
                description = subPage.text
                date_created = uniformDateStr(dateStr, "%d.%m.%Y")  # date when the article was published on the page
                date_downloaded = todayDateStr  # date when the article was downloaded
                # if the article is not yet saved in the database, add it
                if sqlBase.getByHash(hashStr) is None:
                    # (date_created, caption, contents, date, hash, url, source)
                    entry = (date_created, title, description, date_downloaded, hashStr, link, SOURCE_ID)
                    sqlBase.insertOne(entry, True)  # insert the article in the database
                    articlesDownloaded += 1
                if articlesChecked % 5 == 0:
                    logger.info("Checked: {} articles. Downloaded: {} new articles.".format(articlesChecked, articlesDownloaded))
                if not firstRunBool and articlesChecked >= NUMBER_ARTICLES_TO_CHECK:
                    break
            except Exception:
                logger.error("Url on which the error occurred: {}".format(resp.url))
                logger.exception("")
                sys.exit()
    logger.info("Downloaded {} new articles.".format(articlesDownloaded))
def main():
    pagesChecked = 0  # number of checked pages
    articlesChecked = 0  # number of checked articles
    articlesDownloaded = 0  # number of downloaded articles
    # optionally set headers for the http request
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    # creates a session
    with requests.Session() as s:
        # retry every http/https request up to MAX_HTTP_RETRIES times before raising an error
        s.mount("http://", requests.adapters.HTTPAdapter(max_retries=MAX_HTTP_RETRIES))
        s.mount("https://", requests.adapters.HTTPAdapter(max_retries=MAX_HTTP_RETRIES))
        s.headers.update(HEADERS)  # set headers of the session
        resp = s.get(BASE_URL + "/mediji/Novice/ArticlePage/1")
        soup = bs.BeautifulSoup(resp.text, "html.parser")
        logger.info("Checking page 1")
        lastPageNum = int(soup.find("div", class_="pager").find_all("a")[-2].text)
        for pageNum in range(2, lastPageNum + 1, 1):
            try:
                pagesChecked += 1
                # find all ~15 articles on the current page
                articles = soup.find_all("div", class_="article")
                for article in articles:
                    articlesChecked += 1
                    title = article.find("h1", class_="heading").find("a").text  # article title
                    link = article.find("h1", class_="heading").find("a")["href"]  # article http link
                    dateStr = article.find("div", class_="metadata")  # article date (DATUM_VNOSA)
                    [x.extract() for x in dateStr.find_all('span', class_="pull-right")]
                    date_created = parseDate(dateStr.text)
                    hashStr = makeHash(title, date_created)  # creates article hash from title and dateStr (HASH_VREDNOST)
                    date_downloaded = todayDateStr  # date when the article was downloaded
                    # if the article is not yet saved in the database, add it
                    if sqlBase.getByHash(hashStr) is None:
                        # get article description/content
                        description = getArticleDescr(s, link)
                        # (date_created, caption, contents, date, hash, url, source)
                        entry = (date_created, title, description, date_downloaded, hashStr, link, SOURCE_ID)
                        sqlBase.insertOne(entry, True)  # insert the article in the database
                        articlesDownloaded += 1
                    if articlesChecked % 5 == 0:
                        logger.info("Checked: {} articles. Downloaded: {} new articles.".format(articlesChecked, articlesDownloaded))
                if not firstRunBool and pagesChecked >= NUM_PAGES_TO_CHECK:
                    break
                # find next page
                resp = s.get(BASE_URL + "/mediji/Novice/ArticlePage/" + str(pageNum))
                soup = bs.BeautifulSoup(resp.text, "html.parser")
                logger.info("Checking page: {}".format(pageNum))
            except Exception:
                logger.error("Url on which the error occurred: {}".format(resp.url))
                logger.exception("")
                sys.exit()
    logger.info("Downloaded {} new articles.".format(articlesDownloaded))
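# The main() variants in this file call getArticleDescr(s, link) to pull an
# article's body text. Its definition is not part of this section; a minimal,
# hypothetical sketch of the assumed behavior (the real selector is
# page-specific, so this falls back to the whole page text):
def getArticleDescr(session, link):
    resp = session.get(link)
    soup = bs.BeautifulSoup(resp.text, "html.parser")  # same bs alias as above
    return soup.get_text(separator="\n").strip()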
def main():
    articlesChecked = 0  # number of checked articles
    articlesDownloaded = 0  # number of downloaded articles
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    yearInt = datetime.datetime.now().year
    # optionally set headers for the http request
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }
    with requests.Session() as s:
        # retry every http/https request up to MAX_HTTP_RETRIES times before raising an error
        s.mount("http://", requests.adapters.HTTPAdapter(max_retries=MAX_HTTP_RETRIES))
        s.mount("https://", requests.adapters.HTTPAdapter(max_retries=MAX_HTTP_RETRIES))
        s.headers.update(HEADERS)  # set headers of the session
        for subPage in ["/obvestila-za-obcane/", "/sporocila-za-javnost/"]:
            logger.info("First checking subpage: {}".format(subPage))
            maxYearToCheck = MAX_YEAR - 1
            if not firstRunBool:
                maxYearToCheck = yearInt - 1
            for yearNum in range(yearInt, maxYearToCheck, -1):
                logger.info("Checking year: {}".format(yearNum))
                pagelink = BASE_URL + subPage + str(yearNum)
                try:
                    resp = s.get(pagelink)
                    soup = bs.BeautifulSoup(resp.text, "html.parser")
                    monthListings = soup.find_all("div", class_="newsArchive-group")
                    for monthListing in monthListings:
                        articleListings = monthListing.find_all("div", class_="news-listing-item")
                        for article in articleListings:
                            articlesChecked += 1
                            title = article.find("h2").text
                            link = BASE_URL + "/" + str(article.find("h2").find("a")["href"])
                            dateStr = article.find("div", class_="news-listing-item-date").text
                            hashStr = makeHash(title, dateStr)
                            date_created = uniformDateStr(dateStr, "%d. %m. %Y")  # date when the article was published on the page
                            date_downloaded = todayDateStr  # date when the article was downloaded
                            # if the article is not yet saved in the database, add it
                            if sqlBase.getByHash(hashStr) is None:
                                # get article description/content
                                description = getArticleDescr(s, link)
                                # (date_created, caption, contents, date, hash, url, source)
                                entry = (date_created, title, description, date_downloaded, hashStr, link, SOURCE_ID)
                                sqlBase.insertOne(entry, True)  # insert the article in the database
                                articlesDownloaded += 1
                            if articlesChecked % 5 == 0:
                                logger.info("Checked: {} articles. Downloaded: {} new articles.".format(articlesChecked, articlesDownloaded))
                except Exception:
                    logger.error("Url on which the error occurred: {}".format(resp.url))
                    logger.exception("")
                    sys.exit()
    logger.info("Downloaded {} new articles.".format(articlesDownloaded))
def main():
    pagesChecked = 0  # number of checked pages
    articlesChecked = 0  # number of checked articles
    articlesDownloaded = 0  # number of downloaded articles
    # optionally set headers for the http request
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    # creates a session
    with requests.Session() as s:
        pageStart = 0  # set at which page (article) to start
        # retry every http/https request up to MAX_HTTP_RETRIES times before raising an error
        s.mount("http://", requests.adapters.HTTPAdapter(max_retries=MAX_HTTP_RETRIES))
        s.mount("https://", requests.adapters.HTTPAdapter(max_retries=MAX_HTTP_RETRIES))
        s.headers.update(HEADERS)  # set headers of the session
        # send a get request to the page (use s.post(...) if a post request is needed)
        resp = s.get(BASE_URL + "/si/novice/")
        # parse the html text of the http response
        soup = bs.BeautifulSoup(resp.text, "html.parser")
        # find the "next page" button link - to import all the news recursively
        try:
            nextPageLink = BASE_URL + "/" + soup.find("div", class_="pager").find_all("a")[-1]["href"]
        except KeyError:
            nextPageLink = None
            logger.exception("")
        while nextPageLink is not None:
            try:
                pagesChecked += 1
                # find all ~15 articles on the current page
                articles = soup.find_all("div", class_="newswrapp")
                for article in articles:
                    articlesChecked += 1
                    textPart = article.find("div", class_="newstext")
                    title = textPart.find("h2").find("a").text  # article title
                    link = BASE_URL + textPart.find("h2").find("a")["href"]  # article http link
                    date_created = parseDate(textPart.find("div", class_="date").text)  # article date (DATUM_VNOSA)
                    hashStr = makeHash(title, date_created)  # creates article hash from title and dateStr (HASH_VREDNOST)
                    date_downloaded = todayDateStr  # date when the article was downloaded
                    # if the article is not yet saved in the database, add it
                    if sqlBase.getByHash(hashStr) is None:
                        # get article description/content
                        description = getArticleDescr(s, link)
                        # (date_created, caption, contents, date, hash, url, source)
                        entry = (date_created, title, description, date_downloaded, hashStr, link, SOURCE_ID)
                        sqlBase.insertOne(entry, True)  # insert the article in the database
                        articlesDownloaded += 1
                    if articlesChecked % 5 == 0:
                        logger.info("Checked: {} articles. Downloaded: {} new articles.".format(articlesChecked, articlesDownloaded))
                # find next page
                resp = s.get(nextPageLink)  # load the next page
                soup = bs.BeautifulSoup(resp.text, "html.parser")  # add the html text to the soup
                try:
                    # select the "next page" button http link
                    nextPageLink = BASE_URL + "/" + soup.find("div", class_="pager").find_all("a")[-1]["href"]
                except AttributeError:
                    nextPageLink = None
                    logger.exception("")
                except KeyError:
                    nextPageLink = None
                    logger.exception("")
                if not firstRunBool and pagesChecked >= NUM_PAGES_TO_CHECK:
                    break
            except Exception:
                logger.error("Url on which the error occurred: {}".format(resp.url))
                logger.exception("")
                sys.exit()
        resp = s.get(BASE_URL + "/si/dogodki/")
        # parse the html text of the http response
        soup = bs.BeautifulSoup(resp.text, "html.parser")
        pagesChecked += 1
        # find all ~15 articles on the current page
        articles = soup.find_all("div", class_="newswrapp")
        for article in articles:
            try:
                articlesChecked += 1
                textPart = article.find("div", class_="newstext")
                title = textPart.find("h2").find("a").text  # article title
                link = BASE_URL + textPart.find("h2").find("a")["href"]  # article http link
                date_created = ""  # events carry no date (DATUM_VNOSA)
                hashStr = makeHash(title, date_created)  # creates article hash from title and dateStr (HASH_VREDNOST)
                date_created = None
                date_downloaded = todayDateStr  # date when the article was downloaded
                # if the article is not yet saved in the database, add it
                if sqlBase.getByHash(hashStr) is None:
                    # get article description/content
                    description = getArticleDescr(s, link)
                    # (date_created, caption, contents, date, hash, url, source)
                    entry = (date_created, title, description, date_downloaded, hashStr, link, SOURCE_ID)
                    sqlBase.insertOne(entry, True)  # insert the article in the database
                    articlesDownloaded += 1
                if articlesChecked % 5 == 0:
                    logger.info("Checked: {} articles. Downloaded: {} new articles.".format(articlesChecked, articlesDownloaded))
                if not firstRunBool and articlesChecked >= NUM_ARTICLES_TO_CHECK:
                    break
            except Exception:
                logger.exception("")
    logger.info("Downloaded {} new articles.".format(articlesDownloaded))
def main():
    pagesChecked = 0  # number of checked pages
    articlesChecked = 0  # number of checked articles
    articlesDownloaded = 0  # number of downloaded articles
    # optionally set headers for the http request
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    # creates a session
    with requests.Session() as s:
        # retry every http/https request up to MAX_HTTP_RETRIES times before raising an error
        s.mount("http://", requests.adapters.HTTPAdapter(max_retries=MAX_HTTP_RETRIES))
        s.mount("https://", requests.adapters.HTTPAdapter(max_retries=MAX_HTTP_RETRIES))
        s.headers.update(HEADERS)  # set headers of the session
        resp = s.get(SUB_PAGES_URL + "1")
        soup = bs.BeautifulSoup(resp.text, "html.parser")
        # find the "next page" button link - to import all the news recursively
        nextPageLink = soup.find("div", class_="pagination clearfix").find("div", class_="alignleft").find("a")
        pageStart = 1  # set at which page (article) to start
        while nextPageLink is not None:
            try:
                pagesChecked += 1
                # find all ~15 articles on the current page
                articles = soup.find_all("article")
                for article in articles:
                    articlesChecked += 1
                    title = article.find("h2", class_="entry-title").text  # article title
                    link = article.find("h2", class_="entry-title").find("a")["href"]  # article http link
                    dateStr = ""
                    hashStr = makeHash(title, dateStr)  # creates article hash from title and dateStr (HASH_VREDNOST)
                    date_created = None  # date when the article was published on the page
                    date_downloaded = todayDateStr  # date when the article was downloaded
                    # if the article is not yet saved in the database, add it
                    if sqlBase.getByHash(hashStr) is None:
                        # get article description/content
                        description = getArticleDescr(s, link)
                        # (date_created, caption, contents, date, hash, url, source)
                        entry = (date_created, title, description, date_downloaded, hashStr, link, SOURCE_ID)
                        sqlBase.insertOne(entry, True)  # insert the article in the database
                        articlesDownloaded += 1
                    if articlesChecked % 5 == 0:
                        logger.info("Checked: {} articles. Downloaded: {} new articles.".format(articlesChecked, articlesDownloaded))
                # find next page
                try:
                    # select the "next page" button http link
                    nextPageLink = soup.find("div", class_="pagination clearfix").find("div", class_="alignleft").find("a")["href"]
                    logger.debug("Checking page: {}".format(nextPageLink))
                    resp = s.get(nextPageLink)  # load the next page
                    soup = bs.BeautifulSoup(resp.text, "html.parser")  # add the html text to the soup
                except Exception:
                    logger.exception("Can not find next page")
                if not firstRunBool and pagesChecked >= NUM_PAGES_TO_CHECK - 1:
                    break
            except Exception:
                logger.error("Url on which the error occurred: {}".format(resp.url))
                logger.exception("")
                sys.exit()
    logger.info("Downloaded {} new articles.".format(articlesDownloaded))
def main():
    pagesChecked = 0  # number of checked pages
    articlesChecked = 0  # number of checked articles
    articlesDownloaded = 0  # number of downloaded articles
    # optionally set headers for the http request
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    # creates a session
    with requests.Session() as s:
        # retry every http/https request up to MAX_HTTP_RETRIES times before raising an error
        s.mount("http://", requests.adapters.HTTPAdapter(max_retries=MAX_HTTP_RETRIES))
        s.mount("https://", requests.adapters.HTTPAdapter(max_retries=MAX_HTTP_RETRIES))
        s.headers.update(HEADERS)  # set headers of the session
        archiveLinks = getCalArchiveLinks(s)
        if firstRunBool:
            logger.debug("Number of found days to check: {}".format(len(archiveLinks)))
        # iterate over the archived days
        for archiveDay, archiveLink in enumerate(archiveLinks):
            try:
                archiveLinkPage = archiveLink[0]
                logger.info("Downloading page: {}".format(archiveLinkPage))
                resp = s.get(archiveLinkPage)
                soup = bs.BeautifulSoup(resp.text, "html.parser")
                pagesChecked += 1
                articles = soup.find("div", class_="row news").find_all("div", class_="col-sm-4")
                logger.debug("Number of found articles: {}".format(len(articles)))
                for article in articles:
                    articlesChecked += 1
                    title = article.find("span", class_="h2").text  # article title
                    link = BASE_URL + article.find("a")["href"]  # article http link
                    date_created = archiveLink[1]  # article date (DATUM_VNOSA)
                    hashStr = makeHash(title, date_created)  # creates article hash from title and dateStr (HASH_VREDNOST)
                    date_created = uniformDateStr(date_created, "%Y-%m-%d")
                    date_downloaded = todayDateStr  # date when the article was downloaded
                    # if the article is not yet saved in the database, add it
                    if sqlBase.getByHash(hashStr) is None:
                        # get article description/content
                        description = getArticleDescr(s, link)
                        # (date_created, caption, contents, date, hash, url, source)
                        entry = (date_created, title, description, date_downloaded, hashStr, link, SOURCE_ID)
                        sqlBase.insertOne(entry, True)  # insert the article in the database
                        articlesDownloaded += 1
                    if articlesChecked % 5 == 0:
                        logger.info("Checked: {} articles. Downloaded: {} new articles.".format(articlesChecked, articlesDownloaded))
                if not firstRunBool and archiveDay >= NUM_DAYS_TO_CHECK:
                    break
            except Exception:
                logger.error("Url on which the error occurred: {}".format(resp.url))
                logger.exception("")
                sys.exit()
    logger.info("Downloaded {} new articles.".format(articlesDownloaded))
def get_text(stran, SOURCE_ID):
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    soup = BeautifulSoup(simple_get("http://pomurske-novice.si/rubrika/pomurje/page/" + str(stran)), "html.parser")
    all_links = soup.find("div", {"class": "td-ss-main-content"}).find_all("a")
    tmp = 0  # cycle 0..2 so only one of the repeated anchors per article is followed
    for links in all_links:
        if links.get("href") is None:
            continue
        if (re.match(r"http://pomurske-novice.si/+", links.get("href")) and tmp == 0
                and not re.match(r"http://pomurske-novice.si/rubrika/+", links.get("href"))):
            print(links.get("href"))
            try:
                soup = BeautifulSoup(simple_get(links.get("href")), "html.parser")
                naslov = soup.find("h1", {"class": "entry-title"}).text.strip()
                if soup.find("time") is None:
                    datum = todayDateStr
                else:
                    try:
                        datum = soup.find("time").text.strip().split()
                        datum = uniformDateStr("".join((datum[1], meseci[datum[0] + ","], datum[2])))
                    except ValueError:
                        datum = todayDateStr
                vse = soup.find("div", {"class": "td-post-content"}).find_all("p")
                vsebina = ""
                for obj in vse:
                    vsebina += str(obj.text).strip() + "\n"
                link = links.get("href")
                hashStr = makeHash(naslov, datum)  # creates article hash from title and dateStr (HASH_VREDNOST)
                date_downloaded = todayDateStr  # date when the article was downloaded
                if sqlBase.getByHash(hashStr) is None:
                    description = vsebina
                    entry = (datum, naslov, description, date_downloaded, hashStr, link, SOURCE_ID)
                    sqlBase.insertOne(entry)  # insert the article in the database
                    print("Inserted successfully")
            except TypeError as e:
                print(e)
        if tmp < 2:
            tmp += 1
        else:
            tmp = 0
        print(tmp)
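# How the paginated get_text scrapers are driven is not shown in this section;
# a hedged usage sketch, with SOURCE_ID and the page range as illustrative
# assumptions:
if __name__ == "__main__":
    SOURCE_ID = 1  # assumed source identifier for the database
    for stran in range(1, 11):  # assumed: walk the first ten listing pages
        get_text(stran, SOURCE_ID)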
def main():
    articlesChecked = 0  # number of checked articles
    articlesDownloaded = 0  # number of downloaded articles
    # optionally set headers for the http request
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    with requests.Session() as s:
        # retry every http/https request up to MAX_HTTP_RETRIES times before raising an error
        s.mount("http://", requests.adapters.HTTPAdapter(max_retries=MAX_HTTP_RETRIES))
        s.mount("https://", requests.adapters.HTTPAdapter(max_retries=MAX_HTTP_RETRIES))
        s.headers.update(HEADERS)  # set headers of the session
        resp = s.get(POSTS_URL)
        soup = bs.BeautifulSoup(resp.text, "html.parser")
        articlesTitle = soup.find("div", class_="main", id="ResizeBody").find_all("div", class_="blogPostTitle")
        articlesDate = soup.find("div", class_="main", id="ResizeBody").find_all("div", class_="blogPostDate")
        if len(articlesTitle) != len(articlesDate):
            logger.error("Page is different: number of title and date <div> tags is different.")
            sys.exit()
        for num in range(len(articlesTitle)):
            articlesChecked += 1
            try:
                title = articlesTitle[num].find("h2").text
                link = BASE_URL + articlesTitle[num].find("h2").find("a")["href"][2:]
                logger.debug("TITLE: {}\nLINK: {}".format(title.encode("utf-8"), link))
                dateStr = articlesDate[num].text.split(" |")[0]
                hashStr = makeHash(title, dateStr)
                date_created = uniformDateStr(dateStr, "%d.%m.%y")  # date when the article was published on the page
                logger.debug("DATE: {}".format(date_created))
                date_downloaded = todayDateStr  # date when the article was downloaded
                # if the article is not yet saved in the database, add it
                if sqlBase.getByHash(hashStr) is None:
                    # get article description/content
                    description = getArticleDescr(s, link)
                    # (date_created, caption, contents, date, hash, url, source)
                    entry = (date_created, title, description, date_downloaded, hashStr, link, SOURCE_ID)
                    sqlBase.insertOne(entry, True)  # insert the article in the database
                    articlesDownloaded += 1
                if articlesChecked % 5 == 0:
                    logger.info("Checked: {} articles. Downloaded: {} new articles.".format(articlesChecked, articlesDownloaded))
                if not firstRunBool and articlesChecked >= NUMBER_ARTICLES_TO_CHECK:
                    break
            except Exception:
                logger.error("Url on which the error occurred: {}".format(resp.url))
                logger.exception("")
                sys.exit()
    logger.info("Downloaded {} new articles.".format(articlesDownloaded))
def main():
    articlesChecked = 0  # number of checked articles
    articlesDownloaded = 0  # number of downloaded articles
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    yearInt = datetime.datetime.now().year
    # optionally set headers for the http request
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }
    with requests.Session() as s:
        # retry every http/https request up to MAX_HTTP_RETRIES times before raising an error
        s.mount("http://", requests.adapters.HTTPAdapter(max_retries=MAX_HTTP_RETRIES))
        s.mount("https://", requests.adapters.HTTPAdapter(max_retries=MAX_HTTP_RETRIES))
        s.headers.update(HEADERS)  # set headers of the session
        maxYearToCheck = MAX_YEAR - 1
        if not firstRunBool:
            maxYearToCheck = yearInt - 1
        for yearNum in range(yearInt, maxYearToCheck, -1):
            logger.info("Checking year: {}".format(yearNum))
            yearPageLink = POSTS_URL + str(yearNum)
            try:
                resp = s.get(yearPageLink)
                soup = bs.BeautifulSoup(resp.text, "html.parser")
                articleDates = soup.find("div", id="contentCenterSubjectBlock").find_all("span", class_="contentCenterSubjectBlockDate")
                articleLinks = soup.find("div", id="contentCenterSubjectBlock").find_all("a")
                if len(articleDates) != len(articleLinks):
                    logger.error("Page is different: can not find the same number of dates and links.")
                    sys.exit(1)
                for num in range(len(articleDates)):
                    articlesChecked += 1
                    title = articleLinks[num].text
                    link = BASE_URL + "/" + articleLinks[num]["href"]
                    dateStr = articleDates[num].text.strip(" ").strip("\n").strip("\r")
                    hashStr = makeHash(title, dateStr)
                    logger.debug("TITLE: {}".format(title.encode("utf-8")))
                    logger.debug("LINK: {}".format(link))
                    logger.debug("DATE: {}".format(dateStr))
                    date_created = uniformDateStr(dateStr, "%d.%m.%Y")  # date when the article was published on the page
                    date_downloaded = todayDateStr  # date when the article was downloaded
                    # if the article is not yet saved in the database, add it
                    if sqlBase.getByHash(hashStr) is None:
                        # get article description/content
                        description = getArticleDescr(s, link)
                        # (date_created, caption, contents, date, hash, url, source)
                        entry = (date_created, title, description, date_downloaded, hashStr, link, SOURCE_ID)
                        sqlBase.insertOne(entry, True)  # insert the article in the database
                        articlesDownloaded += 1
                    if articlesChecked % 5 == 0:
                        logger.info("Checked: {} articles. Downloaded: {} new articles.".format(articlesChecked, articlesDownloaded))
            except Exception:
                logger.error("Url on which the error occurred: {}".format(resp.url))
                logger.exception("")
                sys.exit()
    logger.info("Downloaded {} new articles.".format(articlesDownloaded))
def main():
    articlesChecked = 0  # number of checked articles
    articlesDownloaded = 0  # number of downloaded articles
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    # optionally set headers for the http request
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }
    with requests.Session() as s:
        # retry every http/https request up to MAX_HTTP_RETRIES times before raising an error
        s.mount("http://", requests.adapters.HTTPAdapter(max_retries=MAX_HTTP_RETRIES))
        s.mount("https://", requests.adapters.HTTPAdapter(max_retries=MAX_HTTP_RETRIES))
        s.headers.update(HEADERS)  # set headers of the session
        for url in URLS_TO_CHECK:
            logger.info("Checking page: {}".format(url))
            resp = s.get(url)
            soup = bs.BeautifulSoup(resp.text, "html.parser")
            articles = soup.find("div", class_="freetext").find_all("li")
            for article in articles:
                articlesChecked += 1
                try:
                    link = BASE_URL + article.find("a")["href"]
                    title = article.text.strip()
                    hashStr = makeHash(title)
                    date_downloaded = todayDateStr  # date when the article was downloaded
                    date_created = None
                    # if the article is not yet saved in the database, add it
                    if sqlBase.getByHash(hashStr) is None:
                        # get article description/content
                        description = getArticleDescr(s, link)
                        # (date_created, caption, contents, date, hash, url, source)
                        entry = (date_created, title, description, date_downloaded, hashStr, link, SOURCE_ID)
                        sqlBase.insertOne(entry, True)  # insert the article in the database
                        articlesDownloaded += 1
                    if articlesChecked % 5 == 0:
                        logger.info("Checked: {} articles. Downloaded: {} new articles.".format(articlesChecked, articlesDownloaded))
                except Exception:
                    logger.error("Url on which the error occurred: {}".format(resp.url))
                    logger.exception("")
                    sys.exit()
    logger.info("Downloaded {} new articles.".format(articlesDownloaded))
def main():
    articlesChecked = 0  # number of checked articles
    articlesDownloaded = 0  # number of downloaded articles
    pageNum = 0
    pagesChecked = 0
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    yearInt = datetime.datetime.now().year
    # optionally set headers for the http request
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }
    with requests.Session() as s:
        # retry every http/https request up to MAX_HTTP_RETRIES times before raising an error
        s.mount("http://", requests.adapters.HTTPAdapter(max_retries=MAX_HTTP_RETRIES))
        s.mount("https://", requests.adapters.HTTPAdapter(max_retries=MAX_HTTP_RETRIES))
        s.headers.update(HEADERS)  # set headers of the session
        for yearNum in range(yearInt, MAX_YEAR - 1, -1):
            nextPageLink = ""
            pagelink = BASE_URL + "/" + str(yearNum)
            # the last page's "next" button points at javascript:void(0);
            while "javascript:void(0);" not in nextPageLink:
                try:
                    if not firstRunBool and pagesChecked >= NUM_PAGES_TO_CHECK:
                        break
                    pageNum += 1
                    pagesChecked += 1
                    resp = s.get(pagelink)
                    soup = bs.BeautifulSoup(resp.text, "html.parser")
                    articles = soup.find_all("article", class_="post")
                    for article in articles:
                        articlesChecked += 1
                        title = article.find("h2", class_="entry-title").text
                        link = article.find("h2", class_="entry-title").find("a")["href"]
                        dateStr = article.find("time", class_="entry-date").text
                        hashStr = makeHash(title, dateStr)
                        date_created = uniformDateStr(dateStr, "%d.%m.%Y")  # date when the article was published on the page
                        date_downloaded = todayDateStr  # date when the article was downloaded
                        # if the article is not yet saved in the database, add it
                        if sqlBase.getByHash(hashStr) is None:
                            # get article description/content
                            description = getArticleDescr(s, link)
                            # (date_created, caption, contents, date, hash, url, source)
                            entry = (date_created, title, description, date_downloaded, hashStr, link, SOURCE_ID)
                            sqlBase.insertOne(entry, True)  # insert the article in the database
                            articlesDownloaded += 1
                        if articlesChecked % 5 == 0:
                            logger.info("Checked: {} articles. Downloaded: {} new articles.".format(articlesChecked, articlesDownloaded))
                    nextPageLink = soup.find("div", class_="pagination loop-pagination").find_all("a")[1]["href"]
                    pagelink = nextPageLink
                except Exception:
                    logger.error("Url on which the error occurred: {}".format(resp.url))
                    logger.exception("")
                    sys.exit()
    logger.info("Downloaded {} new articles.".format(articlesDownloaded))