def getClanki(page_soup, driver):
    noviClankiHtml = page_soup.findAll("div", class_="art-post-inner art-article")
    clanki = []
    # collect the new articles from the front page
    for clanek in noviClankiHtml:
        title = getTitle(clanek)
        date = getDate(clanek)
        content = getVsebina(clanek)
        source = my_url
        hash = makeHash(title, date)
        # stop as soon as an article is already in the database
        if db.getByHash(hash):
            return clanki
        clanki.append((str(datetime.date.today()), title, content, date, hash, my_url, source))
    # collect the older articles linked from the page
    oldClankiLinks = getLinks(page_soup)
    if oldClankiLinks is not NOT_FOUND:
        for link in oldClankiLinks:
            html = loadOldPage(link, driver)
            clanek = soup(html, "html.parser")
            title = getTitle(clanek)
            date = getDate(clanek)
            content = getVsebina(clanek)
            source = link
            hash = makeHash(title, date)
            if db.getByHash(hash):
                return clanki
            clanki.append((str(datetime.date.today()), title, content, date, hash, my_url, source))
    return clanki
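# All of the scrapers in this collection deduplicate on makeHash(title, date).
# The helper itself is not part of this excerpt; a minimal sketch, assuming a
# SHA-256 digest of the concatenated title and date, could look like this:
import hashlib

def makeHash(title, date):
    # one stable fingerprint per (title, date) pair, used as the dedup key
    return hashlib.sha256((str(title) + str(date)).encode("utf-8")).hexdigest()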
def main():
    driver = initDriver()
    html = fullyLoadPage(my_url, driver)
    i = 0
    while html is NOT_FOUND and MAX_HTTP_RETRIES >= i:
        html = fullyLoadPage(my_url, driver)
        i += 1
    page_soup = soup(html, "html.parser")
    # process every article on the page
    try:
        clanki = page_soup.findAll("div", class_="tl-entry-flex")
        clanki = filterAds(clanki)
        novice = []
        count = 0
        for clanek in clanki:
            date = getDate(clanek)
            title = getTitle(clanek)
            hash = makeHash(title, date)
            # stop at the first article that is already stored
            if db.getByHash(hash):
                break
            content = getContent(clanek)
            source = getSource(clanek)
            count += 1
            data = (str(datetime.date.today()), title, content, date, hash, my_url, source)
            novice.append(data)
        if len(novice) > 0:
            db.insertMany(novice)
            print("Najdenih " + str(count) + " novih clankov")
        else:
            print('Ni najdenih novih clankov')
        driver.close()
    except:
        print("Error pri obdelavi clankov")
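# initDriver() is assumed to return a ready Selenium WebDriver. A hedged
# sketch using headless Chrome; the project's actual browser and options may
# differ:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def initDriver():
    options = Options()
    options.add_argument("--headless")  # no visible browser window on the server
    return webdriver.Chrome(options=options)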
def getStariClanki(driver):
    try:
        driver.get(arhiv_novic_url)
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'vsebina')))
        time.sleep(3)
        html = driver.execute_script(
            "return document.documentElement.outerHTML")
        page_soup = soup(html, "html.parser")
        clankihtml = page_soup.find_all('div', class_="novica")
        clanki = []
        for clanek in clankihtml:
            title = getTitle(clanek)
            date = getDate(clanek)
            hash = makeHash(title, date)
            if db.getByHash(hash):
                return NOT_FOUND
            content = getContent(clanek)
            source = arhiv_novic_url
            clanek = (str(datetime.date.today()), title, content, date, hash, my_url, source)
            clanki.append(clanek)
        if len(clanki) < 1:
            return NOT_FOUND
        else:
            return clanki
    except:
        return NOT_FOUND
def getClanek(driver, clanek):
    try:
        link = clanek.find("a")["href"]
        url = my_url + str(link)
        driver.get(url)
        try:
            WebDriverWait(driver, 6).until(
                EC.visibility_of_element_located(
                    (By.CLASS_NAME, "itemFullText")))
        except:
            return NOT_FOUND
        html = driver.execute_script(
            "return document.documentElement.outerHTML")
        openedClanek = soup(html, "html.parser")
        title = getTitle(openedClanek)
        date = getDate(openedClanek)
        content = getContent(openedClanek)
        hash = makeHash(title, date)
        if db.getByHash(hash):
            return NOT_FOUND
        source = str(url)
        novica = (str(datetime.date.today()), title, content, date, hash, my_url, source)
        return novica
    except:
        return NOT_FOUND
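# The functions above lean on a shared set of imports, a NOT_FOUND sentinel and
# a retry cap, none of which appear in this excerpt. The values below are
# assumptions based on how the names are used (the retry comment in the second
# main() below mentions trying up to 10 times):
import time
import datetime
from bs4 import BeautifulSoup as soup
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

NOT_FOUND = None        # sentinel returned when a page or article could not be read
MAX_HTTP_RETRIES = 10   # how many times a failed page load is retried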
def main():
    driver = initDriver()
    html = loadPage(my_url, driver, 1)
    i = 0
    # if the articles did not load successfully, try up to 10 times
    while i < MAX_HTTP_RETRIES and html is NOT_FOUND:
        html = loadPage(my_url, driver, 1)
        i += 1
    NOVICE = []
    STEVILO_VSEH_STRANI = getSteviloVsehStrani(my_url, driver)
    '''
    Currently this walks through all articles and stops at the first one that
    already exists in the database. For testing it is best to replace
    STEVILO_VSEH_STRANI with a small number so that it does not load every
    article, because there are a huge number of them.
    '''
    for x in range(1, 3):
        i = 0
        html = loadPage(my_url, driver, x)
        while i < MAX_HTTP_RETRIES and html is NOT_FOUND:
            html = loadPage(my_url, driver, x)
            i += 1
        page_soup = soup(html, "html.parser")
        clanki = page_soup.find("ul", class_="articles").findAll("li", class_="item bigger")
        count = 0
        # print("PAGE " + str(x) + "**************************")
        done = False
        for clanek in clanki:
            title = getTitle(clanek)
            content = getContent(clanek)
            date = getDate(clanek)
            source = getSource(clanek)
            hash = makeHash(title, date)
            if content is NOT_FOUND and title is NOT_FOUND:
                continue
            if db.getByHash(hash):
                done = True
                break
            data = (str(datetime.date.today()), title, content, date, hash, my_url, source)
            NOVICE.append(data)
            # print("Datum: " + str(date))
            # print("Naslov: " + str(title))
            # print("Vsebina: " + str(content))
            # print("Source: " + str(source))
            # print("Hash: " + str(hash))
            # print("-------------------------------------------------------")
            count += 1
        if done:
            break
    db.insertMany(NOVICE)
    # print(count)
    # print("STEVILO_VSEH_STRANI: " + str(STEVILO_VSEH_STRANI))
    driver.close()
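# loadPage(url, driver, page) returns the rendered HTML of one listing page or
# NOT_FOUND, which is what the retry loops above check for. A sketch under the
# assumption that the page index is passed as a ?page= query parameter; the
# real pagination scheme is site-specific:
def loadPage(url, driver, page):
    try:
        driver.get(url + "?page=" + str(page))
        WebDriverWait(driver, 6).until(
            EC.presence_of_element_located((By.CLASS_NAME, "articles")))
        return driver.execute_script("return document.documentElement.outerHTML")
    except Exception:
        return NOT_FOUND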
def getClanek(clanekHtml):
    try:
        title = getTitle(clanekHtml)
        date = getDate(clanekHtml)
        hash = makeHash(title, date)
        if db.getByHash(hash):
            return NOT_FOUND
        content = getContent(clanekHtml)
        source = getSource(clanekHtml)
        clanek = (str(datetime.date.today()), title, content, date, hash, my_url, source)
        return clanek
    except:
        return NOT_FOUND
def getClanek(driver, link):
    try:
        driver.get(link)
        time.sleep(3)
        html = driver.execute_script(
            "return document.documentElement.outerHTML")
        openedClanek = soup(html, "html.parser")
        date = getDate(openedClanek)
        title = getTitle(openedClanek)
        content = getContent(openedClanek)
        source = str(link)
        hash = makeHash(title, date)
        if db.getByHash(hash):
            return NOT_FOUND
        clanek = (str(datetime.date.today()), title, content, date, hash, my_url, source)
        return clanek
    except:
        return NOT_FOUND
def getClanki(page_soup):
    try:
        clanki = page_soup.find_all('article', {"data-tpl": "content"})
        novice = []
        for clanek in clanki:
            title = getTitle(clanek)
            date = getDate(clanek)
            hash = makeHash(title, date)
            if db.getByHash(hash):
                break
            content = getContent(clanek)
            source = getSource(clanek)
            novica = (str(datetime.date.today()), title, content, date, hash, my_url, source)
            novice.append(novica)
        if len(novice) < 1:
            return NOT_FOUND
        else:
            return novice
    except:
        return NOT_FOUND
def getNoviClanki(page_soup):
    try:
        noviClankiHtml = page_soup.findAll("div", class_="novica")
        date = ""
        title = ""
        content = ""
        source = ""
        hash = ""
        clanki = []
        for clanek in noviClankiHtml:
            date = getDate(clanek)
            try:
                title = str(clanek.find('h2').find('a').text).strip()
            except:
                title = NOT_FOUND
            try:
                content = getContent(clanek)
            except:
                content = NOT_FOUND
            hash = makeHash(title, date)
            if db.getByHash(hash):
                return NOT_FOUND
            try:
                source = my_url + str(clanek.find('h2').find('a')['href'])
            except:
                source = NOT_FOUND
            novica = (str(datetime.date.today()), title, content, date, hash, my_url, source)
            clanki.append(novica)
        if len(clanki) < 1:
            return NOT_FOUND
        else:
            return clanki
    except:
        return NOT_FOUND
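# Every scraper calls db.getByHash(hash) to stop at the first already-stored
# article and db.insertMany(novice) to persist the new ones. A minimal sketch
# of such a db module, assuming SQLite and a table whose columns mirror the
# 7-tuples built above; the real schema and storage backend are not shown here:
import sqlite3

_conn = sqlite3.connect("articles.db")
_conn.execute("""CREATE TABLE IF NOT EXISTS articles
                 (caught TEXT, title TEXT, content TEXT, date TEXT,
                  hash TEXT UNIQUE, base_url TEXT, source TEXT)""")

def getByHash(hash_str):
    # returns the stored row for this hash, or None when the article is new
    cur = _conn.execute("SELECT * FROM articles WHERE hash = ?", (hash_str,))
    return cur.fetchone()

def insertMany(rows):
    # rows is a list of 7-tuples shaped like the ones built in the scrapers above
    _conn.executemany("INSERT OR IGNORE INTO articles VALUES (?, ?, ?, ?, ?, ?, ?)", rows)
    _conn.commit()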
def is_article_new(hash_str):
    if dbExecutor.getByHash(hash_str):
        return False
    return True
def is_article_new(hash_str):
    if dbExecutor.getByHash(hash_str):
        return False
    print('new article found')
    return True
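# is_article_new wraps the same dbExecutor.getByHash check that the scrapers
# above perform inline. An illustrative (hypothetical) helper showing how it
# would slot into one of those loops; getTitle, getDate, getContent, getSource
# and makeHash follow the conventions used above:
def collect_new(clanki):
    # stops at the first already-stored article, mirroring the loops above
    novice = []
    for clanek in clanki:
        title = getTitle(clanek)
        date = getDate(clanek)
        hash = makeHash(title, date)
        if not is_article_new(hash):
            break
        novice.append((str(datetime.date.today()), title, getContent(clanek),
                       date, hash, my_url, getSource(clanek)))
    return novice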