def main():
    num_pages_to_check = 1
    num_new_articles = 0
    articles_checked = 0
    with requests.Session() as session:
        session.headers.update(headers)
        articles = get_articles_on_pages(num_pages_to_check, session)
        articles_checked = len(articles)
        new_articles_tuples = []
        for x in articles:
            title = get_title(x)
            hash_str = make_hash(title, base_url)  # the date is not on the front page, so the hash is built from base_url instead
            if is_article_new(hash_str):
                link = get_link(x)
                r = session.get(link, timeout=8)
                soup = bs(r.text, 'html.parser')
                date = get_date(soup)
                content = get_content(soup)
                print(link + '\n')
                new_tup = (str(datetime.date.today()), title, content, format_date(date), hash_str, link, SOURCE)
                new_articles_tuples.append(new_tup)
                num_new_articles += 1
        # add the new articles to the database
        dbExecutor.insertMany(new_articles_tuples)
        print(num_new_articles, 'new articles found,', articles_checked, 'articles checked')
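# The helpers make_hash and is_article_new are used above but not defined in this
# section. A minimal sketch of what they might look like, assuming the hash is an
# MD5 digest of the concatenated identifying fields and that dbExecutor exposes a
# getByHash lookup returning a truthy row when the hash already exists (both are
# assumptions, not the project's actual code):
import hashlib

def make_hash(*parts):
    # join the identifying fields (e.g. title and base_url) and digest them,
    # so the same article always maps to the same database key
    return hashlib.md5(''.join(parts).encode('utf-8')).hexdigest()

def is_article_new(hash_str):
    # an article is new if no stored row carries the same hash
    return not dbExecutor.getByHash(hash_str)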
def main():
    print('=========================')
    print(sys.argv[0])
    print('=========================')
    num_new_articles = 0
    with requests.Session() as session:
        session.headers.update(headers)
        articles = getArticlesOnPage(num_articles_to_check, session)
        articles_tuples = []
        print('\tgathering article info ...')
        for x in tqdm(articles):
            title = getTitle(x)
            date = getDate(x)
            hash_str = makeHash(title, date)
            if is_article_new(hash_str):
                link = getLink(x)
                content = getContent(link, session)
                tup = (str(datetime.date.today()), title, content, formatDate(date), hash_str, link, SOURCE)
                articles_tuples.append(tup)
                num_new_articles += 1
        dbExecutor.insertMany(articles_tuples)
        print(num_new_articles, 'new articles found,', len(articles), 'articles checked,', num_errors, 'errors\n')
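# formatDate is called before almost every insert but is not shown in this section.
# A sketch of one plausible implementation, assuming the scraped dates look like
# '1. 2. 2020' (day. month. year, common on Slovenian news sites) and that the
# database expects ISO 8601 strings; the real per-site parsing rules may differ:
import datetime

def formatDate(raw_date):
    # '1. 2. 2020' -> '2020-02-01'
    day, month, year = (int(p) for p in raw_date.replace(' ', '').rstrip('.').split('.'))
    return datetime.date(year, month, day).isoformat()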
def main():
    num_new_articles = 0
    num_pages_to_check = 2
    with requests.Session() as session:
        articles = getArticlesOn_n_pages(num_pages_to_check)
        titles = []
        dates = []
        links = []
        hashes = []
        for x in articles:
            title = getTitle(x)
            date = getDate(x)
            hash_str = makeHash(title, date)
            if isArticleNew(hash_str):
                titles.append(title)
                dates.append(date)
                hashes.append(hash_str)
                links.append(getLink(x))
                num_new_articles += 1
        list_new = []
        for i in range(num_new_articles):
            content = getContent(links[i], session)
            tup = (str(datetime.date.today()), titles[i], content, formatDate(dates[i]), hashes[i], links[i], base_url)
            list_new.append(tup)
        dbExecutor.insertMany(list_new)
def main():
    num_pages_to_check = 2
    num_new_articles = 0
    articles_checked = 0
    with requests.Session() as session:
        session.headers.update(headers)
        articles = get_articles_on_pages(num_pages_to_check, session)
        articles_checked = len(articles)
        new_articles_tuples = []
        for x in articles:
            title = get_title(x)
            date = get_date(x)
            hash_str = make_hash(title, date)
            if is_article_new(hash_str):
                link = get_link(x)
                r = session.get(link, timeout=8)  # use the session (with its headers) instead of a bare requests.get
                soup = bs(r.text, 'html.parser')
                content = get_content(soup)
                print(link + '\n')
                new_tup = (str(datetime.date.today()), title, content, date, hash_str, link, base_url)
                new_articles_tuples.append(new_tup)
                num_new_articles += 1
        # add the new articles to the database
        dbExecutor.insertMany(new_articles_tuples)
        print(num_new_articles, 'new articles found,', articles_checked, 'articles checked')
def main():
    driver = initDriver()
    html = fullyLoadPage(my_url, driver)
    i = 0
    while html is NOT_FOUND and i <= MAX_HTTP_RETRIES:
        html = fullyLoadPage(my_url, driver)
        i += 1
    page_soup = soup(html, "html.parser")
    # process every article
    try:
        clanki = page_soup.findAll("div", class_="tl-entry-flex")
        clanki = filterAds(clanki)
        novice = []
        count = 0
        for clanek in clanki:
            date = getDate(clanek)
            title = getTitle(clanek)
            hash = makeHash(title, date)
            if db.getByHash(hash):
                break
            content = getContent(clanek)
            source = getSource(clanek)
            count += 1
            data = (str(datetime.date.today()), title, content, date, hash, my_url, source)
            novice.append(data)
        if len(novice) > 0:
            db.insertMany(novice)
            print("Found " + str(count) + " new articles")
        else:
            print('No new articles found')
    except Exception:
        print("Error while processing articles")
    finally:
        driver.close()  # close the driver even when processing fails
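# Several of the Selenium scrapers repeat the same retry loop around
# fullyLoadPage/loadPage. A small helper that captures the pattern; this is a
# refactoring sketch, not code that exists in the repo:
def load_with_retries(loader, *args, retries=MAX_HTTP_RETRIES):
    # call the page loader until it returns real HTML or the retry budget runs out
    html = loader(*args)
    attempts = 0
    while html is NOT_FOUND and attempts < retries:
        html = loader(*args)
        attempts += 1
    return html

# usage: html = load_with_retries(fullyLoadPage, my_url, driver)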
def main():
    num_pages_to_check = 2
    num_new_articles = 0
    with requests.Session() as session:
        session.headers.update(headers)
        articles = getArticlesOn_n_pages(num_pages_to_check, session)
        articles_checked = len(articles)
        dates = []
        titles = []
        hashes = []
        links = []
        for x in articles:
            title = getTitle(x)
            date = getDate(x)
            hash = makeHash(title, date)
            if isArticleNew(hash):
                titles.append(title)
                dates.append(date)
                hashes.append(hash)
                links.append(getLink(x))
                num_new_articles += 1
        list_of_tuples = []
        for i in range(len(links)):
            content = getContent(links[i], session)
            tup = (str(datetime.date.today()), titles[i], content, dates[i], hashes[i], links[i], base_url)
            list_of_tuples.append(tup)
        dbExecutor.insertMany(list_of_tuples)
        print(num_new_articles, 'new articles found,', num_pages_to_check, 'pages checked -', articles_checked, 'articles checked')
def main():
    num_articles_to_check = 20
    num_new_articles = 0
    with requests.Session() as session:
        session.headers.update(headers)
        articles = getArticlesOn_n_pages(num_articles_to_check, session)
        new_articles_tuples = []
        for x in articles:
            title = getTitle(x)
            date = getDate(x)
            hash_str = makeHash(title, date)
            if isArticleNew(hash_str):
                link = getLink(x)
                content = getContent(link, session)
                if not content:
                    # fall back to the teaser paragraph on the listing page
                    content = x.find('p').text
                num_new_articles += 1
                new_articles_tuples.append((str(datetime.date.today()), title, content, formatDate(date), hash_str, link, base_url))
        dbExecutor.insertMany(new_articles_tuples)
        print(num_new_articles, 'new articles found,', num_articles_to_check, 'articles checked')
def main():
    driver = initDriver()
    clanki = getClanki(driver)
    if clanki != NOT_FOUND:
        db.insertMany(clanki)
    else:
        print('No new articles found')
    driver.close()
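# initDriver is shared by all the Selenium scrapers but not defined in this
# section. A minimal sketch assuming headless Chrome via selenium; the real
# helper may configure a different browser or extra options:
from selenium import webdriver

def initDriver():
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # run without opening a browser window
    return webdriver.Chrome(options=options)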
def main():
    driver = initDriver()
    html = loadPage(my_url, driver, 1)
    i = 0
    # if the articles failed to load, retry up to MAX_HTTP_RETRIES (10) times
    while i < MAX_HTTP_RETRIES and html is NOT_FOUND:
        html = loadPage(my_url, driver, 1)
        i += 1
    NOVICE = []
    STEVILO_VSEH_STRANI = getSteviloVsehStrani(my_url, driver)
    '''
    Currently this walks through all articles and stops at the first one that
    already exists. For testing it is best to replace STEVILO_VSEH_STRANI with
    some small number so it does not load every article, because there are a
    huge number of them.
    '''
    for x in range(1, 3):
        i = 0
        html = loadPage(my_url, driver, x)
        while i < MAX_HTTP_RETRIES and html is NOT_FOUND:
            html = loadPage(my_url, driver, x)
            i += 1
        page_soup = soup(html, "html.parser")
        clanki = page_soup.find("ul", class_="articles").findAll("li", class_="item bigger")
        count = 0
        done = False
        for clanek in clanki:
            title = getTitle(clanek)
            content = getContent(clanek)
            date = getDate(clanek)
            source = getSource(clanek)
            hash = makeHash(title, date)
            if content is NOT_FOUND and title is NOT_FOUND:
                continue
            if db.getByHash(hash):
                done = True
                break
            else:
                data = (str(datetime.date.today()), title, content, date, hash, my_url, source)
                NOVICE.append(data)
                count += 1
        if done:
            break
    db.insertMany(NOVICE)
    driver.close()
def main(): driver = initDriver() html = fullyLoadPage(my_url, driver) i = 0 while i < MAX_HTTP_RETRIES and html is NOT_FOUND: html = fullyLoadPage(my_url, driver) i += 1 page_soup = soup(html, "html.parser") clanki = getClanki(page_soup,driver) db.insertMany(clanki) driver.close()
def main():
    driver = initDriver()
    html = fullyLoadPage(my_url, driver)
    page_soup = soup(html, "html.parser")
    clanki = page_soup.findAll("article")
    NOVICE = []
    for clanek in clanki:
        novica = getClanek(driver, clanek)
        if novica is not NOT_FOUND:
            NOVICE.append(novica)
    db.insertMany(NOVICE)
    driver.close()
def main():
    driver = initDriver()
    html = loadPage(driver)
    i = 0
    while i < MAX_HTTP_RETRIES and html is NOT_FOUND:
        html = loadPage(driver)
        i += 1
    page_soup = soup(html, "html.parser")
    clanki = getClanki(page_soup)
    if clanki != NOT_FOUND:
        db.insertMany(clanki)
    else:
        print('No new articles found')
    driver.close()
def main():
    print('=========================')
    print(sys.argv[0])
    print('=========================')
    num_new_articles = 0
    num_pages_to_check = 3
    driver = initDriver()
    articles = getArticlesOn_n_pages(num_pages_to_check, driver)
    driver.quit()
    titles = []
    dates = []
    links = []
    hashes = []
    print('\tgathering article info ...')
    for x in tqdm(articles):
        title = getTitle(x)
        date = getDate(x)
        hash_str = makeHash(title, date)
        if is_article_new(hash_str):
            titles.append(title)
            dates.append(date)
            hashes.append(hash_str)
            links.append(getLink(x))
            num_new_articles += 1
    new_articles_tuples = []
    driver = initDriver()
    print('\tgathering article content ...')
    for i in tqdm(range(num_new_articles)):
        content = getContent(links[i], driver)
        new_articles_tuples.append(
            (str(datetime.date.today()), titles[i], content, formatDate(dates[i]), hashes[i], links[i], SOURCE))
    driver.quit()
    dbExecutor.insertMany(new_articles_tuples)
    print(num_new_articles, 'new articles found,', len(articles), 'articles checked,', num_errors, 'errors found\n')
def main(): driver = initDriver() html = fullyLoadPage(my_url, driver) i = 0 while i < MAX_HTTP_RETRIES and html is NOT_FOUND: html = fullyLoadPage(my_url, driver) i += 1 page_soup = soup(html, "html.parser") clanki = page_soup.findAll("div", class_="card_article") links = getLinks(clanki) NOVICE = [] for link in links: novica = getClanek(driver, link) if novica is not NOT_FOUND: NOVICE.append(novica) db.insertMany(NOVICE) driver.close()
def main():
    driver = initDriver()
    html = loadFirstPage(my_url, driver)
    i = 0
    while i < MAX_HTTP_RETRIES and html is NOT_FOUND:
        html = loadFirstPage(my_url, driver)
        i += 1
    page_soup = soup(html, "html.parser")
    noviClanki = getNoviClanki(page_soup)
    stariClanki = getStariClanki(driver)
    if noviClanki != NOT_FOUND and stariClanki != NOT_FOUND:
        vsiClanki = noviClanki + stariClanki
        db.insertMany(vsiClanki)
    else:
        print('No new articles found')
    driver.close()
def main():
    num_new_articles = 0
    num_pages_to_check = 3
    driver = initDriver()
    articles = getArticlesOn_n_pages(num_pages_to_check, driver)
    driver.quit()
    titles = []
    dates = []
    links = []
    hashes = []
    for x in articles:
        title = getTitle(x)
        date = getDate(x)
        hash_str = makeHash(title, date)
        if isArticleNew(hash_str):
            titles.append(title)
            dates.append(date)
            hashes.append(hash_str)
            links.append(getLink(x))
            num_new_articles += 1
    new_articles_tuples = []
    driver = initDriver()
    for i in range(num_new_articles):
        content = getContent(links[i], driver)
        new_articles_tuples.append(
            (str(datetime.date.today()), titles[i], content, formatDate(dates[i]), hashes[i], links[i], base_url))
    driver.quit()
    dbExecutor.insertMany(new_articles_tuples)
    print(num_new_articles, 'new articles found,', num_pages_to_check, 'pages checked')
def main():
    num_pages_to_check = 1
    num_new_articles = 0
    with requests.Session() as session:
        session.headers.update(headers)
        articles = getArticlesOn_n_pages(num_pages_to_check, session)
        articles_checked = len(articles)
        titles = []
        hashes = []
        links = []
        for x in articles:
            title = getTitle(x)
            hash_str = makeHash(title)
            if is_article_new(hash_str):
                titles.append(title)
                hashes.append(hash_str)
                links.append(getLink(x))
                num_new_articles += 1
        list_of_tuples = []
        for i in range(len(links)):
            print(links[i])
            r = session.get(links[i], timeout=10)
            soup = BeautifulSoup(r.text, 'html.parser')
            content = getContent(soup)
            date = getDate(soup)
            tup = (str(datetime.date.today()), titles[i], content, formatDate(date), hashes[i], links[i], base_url)
            list_of_tuples.append(tup)
        dbExecutor.insertMany(list_of_tuples)
        print(num_new_articles, 'new articles found,', num_pages_to_check, 'pages checked -', articles_checked, 'articles checked')
def main():
    num_pages_to_check = 3
    num_new_articles = 0
    with requests.Session() as session:
        session.headers.update(headers)
        articles = getArticlesOnPage(num_pages_to_check, session)
        dates = []
        titles = []
        hashes = []
        links = []
        for x in articles:
            title = getTitle(x)
            date = getDate(x)
            hash = makeHash(title, date)
            if isArticleNew(hash):
                titles.append(title)
                dates.append(date)
                hashes.append(hash)
                links.append(getLink(x))
                num_new_articles += 1
        new_articles_tuples = []
        for i in range(len(links)):
            # TODO: fix this here so it gets inserted into the database
            # collapse whitespace in the article body before storing it
            content = ' '.join(getContent(links[i], session).split())
            tup = (str(datetime.date.today()), titles[i], content, formatDate(dates[i]), hashes[i], links[i], base_url)
            new_articles_tuples.append(tup)
        dbExecutor.insertMany(new_articles_tuples)
        print(num_new_articles, 'new articles found,', num_pages_to_check, 'pages checked')
def main():
    num_pages_to_check = 1
    num_new_articles = 0
    articles_checked = 0
    with requests.Session() as session:
        session.headers.update(headers)
        if firstRunBool:
            maxPageNum = getMaxPageNum(session)
            print("Checking {} pages".format(maxPageNum))
            num_pages_to_check = maxPageNum
        articles = get_articles_on_pages(num_pages_to_check, session)
        articles_checked = len(articles)
        new_articles_tuples = []
        for x in articles:
            title = get_title(x)
            date = get_date(x)
            hash_str = make_hash(title, date)
            if is_article_new(hash_str):
                link = get_link(x)
                r = session.get(link, timeout=8)
                soup = bs(r.text, 'html.parser')
                content = get_content(soup)
                print(link + '\n')
                new_tup = (str(datetime.date.today()), title, content, date, hash_str, link, SOURCE)
                new_articles_tuples.append(new_tup)
                num_new_articles += 1
        # add the new articles to the database
        dbExecutor.insertMany(new_articles_tuples)
        print(num_new_articles, 'new articles found,', articles_checked, 'articles checked')
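# Every variant above ends by handing its 7-field tuples to dbExecutor.insertMany
# (or db.insertMany). A sketch of what that call might do, assuming a sqlite3
# backend and a hypothetical 'articles' table whose columns mirror the tuple
# (caught_date, title, content, article_date, hash, url, source) -- both the
# schema and the backend are assumptions, not the project's actual code:
import sqlite3

def insertMany(tuples, db_path='articles.db'):
    if not tuples:
        return  # nothing new to store
    with sqlite3.connect(db_path) as conn:
        conn.executemany(
            'INSERT INTO articles '
            '(caught_date, title, content, article_date, hash, url, source) '
            'VALUES (?, ?, ?, ?, ?, ?, ?)',
            tuples)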