def main():
    """Check the latest listing pages and store previously-unseen articles.

    Hashes each article's title+date to detect duplicates; new articles are
    fetched, their content extracted, and a row is inserted via dbExecutor.
    Prints a summary line (new / checked / errors) when done.
    """
    print('=========================')
    print(sys.argv[0])
    print('=========================')

    num_new_articles = 0
    with requests.Session() as session:
        session.headers.update(headers)
        articles = get_articles_on_pages(num_pages_to_check, session)
        print('\tgathering article info ...')
        for article in tqdm(articles):
            title = get_title(article)
            date = get_date(article)
            hash_str = make_hash(title, date)
            # Skip anything we have already stored in a previous run.
            if not is_article_new(hash_str):
                continue
            link = get_link(article)
            content = get_content(link, session)
            row = (
                str(datetime.date.today()),
                title,
                content,
                formatDate(date),
                hash_str,
                link,
                SOURCE,
            )
            dbExecutor.insertOne(row)
            num_new_articles += 1
    print(num_new_articles, 'new articles found,', len(articles), 'articles checked,', num_errors, 'errors found\n')
def main():
    """Scan the listing pages and insert articles not seen before.

    The listing page carries no publication date, so the duplicate-detection
    hash is derived from the title plus base_url instead. New articles are
    downloaded, parsed with BeautifulSoup, and stored via dbExecutor.
    """
    print('=========================')
    print(sys.argv[0])
    print('=========================')

    num_new_articles = 0
    with requests.Session() as session:
        session.headers.update(headers)
        articles = get_articles_on_pages(num_pages_to_check, session)
        print('\tgathering articles ...')
        for entry in tqdm(articles):
            title = get_title(entry)
            date = get_date(entry)
            # The date is not available on the first page, so the hash is
            # built from base_url instead.
            hash_str = make_hash(title, base_url)
            if is_article_new(hash_str):
                link = get_link(entry)
                response = get_connection(link, session)
                page = bs(response.text, 'html.parser')
                content = get_content(page)
                row = (
                    str(datetime.date.today()),
                    title,
                    content,
                    date,
                    hash_str,
                    link,
                    SOURCE,
                )
                dbExecutor.insertOne(row)
                num_new_articles += 1
    print(num_new_articles, 'new articles found,', len(articles), 'articles checked,', num_errors, 'errors found\n')
def main():
    """Collect articles from the first n listing pages and store new ones.

    Duplicate detection uses a hash of title+date; each new article's content
    is fetched through the shared session and inserted via dbExecutor.
    """
    print('=========================')
    print(sys.argv[0])
    print('=========================')

    num_new_articles = 0
    with requests.Session() as session:
        articles = getArticlesOn_n_pages(num_pages_to_check)
        print('\tgathering article info')
        for item in tqdm(articles):
            title = getTitle(item)
            date = getDate(item)
            hash_str = makeHash(title, date)
            # Only articles whose hash has not been stored yet are fetched.
            if not is_article_new(hash_str):
                continue
            link = getLink(item)
            content = getContent(link, session)
            row = (
                str(datetime.date.today()),
                title,
                content,
                date,
                hash_str,
                link,
                SOURCE,
            )
            dbExecutor.insertOne(row)
            num_new_articles += 1
    print(num_new_articles, 'new articles found', len(articles), 'articles checked,', num_errors, 'errors found')
def main():
    """Walk the listing pages, parse each unseen article, and store it.

    Duplicate detection hashes the title only. New articles are downloaded,
    parsed with BeautifulSoup, and written to the database via dbExecutor;
    a new / checked / errors summary is printed at the end.
    """
    print('=========================')
    print(sys.argv[0])
    print('=========================')

    num_new_articles = 0
    with requests.Session() as session:
        session.headers.update(headers)
        articles = getArticlesOn_n_pages(num_pages_to_check, session)
        articles_checked = len(articles)
        print('\tgathering article info ...')
        for entry in tqdm(articles):
            title = getTitle(entry)
            date = getDate(entry)
            # NOTE(review): the hash here covers the title only — presumably
            # because the date is unreliable on the listing page; confirm.
            hash_str = makeHash(title)
            if not is_article_new(hash_str):
                continue
            link = getLink(entry)
            response = get_connection(link, session)
            page = bs(response.text, 'html.parser')
            content = getContent(page)
            row = (
                str(datetime.date.today()),
                title,
                content,
                formatDate(date),
                hash_str,
                link,
                SOURCE,
            )
            dbExecutor.insertOne(row)
            num_new_articles += 1
    print(num_new_articles, 'new articles found,', articles_checked, 'articles checked', num_errors, 'errors found\n')