            # let it wait in-between clicks
            # driver.implicitly_wait(2)
        except Exception as e:
            print(f'Exception {e}\noccurred when loading more articles in the politics section of Kathimerini.')
            continue

    news_soup = BeautifulSoup(driver.page_source, 'html.parser')
    # after splitting on 'href="', every chunk starts with a URL; cut it at the closing quote
    article_links = [
        link.split('"')[0]
        for link in str(news_soup.find_all('div', class_='article_thumbnail_wrapper')).split('href="')[1:]
    ]
    return article_links


if __name__ == "__main__":
    article_links = kathimerini_article_links()
    links_df = pd.DataFrame(article_links)
    links_df.to_csv('data/kathimerini_links.csv', index=False)
    save_articles_in_parts(links_df, article_parser=kathimerini_article_parser, media_name='kathimerini')
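# --- a minimal sketch of the Selenium setup the Kathimerini snippet assumes ---
# The fragment above relies on an already-initialised `driver` whose "load more"
# button has been clicked in a loop. The section URL and the button selector
# below are illustrative assumptions, not the confirmed markup of the live site.
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

driver = webdriver.Chrome()
driver.get('https://www.kathimerini.gr/politics/')  # assumed section URL
for _ in range(100):  # click "load more" enough times to cover the period of interest
    try:
        driver.find_element(By.CSS_SELECTOR, 'a.load-more').click()  # hypothetical selector
        time.sleep(2)  # let it wait in-between clicks
    except Exception:
        break  # nothing left to click; all available articles are loaded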
    # searching in a predefined range, to get links of up to a year back
    for page_id in tqdm(range(360), total=360):
        try:
            news_link = 'https://kontranews.gr/politiki?page=' + str(page_id)
            response = get(news_link)
            if response.status_code == 200:
                news_soup = BeautifulSoup(response.text, 'html.parser')
                article_links += [
                    'https://kontranews.gr/' + str(link).split('href=')[1].split('"')[1]
                    for link in news_soup.find_all('div', class_='post-link hidden')
                ]
            else:
                break
        except Exception as e:
            print(e)
    return article_links


if __name__ == "__main__":
    article_links = kontra_article_links()
    links_df = pd.DataFrame(article_links)
    links_df.to_csv('data/kontra_links.csv', index=False)
    save_articles_in_parts(links_df, article_parser=kontra_article_parser, media_name='kontra')
        try:
            driver.execute_script("window.scrollTo(0, {screen_height}*{i});".format(
                screen_height=screen_height, i=i))
            i += 1
            time.sleep(scroll_pause_time)
            # the scroll height could be re-read after each scroll, as it can change once new content loads:
            # scroll_height = driver.execute_script("return document.body.scrollHeight;")
            # practically, scroll until no more scrolling is possible
        except Exception as e:
            print(f'Exception {e}\noccurred when loading more articles in the politics section of ProtoThema.')
            continue

    # create a BeautifulSoup object from the articles loaded in the page
    news_soup = BeautifulSoup(driver.page_source, 'html.parser')
    # keep only spans that carry an href and whose title date falls in 2020
    article_links = [
        str(link).split('href="')[1].split('">')[0]
        for link in news_soup.find_all('span', class_='update_well')
        if 'href' in str(link)
        and '2020' in str(link).split('" title="')[1].split(',')[0]
    ]
    return article_links


if __name__ == "__main__":
    article_links = protothema_article_links()
    links_df = pd.DataFrame(article_links)
    links_df.to_csv('data/protothema_links.csv', index=False)
    save_articles_in_parts(links_df, article_parser=protothema_article_parser, media_name='protothema')
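# --- a minimal sketch of the setup the ProtoThema scroll loop assumes ---
# `driver`, `screen_height`, `scroll_pause_time`, and the counter `i` are defined
# earlier in the original script; the URL and values here are assumptions.
from selenium import webdriver
import time

driver = webdriver.Chrome()
driver.get('https://www.protothema.gr/politics/')  # assumed section URL
screen_height = driver.execute_script("return window.screen.height;")
scroll_pause_time = 1  # give lazy-loaded articles a moment to render
i = 1
# the try/except above then runs inside a loop (e.g. `while True:`) that scrolls
# one screen height at a time until window.scrollTo reveals no new content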
        try:
            # the link points to the search page; the encoded query is the Greek
            # word for politics (see the quote() sketch below)
            news_link = 'https://www.skai.gr/s/%CE%A0%CE%9F%CE%9B%CE%99%CE%A4%CE%99%CE%9A%CE%97?page=' + str(page_id)
            response = get(news_link)
            if response.status_code == 200:
                news_soup = BeautifulSoup(response.text, 'html.parser')
                # not all returned articles are political; keep only those that are
                article_links += [
                    link['href']
                    for link in news_soup.find_all('a', class_='title mainLink', href=True)
                    if 'politics' in link['href']
                ]
            else:
                break
        except Exception as e:
            print(e)
    return article_links


if __name__ == "__main__":
    article_links = skai_article_links()
    links_df = pd.DataFrame(article_links)
    links_df.to_csv('data/skai_links.csv', index=False)
    save_articles_in_parts(links_df, article_parser=skai_article_parser, media_name='skai')
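# --- building the Skai query without hardcoding the percent-encoding ---
# The encoded string above is simply 'ΠΟΛΙΤΙΚΗ' (Greek for POLITICS) in UTF-8;
# urllib.parse.quote produces the same bytes and keeps the URL readable:
from urllib.parse import quote

page_id = 0
news_link = f'https://www.skai.gr/s/{quote("ΠΟΛΙΤΙΚΗ")}?page={page_id}'
# quote('ΠΟΛΙΤΙΚΗ') -> '%CE%A0%CE%9F%CE%9B%CE%99%CE%A4%CE%99%CE%9A%CE%97'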
    article_links = []
    # hardcoded page range: we want to keep only data for the past year
    for page_id in tqdm(range(360), total=360):
        try:
            news_link = 'https://www.tovima.gr/category/politics/page/' + str(page_id)
            response = get(news_link)
            if response.status_code == 200:
                news_soup = BeautifulSoup(response.text, 'html.parser')
                article_links += [
                    link['href']
                    for link in news_soup.find_all('a', class_='zonabold twenty black-c article-main', href=True)
                ]
            else:
                break
        except Exception as e:
            print(e)
    return article_links


if __name__ == "__main__":
    article_links = tobhma_article_links()
    links_df = pd.DataFrame(article_links)
    links_df.to_csv('data/tobhma_links.csv', index=False)
    save_articles_in_parts(links_df, article_parser=tobhma_article_parser, media_name='tobhma')
            response = get(ethnos_news_by_page)
            if response.status_code == 503:
                # the page is broken; record it and move on
                print(f'Broken page: {page_id}')
                broken_pages += [page_id]
                continue
            news_soup = BeautifulSoup(response.text, 'html.parser')
            ethnos_links = [
                'https://www.ethnos.gr' + link['href']
                for link in news_soup.find_all('a', class_='full-link', href=True)[1:]
            ]
            if ethnos_links:
                article_links += ethnos_links
            else:
                break
        except Exception as e:
            print(e)
    return article_links, broken_pages


if __name__ == "__main__":
    article_links, _ = ethnos_article_links()
    links_df = pd.DataFrame(article_links)
    links_df.to_csv('data/ethnos_links.csv', index=False)
    save_articles_in_parts(links_df, article_parser=ethnos_article_parser, media_name='ethnos')
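# --- a hypothetical second pass over the broken (503) pages ---
# `broken_pages` is returned above but discarded in __main__. A retry pass such
# as the sketch below could recover them later; the helper name, URL builder,
# and wait time are all assumptions, not part of the original code.
import time
from requests import get

def retry_broken_pages(broken_pages, url_for_page, extract_links, wait=30):
    """Re-fetch pages that previously returned 503 and collect their links."""
    recovered = []
    for page_id in broken_pages:
        time.sleep(wait)  # give the server time to recover before retrying
        response = get(url_for_page(page_id))
        if response.status_code == 200:
            recovered += extract_links(response.text)
    return recovered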
        try:
            response = get(tanea_news_link)
            if response.status_code == 200:
                news_soup = BeautifulSoup(response.text, 'html.parser')
                article_links += [
                    link['href']
                    for link in news_soup.find_all('a', class_='article-title-18 dark-c firamedium nodecor', href=True)
                ]
                page_id += 1
            else:
                break
        except Exception as e:
            print(e)
    return article_links


if __name__ == "__main__":
    article_links = tanea_article_links()
    links_df = pd.DataFrame(article_links)
    links_df.to_csv('data/tanea_links.csv', index=False)
    save_articles_in_parts(links_df, article_parser=tanea_article_parser, media_name='tanea')
        try:
            efsyn_news_link = 'https://www.efsyn.gr/politiki?page=' + str(page_id)
            response = get(efsyn_news_link)
            if response.status_code == 200:
                news_soup = BeautifulSoup(response.text, 'html.parser')
                # keep the first 12 full-link anchors, presumably the article teasers on each page
                article_links += [
                    'https://www.efsyn.gr' + link['href']
                    for link in news_soup.find_all('a', class_='full-link', href=True)[:12]
                ]
            else:
                break
        except Exception as e:
            print(e)
    return article_links


if __name__ == "__main__":
    article_links = efsyn_article_links()
    links_df = pd.DataFrame(article_links)
    links_df.to_csv('data/efsyn_links.csv', index=False)
    save_articles_in_parts(links_df, article_parser=efsyn_article_parser, media_name='efsyn')
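# --- shared helpers assumed by all of the scripts above ---
# Every script uses `get` (presumably requests.get), BeautifulSoup, tqdm,
# pandas as pd, and a common `save_articles_in_parts`. The function below is a
# hypothetical sketch of that helper, inferred only from how it is called:
# parse each link with the medium-specific parser and write the articles out
# in CSV chunks.
import pandas as pd
from tqdm import tqdm

def save_articles_in_parts(links_df, article_parser, media_name, part_size=500):
    """Parse articles link by link and save them in numbered CSV parts."""
    articles, part = [], 0
    for link in tqdm(links_df[0]):
        try:
            articles.append(article_parser(link))
        except Exception as e:
            print(e)
        if len(articles) == part_size:
            pd.DataFrame(articles).to_csv(f'data/{media_name}_articles_part{part}.csv', index=False)
            articles, part = [], part + 1
    if articles:  # flush the final, possibly smaller, part
        pd.DataFrame(articles).to_csv(f'data/{media_name}_articles_part{part}.csv', index=False)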