import pandas as pd
from datetime import datetime


def get_data_news(exportCSV=True):
    config = get_config()
    all_news = []
    all_stats = []
    for newspaper in config["newspaperlist"]:
        newspaper_dict = config["newspaperlist"][newspaper]
        details_dict = newspaper_dict["XPATH_news_details"]
        links = get_links(newspaper_dict)
        news_urls = create_urlnews(newspaper_dict, links)
        arraynews = []
        arraynews_stats = []
        for news_url in news_urls:
            news_details_dict = get_details_dict(details_dict, news_url)
            if news_details_dict:
                arraynews.append(news_details_dict)
                arraynews_stats.append(get_news_stats(news_details_dict))
        dfnews = pd.DataFrame(arraynews)
        dfnews.dropna(inplace=True)
        dfstats = pd.DataFrame(arraynews_stats)
        dfstats.dropna(inplace=True)
        if exportCSV:
            # Take a single timestamp per newspaper so the News and Stats
            # files share a matching name.
            timestamp = datetime.now().strftime('%Y, %m, %d %H-%M-%S')
            dfnews.to_csv(
                f"News {newspaper_dict['name']} at {timestamp}.csv",
                encoding='utf-8-sig', index=False)
            dfstats.to_csv(
                f"Stats {newspaper_dict['name']} at {timestamp}.csv",
                encoding='utf-8-sig', index=False)
        # Accumulate across newspapers; previously only the last
        # newspaper's lists survived to the return statement.
        all_news.extend(arraynews)
        all_stats.extend(arraynews_stats)
    return all_news, all_stats
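# A minimal usage sketch, assuming get_config() reads a project config whose
# "newspaperlist" entries carry the keys used above ("name",
# "XPATH_news_details", ...). The row keys printed here ("title", "date")
# are hypothetical; the real keys depend on the XPaths in the config.
if __name__ == "__main__":
    news, stats = get_data_news(exportCSV=False)
    print(f"Scraped {len(news)} articles from all configured newspapers")
    for row in news[:3]:
        print(row.get("title"), row.get("date"))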
def test_eOnline_regex():
    browser = make_browser()
    stories = scraper.get_links(browser, sites['eOnline']['url'],
                                sites['eOnline']['link_regex'])
    assert len(stories) > 0


def test_nyTimes_regex():
    browser = make_browser()
    stories = scraper.get_links(browser, sites['nyTimes']['url'],
                                sites['nyTimes']['link_regex'])
    assert len(stories) > 0


def test_guardian_regex():
    browser = make_browser()
    stories = scraper.get_links(browser, sites['guardian']['url'],
                                sites['guardian']['link_regex'])
    assert len(stories) > 0


def test_nola_regex():
    browser = make_browser()
    stories = scraper.get_links(browser, sites['nola']['url'],
                                sites['nola']['link_regex'])
    assert len(stories) > 0
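# The four tests above differ only in the site key. A parametrized sketch,
# assuming pytest plus the same sites dict, scraper module, and make_browser
# helper as above: pytest generates one test per key, so new entries in
# sites get regex coverage without another copy-paste.
import pytest


@pytest.mark.parametrize("site", ["eOnline", "nyTimes", "guardian", "nola"])
def test_link_regex(site):
    browser = make_browser()
    stories = scraper.get_links(browser, sites[site]['url'],
                                sites[site]['link_regex'])
    assert len(stories) > 0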