def query(keywords, filepath, filename, max_retries=1,
          idx_unsuccessful=None, timeout=20):
    """Handle failed queries and raised exceptions

    Input
        keywords: list with keywords for which to retrieve data
        filepath: where csv should be stored
        filename: name of csv file
        max_retries: number of maximum retries
        idx_unsuccessful: unused
        timeout: target sleep duration, randomized within +/- 3 seconds

    Return
        None: Create csv with query output
              Create csv of keywords where max retries were reached
    """
    # retry until max_retries reached
    for attempt in range(max_retries):

        # random int from range around timeout
        timeout_randomized = randint(timeout - 3, timeout + 3)

        try:
            df_result = query_googletrends(keywords)

        # handle query error
        except Exception as e:
            # increase timeout
            timeout += 5
            print(">>> EXCEPTION for {}: {} \n Set timeout to {}\n".format(
                keywords, e, timeout))
            # sleep
            h.sleep_countdown(timeout_randomized, print_step=2)

        # query was successful: store results, sleep
        else:
            # generate timestamp for csv (currently unused)
            stamp = h.timestamp_now()
            # export query results
            h.make_csv(df_result, filename, filepath, append=True)
            # sleep
            h.sleep_countdown(timeout_randomized)
            break

    # max_retries reached: store keywords of unsuccessful query
    else:
        h.make_csv(pd.DataFrame(keywords), "unsuccessful_queries.csv",
                   filepath, append=True)
        print("{} appended to unsuccessful_queries\n".format(keywords))
def query(keywords, filepath, filename, max_retries=1, timeout=22):
    """Handle failed queries and raised exceptions

    Input
        keywords: list with keywords for which to retrieve data
        filepath: where csv should be stored
        filename: name of csv file
        max_retries: number of maximum retries
        timeout: target sleep duration, randomized within a range of +/- 3 seconds

    Return
        None: Create csv with query output
              Create csv of keywords where max retries were reached
    """
    # retry until max_retries reached
    for attempt in range(max_retries):

        # random int from range around timeout
        timeout_randomized = randint(timeout - 3, timeout + 3)

        try:
            df_result = query_googletrends(keywords)

        # handle query error
        except Exception as e:
            # increase timeout
            timeout += 8
            print("EXCEPTION for {}: {} \n Set timeout to {}\n".format(
                keywords, e, timeout))
            # sleep
            h.sleep_countdown(timeout_randomized, print_step=2)

        # query was successful: store results, sleep
        else:
            # generate timestamp for csv (currently unused)
            stamp = h.timestamp_now()
            # export query results
            h.make_csv(df_result, filename, filepath, append=True)
            # sleep
            h.sleep_countdown(timeout_randomized)
            break

    # max_retries reached: store keywords of unsuccessful query
    else:
        h.make_csv(pd.DataFrame(keywords), "unsuccessful_queries.csv",
                   filepath, append=True)
        print("{} appended to unsuccessful_queries\n".format(keywords))
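# The retry wrapper above delegates to query_googletrends, which is not shown
# in this snippet. A minimal sketch of what such a function might look like,
# assuming the unofficial pytrends wrapper; the hl/tz, timeframe, and geo
# settings below are illustrative assumptions, not the project's actual values.
from pytrends.request import TrendReq


def query_googletrends(keywords, timeframe='today 5-y', geo='US'):
    """Sketch: fetch interest over time for up to 5 keywords via pytrends."""
    pytrends = TrendReq(hl='en-US', tz=360)
    # Google Trends compares at most 5 search terms per request
    pytrends.build_payload(kw_list=keywords, timeframe=timeframe, geo=geo)
    # returns a DataFrame indexed by date with one column per keyword
    return pytrends.interest_over_time()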
def main():
    # timestamp for filenames
    stamp = h.timestamp_now()

    # filenames
    filename_gtrends = stamp + 'gtrends.csv'
    filename_meta = stamp + 'gtrends_metadata.csv'

    ##########################################
    # (SX): potential subset of keywords (= esg topic + firm name) with sample_size
    # for example: 3835 x 261 ~ 1 million rows
    # minimum: 5 keywords
    sample_size = 15  # 12945
    ##########################################

    # load keywords as input for API query
    # (SX): to subset the keywords, append .iloc[:sample_size, :];
    # currently the full keyword file is loaded and sample_size is unused
    df_query_input = pd.read_csv('../../data/interim/keywords.csv')

    # store metadata of query
    df_query_input['date_query_googletrends'] = datetime.today().strftime(
        '%Y-%m-%d')
    h.make_csv(df_query_input, filename_meta, '../../data/raw', header=True)

    print("Query for {} keywords".format(len(df_query_input)))

    # create batches of 5 keywords and feed to googletrends query
    for i in range(0, len(df_query_input) - 4, 5):
        print("{}:{}/{}".format(i, i + 5, len(df_query_input)))

        # create batch with 5 keywords
        kw_batch = [k for k in df_query_input.keyword[i:i + 5]]

        # feed batch to api query function and store in csv
        query(keywords=kw_batch,
              filepath='../../data/raw',
              filename=filename_gtrends,
              timeout=21)
def query_yahoofinance(tickers):
    """Get financial data from Yahoo! Finance

    :param tickers: list of firm tickers for which data should be retrieved
    :return: None (store returned data as CSV)
    """
    query = Ticker(tickers)
    print("Run query .all_financial_data()")

    try:
        # query all financial data
        df_fin_raw = query.all_financial_data()

        # store raw data
        stamp = h.timestamp_now()
        h.make_csv(df_fin_raw, stamp + 'yahoofinance.csv', '../../data/raw',
                   header=True, index=True)

    except Exception as e:
        print(e)
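# Usage sketch for the function above. The ticker list is illustrative;
# in the project the tickers would come from the S&P 500 firm list (firm_names.csv).
example_tickers = ['AAPL', 'MSFT', 'AMZN']
query_yahoofinance(example_tickers)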
import pandas as pd
import numpy as np
import helper_functions as h
from datetime import datetime

# load esg topics and firm names
df_topics = pd.read_csv('../../data/processed/esg_topics.csv')
df_sp500 = pd.read_csv('../../data/processed/firm_names.csv')

# expand firm names for each topic
df_sp500_expanded = df_sp500.iloc[np.repeat(np.arange(len(df_sp500)),
                                            len(df_topics))].reset_index(drop=True)

# expand topics for each firm
df_topics_expanded = df_topics.iloc[list(np.arange(len(df_topics))) *
                                    len(df_sp500)].reset_index(drop=True)

# generate keywords as a combination of topic + firm name
keywords = pd.DataFrame({'keyword': [i + ' ' + j
                                     for j in df_sp500.firm_name_processed
                                     for i in df_topics.topic]})

# merge topics, firm names, and search terms into 1 df
df_query_input = pd.concat([df_topics_expanded, df_sp500_expanded, keywords],
                           axis=1).reset_index(drop=True)
df_query_input['date_construct_keyword'] = datetime.today().strftime('%Y-%m-%d')

# store as csv
h.make_csv(df_query_input, 'keywords.csv', '../../data/interim', header=True)
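# Quick sanity check on the cross join (a sketch; the column names follow the
# files built in this project, the example keyword is illustrative):
# every firm should be paired with every topic exactly once
assert len(df_query_input) == len(df_topics) * len(df_sp500)
# each keyword reads "<topic> <firm name>", e.g. "scandal Apple"
print(df_query_input[['topic', 'firm_name_processed', 'keyword']].head())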
df_fin_recent = df_fin_raw[df_fin_raw.asOfDate.isin(date_most_recent)]

#
# Missings, clean data
#

# identify rows with many missings
rows_many_missings, _, _ = h.inspect_drop_rows_retain_columns(df_fin_recent,
                                                              max_missing=4)
print("Drop rows:", rows_many_missings[1])

# select rows that cause at least 1 missing for certain columns
drop_rows = rows_many_missings[1]

# export names to keep track of dropped firms
h.make_csv(drop_rows, "yahoofinance_firms_excluded.csv",
           data_dir='../../data/interim')

# drop rows with many missings
df_fin_recent_clean = df_fin_recent.drop(drop_rows)
print(
    f"Exclude {len(drop_rows)} // {len(df_fin_recent_clean)} firms still available\n"
)

# inspect missings again
df_colmiss = h.inspect_missings(df_fin_recent_clean)

# drop columns with missings
df_fin_nomiss = df_fin_recent_clean.drop(df_colmiss.index, axis=1)

# crosscheck
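# h.inspect_missings and h.inspect_drop_rows_retain_columns are project helpers
# not shown here. As an assumption about the column-wise inspection, a
# pandas-only sketch of what inspect_missings might do (the real helper may differ):
def inspect_missings_sketch(df):
    """Sketch: per-column missing counts, restricted to columns with missings."""
    miss = df.isna().sum()
    df_colmiss = miss[miss > 0].to_frame(name='n_missing')
    df_colmiss['share_missing'] = df_colmiss['n_missing'] / len(df)
    print(df_colmiss.sort_values('n_missing', ascending=False))
    # the index holds the column names with missings, as used by .drop(..., axis=1) above
    return df_colmiss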
# define filenames
filename_gtrends = stamp + 'gtrends.csv'
filename_meta = stamp + 'gtrends_metadata.csv'

# set sample size (number of keywords)
sample_size = 3835  # minimum: 5
##########################################

# load keywords as input for API query
df_query_input = pd.read_csv(
    '../../data/interim/keywords.csv').iloc[:sample_size, :]

# store metadata of query
df_query_input['date_query_googletrends'] = datetime.today().strftime(
    '%Y-%m-%d')
h.make_csv(df_query_input, filename_meta, '../../data/raw', header=True)

print("Query for {} keywords".format(len(df_query_input)))

# create batches of 5 keywords and feed to googletrends query
for i in range(0, len(df_query_input) - 4, 5)[:sample_size]:
    print("{}:{}/{}".format(i, i + 5, sample_size))

    # create batches with 5 keywords
    kw_batch = [k for k in df_query_input.keyword[i:i + 5]]

    # feed batch to api query function and store in csv
    query(keywords=kw_batch,
          filepath='../../data/raw',
          filename=filename_gtrends)
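# Note: range(0, len(df_query_input) - 4, 5) yields only full batches of five,
# so a leftover batch of fewer than five keywords is silently skipped.
# A standalone illustration with a made-up keyword count:
n_keywords = 13
batch_starts = list(range(0, n_keywords - 4, 5))
print(batch_starts)                               # [0, 5]
print(n_keywords % 5, "keywords left unqueried")  # 3 keywords left unqueried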
topics_negative = [
    'scandal', 'greenwashing', 'corruption', 'fraud', 'bribe', 'tax', 'forced',
    'harassment', 'violation', 'illegal', 'conflict', 'weapons', 'pollution',
    'inequality', 'discrimination', 'sexism', 'racist', 'intransparent',
    'nontransparent', 'breach', 'lawsuit', 'unfair', 'bad', 'problem', 'hate',
    'issue', 'controversial', 'strike', 'scam', 'trouble', 'controversy',
    'mismanagement', 'crisis', 'turmoil', 'shock', 'whistleblow', 'dispute'
]

topics_positive = [
    'green', 'sustainable', 'positive', 'best', 'good', 'social', 'charity',
    'ethical', 'renewable', 'carbon neutral', 'equitable', 'ecological',
    'efficient', 'improve', 'cooperative', 'beneficial', 'collaborative',
    'productive', 'leader', 'donate', 'optimal', 'favorable', 'desirable',
    'resilient', 'robust', 'reasonable', 'strong', 'organic'
]

print("Defined {} negative and {} positive topics".format(
    len(topics_negative), len(topics_positive)))

# create df with topics and label
df_topics_neg = pd.DataFrame({'topic': topics_negative, 'positive': 0})
df_topics_pos = pd.DataFrame({'topic': topics_positive, 'positive': 1})
df_topics = pd.concat([df_topics_neg, df_topics_pos]).reset_index(drop=True)

# add definition date
df_topics['date_define_topic'] = datetime.today().strftime('%Y-%m-%d')

# export csv
h.make_csv(df_topics, 'esg_topics.csv', '../../data/interim', header=True)
""" pattern = r"(\s|\.|\,|\&)*(\.com|Enterprise|Worldwide|Int\'l|N\.V\.|LLC|Co\b|Inc\b|Corp\w*|Group\sInc|Group|Company|Holdings\sInc|\WCo(\s|\.)|plc|Ltd|Int'l\.|Holdings|\(?Class\s\w+\)?)\.?\W?" stripped_names = [re.sub(pattern, '', n) for n in raw_names] return stripped_names # get firm S&P500 from Wikipedia keep_columns = ['Symbol', 'Security', 'GICS Sector'] df_sp500_wiki = get_firms_sp500().loc[:, keep_columns] # rename column, set ticker as index df_sp500_wiki = df_sp500_wiki.rename(columns={ 'Symbol': 'ticker', 'Security': 'firm_name_raw', 'GICS Sector': 'sector' }) # process firm names (remove legal entity, suffix) df_sp500_wiki['firm_name_processed'] = regex_strip_legalname( list(df_sp500_wiki.firm_name_raw)) # add retrieval date df_sp500_wiki['date_get_firmname'] = datetime.today().strftime('%Y-%m-%d') # drop duplicate firm names df_sp500_wiki.drop_duplicates(subset='firm_name_processed', inplace=True) h.make_csv(df_sp500_wiki, 'firm_names.csv', '../../data/interim', header=True)