Example #1
def query(keywords,
          filepath,
          filename,
          max_retries=1,
          idx_unsuccessful=None,
          timeout=20):
    """Query Google Trends, retrying failed queries and handling raised exceptions

    Input
        keywords: list of keywords for which to retrieve data
        filepath: where the csv should be stored
        filename: name of the csv file
        max_retries: maximum number of retries
        idx_unsuccessful: unused
        timeout: target sleep duration, randomized within +/- 3 seconds

    Return
        None:   Append query output to csv
                Append keywords to unsuccessful_queries.csv when
                max_retries is reached
    """
    # retry until max_retries reached
    for attempt in range(max_retries):

        # random int from range around timeout
        timeout_randomized = randint(timeout - 3, timeout + 3)

        try:
            df_result = query_googletrends(keywords)

        # handle query error
        except Exception as e:

            # increase timeout
            timeout += 5

            print(">>> EXCEPTION at {}: {} \n Set timeout to {}\n".format(
                i, e, timeout))
            # sleep
            h.sleep_countdown(timeout_randomized, print_step=2)

        # query was successful: store results, sleep
        else:

            # generate timestamp (currently unused)
            stamp = h.timestamp_now()

            # append query results to csv
            h.make_csv(df_result, filename, filepath, append=True)

            # sleep
            h.sleep_countdown(timeout_randomized)
            break

    # max_retries reached: store keywords of unsuccessful query
    else:
        h.make_csv(pd.DataFrame(keywords),
                   "unsuccessful_queries.csv",
                   filepath,
                   append=True)
        print("{} appended to unsuccessful_queries\n".format(keywords))
Example #2
def query(keywords, filepath, filename, max_retries=1, timeout=22):
    """Query Google Trends, retrying failed queries and handling raised exceptions

    Input
        keywords: list of keywords for which to retrieve data
        filepath: where the csv should be stored
        filename: name of the csv file
        max_retries: maximum number of retries
        timeout: target sleep duration, randomized within +/- 3 seconds

    Return
        None:   Append query output to csv
                Append keywords to unsuccessful_queries.csv when
                max_retries is reached
    """
    # retry until max_retries reached
    for attempt in range(max_retries):

        # random int from range around timeout
        timeout_randomized = randint(timeout - 3, timeout + 3)

        try:
            df_result = query_googletrends(keywords)

        # handle query error
        except Exception as e:

            # increase timeout
            timeout += 8

            print("EXCEPTION for {}: {} \n Set timeout to {}\n".format(
                keywords, e, timeout))
            # sleep
            h.sleep_countdown(timeout_randomized, print_step=2)

        # query was successful: store results, sleep
        else:

            # generate timestamp (currently unused)
            stamp = h.timestamp_now()

            # append query results to csv
            h.make_csv(df_result, filename, filepath, append=True)

            # sleep
            h.sleep_countdown(timeout_randomized)
            break

    # max_retries reached: store keywords of unsuccessful query
    else:
        h.make_csv(pd.DataFrame(keywords),
                   "unsuccessful_queries.csv",
                   filepath,
                   append=True)
        print("{} appended to unsuccessful_queries\n".format(keywords))
Example #3
def main():

    # timestamp for filenames
    stamp = h.timestamp_now()

    # filenames
    filename_gtrends = stamp + 'gtrends.csv'
    filename_meta = stamp + 'gtrends_metadata.csv'

    ##########################################
    # optional: subset keywords (= esg topic + firm name) with sample_size
    # for example: 3835 x 261 ~ 1 million rows
    # minimum: 5 keywords
    sample_size = 15
    ##########################################

    #
    # load keywords as input for API query
    #

    # optionally subset keywords by appending .iloc[:sample_size, :]
    df_query_input = pd.read_csv('../../data/interim/keywords.csv')

    # store metadata of query
    df_query_input['date_query_googletrends'] = datetime.today().strftime(
        '%Y-%m-%d')
    h.make_csv(df_query_input, filename_meta, '../../data/raw', header=True)

    print("Query for {} keywords".format(len(df_query_input)))

    # create batches of 5 keywords and feed to googletrends query
    for i in range(0, len(df_query_input) - 4, 5):

        print("{}:{}/{}".format(i, i + 5, len(df_query_input)))

        # create batches with 5 keywords
        kw_batch = [k for k in df_query_input.keyword[i:i + 5]]

        # feed batch to api query function and store in csv
        query(keywords=kw_batch,
              filepath='../../data/raw',
              filename=filename_gtrends,
              timeout=21)
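main() is presumably executed behind the usual entry-point guard, which this excerpt omits:

if __name__ == '__main__':
    main()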
Example #4
def query_yahoofinance(input):
	"""Get financial data from Yahoo! Finance

	:param input: list of firm tickers for which data should be retrieved
	:return : None (store returned data as CSV)

	"""
	query = Ticker(input)

	print("Run query .all_financial_data()")

	try:
		# query all financial data
		df_fin_raw = query.all_financial_data()
		# store raw data
		stamp = h.timestamp_now()
		h.make_csv(df_fin_raw, stamp+'yahoofinance.csv', '../../data/raw', header=True, index=True)
	except Exception as e:
		print(e)
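Ticker is presumably imported from the yahooquery package, whose Ticker objects provide all_financial_data(). A hedged usage sketch with a few illustrative tickers:

from yahooquery import Ticker  # assumed source of Ticker

# illustrative ticker list; any valid symbols would do
query_yahoofinance(['AAPL', 'MSFT', 'NKE'])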
Example #5
import pandas as pd
import numpy as np
import helper_functions as h 
from datetime import datetime


# load esg topics and firm names
df_topics = pd.read_csv('../../data/processed/esg_topics.csv')
df_sp500 = pd.read_csv('../../data/processed/firm_names.csv')

# expand firm names for each topic
df_sp500_expanded = df_sp500.iloc[np.repeat(np.arange(len(df_sp500)), len(df_topics))].reset_index(drop=True)

# expand topics for each firm
df_topics_expanded = df_topics.iloc[list(np.arange(len(df_topics)))*len(df_sp500)].reset_index(drop=True)

# generate keywords by combining each topic with each firm name
keywords = pd.DataFrame({'keyword':[i+' '+j for j in df_sp500.firm_name_processed for i in df_topics.topic]})

# merge topics, firm names, and search terms into 1 df
df_query_input = pd.concat([df_topics_expanded, df_sp500_expanded, keywords], axis=1).reset_index(drop=True)
df_query_input['date_construct_keyword'] = datetime.today().strftime('%Y-%m-%d')

# store as csv
h.make_csv(df_query_input, 'keywords.csv', '../../data/interim', header=True)
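To illustrate the repeat/tile expansion, a toy input with two hypothetical firms and two topics yields four keywords in the order topic + firm name:

# toy example (hypothetical data): 2 firms x 2 topics -> 4 keywords
toy_firms = ['Apple', 'Nike']
toy_topics = ['scandal', 'green']
print([t + ' ' + f for f in toy_firms for t in toy_topics])
# ['scandal Apple', 'green Apple', 'scandal Nike', 'green Nike']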


Example #6
# keep rows whose asOfDate is among the most recent reporting dates
df_fin_recent = df_fin_raw[df_fin_raw.asOfDate.isin(date_most_recent)]

#
# Missings, clean data
#

# identify rows with many missings
rows_many_missings, _, _ = h.inspect_drop_rows_retain_columns(df_fin_recent,
                                                              max_missing=4)
print("Drop rows:", rows_many_missings[1])

# select rows that cause at least 1 missing for certain columns
drop_rows = rows_many_missings[1]
# export names to keep track of dropped firms
h.make_csv(drop_rows,
           "yahoofinance_firms_excluded.csv",
           data_dir='../../data/interim')

# drop rows with many missings
df_fin_recent_clean = df_fin_recent.drop(drop_rows)
print(
    f"Exclude {len(drop_rows)} // {len(df_fin_recent_clean)} firms still available\n"
)

# inspect missings again
df_colmiss = h.inspect_missings(df_fin_recent_clean)

# drop columns with missings
df_fin_nomiss = df_fin_recent_clean.drop(df_colmiss.index, axis=1)

# crosscheck
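inspect_missings is another project helper that is not shown. Assuming it returns a per-column summary whose index holds the names of columns with missing values (which is how df_colmiss.index is used above), a minimal sketch:

def inspect_missings(df):
    # assumed behavior: count missing values per column and keep only
    # columns with at least one missing value
    n_missing = df.isna().sum()
    df_colmiss = n_missing[n_missing > 0].to_frame(name='n_missing')
    print(df_colmiss)
    return df_colmiss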
Example #7
# define filenames
filename_gtrends = stamp + 'gtrends.csv'
filename_meta = stamp + 'gtrends_metadata.csv'

# set sample size (number of keywords)
sample_size = 3835  # minimum: 5
##########################################

# load keywords as input for API query
df_query_input = pd.read_csv(
    '../../data/interim/keywords.csv').iloc[:sample_size, :]

# store metadata of query
df_query_input['date_query_googletrends'] = datetime.today().strftime(
    '%Y-%m-%d')
h.make_csv(df_query_input, filename_meta, '../../data/raw', header=True)

print("Query for {} keywords".format(len(df_query_input)))

# create batches of 5 keywords and feed to googletrends query
for i in range(0, len(df_query_input) - 4, 5)[:sample_size]:

    print("{}:{}/{}".format(i, i + 5, sample_size))

    # create batches with 5 keywords
    kw_batch = [k for k in df_query_input.keyword[i:i + 5]]

    # feed batch to api query function and store in csv
    query(keywords=kw_batch,
          filepath='../../data/raw',
          filename=filename_gtrends)
Example #8
topics_negative = [
    'scandal', 'greenwashing', 'corruption', 'fraud', 'bribe', 'tax', 'forced',
    'harassment', 'violation', 'illegal', 'conflict', 'weapons', 'pollution',
    'inequality', 'discrimination', 'sexism', 'racist', 'intransparent',
    'nontransparent', 'breach', 'lawsuit', 'unfair', 'bad', 'problem', 'hate',
    'issue', 'controversial', 'strike', 'scam', 'trouble', 'controversy',
    'mismanagement', 'crisis', 'turmoil', 'shock', 'whistleblow', 'dispute'
]

topics_positive = [
    'green', 'sustainable', 'positive', 'best', 'good', 'social', 'charity',
    'ethical', 'renewable', 'carbon neutral', 'equitable', 'ecological',
    'efficient', 'improve', 'cooperative', 'beneficial', 'collaborative',
    'productive', 'leader', 'donate', 'optimal', 'favorable', 'desirable',
    'resilient', 'robust', 'reasonable', 'strong', 'organic'
]

print("Defined {} negative and {} positive topics".format(
    len(topics_negative), len(topics_positive)))

# create df with topics and label
df_topics_neg = pd.DataFrame({'topic': topics_negative, 'positive': 0})
df_topics_pos = pd.DataFrame({'topic': topics_positive, 'positive': 1})
df_topics = pd.concat([df_topics_neg, df_topics_pos]).reset_index(drop=True)

# add definition date
df_topics['date_define_topic'] = datetime.today().strftime('%Y-%m-%d')

# export csv
h.make_csv(df_topics, 'esg_topics.csv', '../../data/interim', header=True)
Example #9
    """

    pattern = r"(\s|\.|\,|\&)*(\.com|Enterprise|Worldwide|Int\'l|N\.V\.|LLC|Co\b|Inc\b|Corp\w*|Group\sInc|Group|Company|Holdings\sInc|\WCo(\s|\.)|plc|Ltd|Int'l\.|Holdings|\(?Class\s\w+\)?)\.?\W?"
    stripped_names = [re.sub(pattern, '', n) for n in raw_names]

    return stripped_names


# get S&P 500 firms from Wikipedia
keep_columns = ['Symbol', 'Security', 'GICS Sector']
df_sp500_wiki = get_firms_sp500().loc[:, keep_columns]

# rename columns
df_sp500_wiki = df_sp500_wiki.rename(columns={
    'Symbol': 'ticker',
    'Security': 'firm_name_raw',
    'GICS Sector': 'sector'
})

# process firm names (remove legal entity, suffix)
df_sp500_wiki['firm_name_processed'] = regex_strip_legalname(
    list(df_sp500_wiki.firm_name_raw))

# add retrieval date
df_sp500_wiki['date_get_firmname'] = datetime.today().strftime('%Y-%m-%d')

# drop duplicate firm names
df_sp500_wiki.drop_duplicates(subset='firm_name_processed', inplace=True)

h.make_csv(df_sp500_wiki, 'firm_names.csv', '../../data/interim', header=True)
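A quick sanity check of the pattern on a few illustrative raw names:

print(regex_strip_legalname(['Apple Inc.', 'Microsoft Corporation', 'Nike, Inc.']))
# expected output: ['Apple', 'Microsoft', 'Nike']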