def ccm_block(upload_type, conn):
    if upload_type == 'from_wrds':
        query = """
                select gvkey, lpermno as permno, linktype, linkprim,
                       linkdt, linkenddt
                from crsp.ccmxpf_linktable
                where substr(linktype,1,1)='L'
                and (linkprim ='C' or linkprim='P')
                """
        try:
            ccm = conn.raw_sql(query)
        except Exception:  # e.g. sqlalchemy OperationalError on a stale connection
            conn = wrds.Connection()
            ccm = conn.raw_sql(query)
    elif upload_type == 'from_file':
        # ccm = pd.read_csv('.data/ccm.csv.gz', compression='gzip')
        ccm = pd.read_pickle('data/ccm.pkl')
        if 'Unnamed: 0' in ccm.columns:
            ccm = ccm.drop('Unnamed: 0', axis=1)

    ccm['linkdt'] = pd.to_datetime(ccm['linkdt'])
    ccm['linkenddt'] = pd.to_datetime(ccm['linkenddt'])
    # if linkenddt is missing, the link is still active: set it to today's date
    ccm['linkenddt'] = ccm['linkenddt'].fillna(pd.to_datetime('today'))
    return ccm
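# Usage sketch (hypothetical helper, not part of the original module; assumes
# WRDS credentials are already configured via ~/.pgpass or the login prompt):
def example_load_ccm():
    conn = wrds.Connection()
    ccm = ccm_block('from_wrds', conn)
    # each row is a CRSP-Compustat link valid over [linkdt, linkenddt]
    print(ccm[['gvkey', 'permno', 'linkdt', 'linkenddt']].head())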
def dump_boardex_data():
    db = wrds.Connection()
    company_networks = db.raw_sql("select * from jr_wrds_company_networks")
    profile = db.raw_sql("select * from jr_dir_profile_details")
    org_summary = db.raw_sql("select * from jr_wrds_org_summary")
    individual_networks = db.raw_sql(
        "select * from jr_wrds_individual_networks")

    # keep only the directors and boards that appear in the summary table
    filter_dids = set(org_summary['directorid'].values)
    filter_bids = set(org_summary['boardid'].values)
    individual_networks = individual_networks[
        individual_networks['companyid'].isin(list(filter_bids))]
    individual_networks = individual_networks[
        individual_networks['dirbrdid'].isin(list(filter_dids))]
    individual_networks = individual_networks.replace('Curr', 2015)

    org_summary.to_excel('data/wrds/org_summary.xlsx', index=False)
    summary = pd.read_excel('data/wrds/org_summary.xlsx')

    # build {year: {boardid: {directorid: gender}}}
    total_summary_info = {}
    for year in range(2005, 2015):
        summary_info = collections.defaultdict(dict)
        sum_year = summary[summary['annualreportdate'].dt.year == year]
        for bid, groups in sum_year.groupby('boardid'):
            for did, gender in zip(groups['directorid'], groups['gender']):
                summary_info[bid][did] = gender
        total_summary_info[year] = summary_info

    with open('data/wrds/summary.pkl', 'wb') as file:
        pickle.dump(total_summary_info, file)
def connect(self):
    """Connect to Postgres via WRDS."""
    self.wrdsconn = wrds.Connection()
    self.conn = self.wrdsconn.connect()
    return self.wrdsconn
def __init__(self, wrds_username='******', freq='M', output_dir=None,
             fields_ratios=None, fields_factors=None, all_chars=None):
    self.freq = freq
    self.db = wrds.Connection(wrds_username=wrds_username)
    self.fields_ratios = fields_ratios or FIELDS_RATIOS
    self.fields_factors = fields_factors or FIELDS_FACTORS
    self.fields_price = FIELDS_PRICE
    self.all_chars = all_chars or ALL_CHARS

    dates_d = pd.read_csv(DATES_D_PATH)
    dates_m = pd.read_csv(DATES_M_PATH)
    dates_f = pd.read_csv(DATES_F_PATH)
    self.dates_d = pd.DatetimeIndex(dates_d['date'])
    self.dates_m = pd.DatetimeIndex(dates_m['date'])
    self.dates_f = pd.DatetimeIndex(dates_f['date'])

    macro = pd.read_csv(MACRO_PATH)
    macro = macro.set_index('date')
    macro.index = pd.to_datetime(macro.index)
    self.macro = macro

    self.output_dir = output_dir or OUTPUT_DATA_DIR
    if not os.path.isdir(self.output_dir):
        os.makedirs(self.output_dir)
def determine_available_wrds_data(db=None):
    print('Determining available wrds data')
    if db is None:
        db = wrds.Connection()

    col_schema = []
    col_table = []
    for library in db.schema_perm:
        for table in db.list_tables(library=library):
            col_schema.append(library)
            col_table.append(table)
    df = pd.DataFrame({'schema': col_schema, 'table': col_table})

    col_permno = []
    col_permitted = []
    for index, row in df.iterrows():
        print("{} - {}.{}".format(index, row.schema, row.table))
        query = "select * from {}.{} limit 0".format(row.schema, row.table)
        try:
            cols = db.raw_sql(query).columns
            col_permitted.append(True)
            has_permno = cols.str.contains('permno').any()
            col_permno.append(has_permno)
        except Exception:  # no read permission for this table
            col_permitted.append(False)
            col_permno.append(False)
    df['permitted'] = col_permitted
    df['has_permno'] = col_permno

    writer = pd.ExcelWriter('./wrds-libraries.xlsx')
    df.to_excel(writer, sheet_name='Sheet1')
    writer.save()
def connection(self):
    """wrds.Connection: Connection to WRDS database with credentials."""
    if not self._connection:
        self._connection = wrds.Connection()
    return self._connection
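# Usage sketch (hypothetical; assumes the accessor above is exposed as a
# property on its owner class, as its docstring convention suggests): the WRDS
# session is opened on first access and reused afterwards.
def example_reuse_connection(client):
    libraries = client.connection.list_libraries()               # first access connects
    crsp_tables = client.connection.list_tables(library='crsp')  # reuses the session
    return libraries, crsp_tables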
def add_gvkey(df):
    if isinstance(df, list):
        df = pd.DataFrame({'ciks': df})
    if (not isinstance(df, pd.DataFrame)) or ('ciks' not in df.columns):
        print("Argument must be a DataFrame (containing 'ciks') or a list of CIKs")
        return

    db = wrds.Connection(wrds_username='******')
    table = db.get_table(library='crsp_a_ccm', table='ccm_lookup')

    gvkeys = []
    for cik in df['ciks']:
        masked_table = table[table['cik'] == cik]
        gvkey = masked_table.iloc[0].loc['gvkey']  # raises IndexError if the CIK is unmatched
        gvkeys.append(gvkey)

    gvkeys_df = pd.DataFrame({'gvkeys': gvkeys})
    augmented_df = pd.merge(gvkeys_df, df, how='outer',
                            left_index=True, right_index=True)
    return augmented_df
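# Usage sketch (hypothetical, with placeholder CIK values; whether they match
# depends on how CIKs are formatted in ccm_lookup):
def example_add_gvkey():
    ciks = ['0000320193', '0000789019']  # placeholder CIK strings
    mapped = add_gvkey(ciks)
    print(mapped)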
def setUp(self):
    self.t = wrds.Connection(autoconnect=False)
    self.t._Connection__get_user_credentials = mock.Mock()
    self.t._Connection__get_user_credentials.return_value = (
        'faketestusername', 'faketestpassword')
    self.t._Connection__create_pgpass_file_win32 = mock.Mock()
    self.t._Connection__create_pgpass_file_unix = mock.Mock()
def __init__(self, wrds_username: str,
             selection_start_date: date = None,
             selection_end_date: date = None,
             observation_start_date: date = None,
             observation_end_date: date = None):
    self.username = wrds_username
    self.selection_start_date = selection_start_date
    self.selection_end_date = selection_end_date
    self.observation_start_date = observation_start_date
    self.observation_end_date = observation_end_date
    self.dataset = None
    self._names_table = None
    self._company_table = None
    self._executive_table = None
    self._annuals_table = None

    # To build a connection to the WRDS server via Python, a .pgpass file is
    # required in the user's home directory, with access limited to the user.
    # To create this file, follow the instructions here:
    # https://wrds-www.wharton.upenn.edu/pages/support/programming-wrds/programming-python/python-from-your-computer
    # After creating the file, don't forget to run "chmod 0600 ~/.pgpass" in
    # the console to limit access. The access issue is also described here:
    # https://www.postgresql.org/docs/9.5/libpq-pgpass.html
    self.db = _wrds.Connection(wrds_username=self.username)
def GetStocksPriceRecommendations(params):
    if params.type == type_price_target:
        entete = ['ticker', 'cusip', 'estimid', 'horizon', 'value',
                  'estcur', 'anndats', 'amaskcd']
        sqlstmt = ('select pt.*, B.exrat FROM (select ' + ','.join(entete) +
                   ' FROM {schema}.{table}'.format(schema='ibes', table='ptgdet') +
                   ') As pt '
                   'LEFT JOIN ibes.hdxrati B '
                   'ON (pt.anndats = B.anndats AND pt.estcur = B.curr) ')
    elif params.type == type_consensus:
        entete = ['ticker', 'cusip', 'emaskcd', 'ireccd', 'anndats', 'amaskcd']
        sqlstmt = 'select ' + ','.join(entete) + ' From {schema}.{table} '.format(
            schema='ibes', table='recddet')
    else:
        raise ValueError(
            "params.type must be {} or {}".format(type_price_target, type_consensus))

    db = None
    try:
        db = wrds.Connection()
        res = db.raw_sql(sqlstmt)
        np.save(params.type + '_data', res)
    except exc.SQLAlchemyError as e:
        print(e)
        return "Error Loading File"
    finally:
        if db is not None:
            db.close()
def crsp_block(upload_type, conn):
    if upload_type == 'from_wrds':
        # Market data
        query_crsp = """
                     select a.permno, a.permco, a.date, b.shrcd, b.exchcd,
                            a.ret, a.retx, a.shrout, a.prc
                     from crsp.msf as a
                     left join crsp.msenames as b
                     on a.permno=b.permno
                     and b.namedt<=a.date
                     and a.date<=b.nameendt
                     where a.date between '01/01/1959' and '12/31/2017'
                     and b.exchcd between 1 and 3
                     """
        query_dlret = """
                      select permno, dlret, dlstdt
                      from crsp.msedelist
                      """
        try:
            crsp_m = conn.raw_sql(query_crsp)
            dlret = conn.raw_sql(query_dlret)
        except Exception:  # e.g. sqlalchemy OperationalError on a stale connection
            conn = wrds.Connection()
            crsp_m = conn.raw_sql(query_crsp)
            dlret = conn.raw_sql(query_dlret)
    elif upload_type == 'from_file':
        # crsp_m = pd.read_csv('.data/crsp_m.csv.gz', compression='gzip')
        crsp_m = pd.read_pickle('data/crsp_m_modified.pkl')
        if 'Unnamed: 0' in crsp_m.columns:
            crsp_m = crsp_m.drop('Unnamed: 0', axis=1)
        # dlret = pd.read_csv('.data/dlret.csv.gz', compression='gzip')
        dlret = pd.read_pickle('data/dlret.pkl')
        if 'Unnamed: 0' in dlret.columns:
            dlret = dlret.drop('Unnamed: 0', axis=1)

    # change variable format to int
    crsp_m[['permco', 'permno', 'shrcd', 'exchcd']] = \
        crsp_m[['permco', 'permno', 'shrcd', 'exchcd']].astype(int)

    # line up date to be end of month
    crsp_m['date'] = pd.to_datetime(crsp_m['date'])
    crsp_m['jdate'] = crsp_m['date'] + MonthEnd(0)

    # add delisting return
    dlret.permno = dlret.permno.astype(int)
    dlret['dlstdt'] = pd.to_datetime(dlret['dlstdt'])
    dlret['jdate'] = dlret['dlstdt'] + MonthEnd(0)

    crsp = pd.merge(crsp_m, dlret, how='left', on=['permno', 'jdate'])
    crsp['dlret'] = crsp['dlret'].fillna(0)
    crsp['ret'] = crsp['ret'].fillna(0)
    crsp['retadj'] = (1 + crsp['ret']) * (1 + crsp['dlret']) - 1
    crsp['me'] = crsp['prc'].abs() * crsp['shrout']  # calculate market equity
    crsp = crsp.drop(['dlret', 'dlstdt', 'shrout'], axis=1)
    crsp = crsp.sort_values(by=['jdate', 'permco', 'me'])
    return crsp
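# Sketch of the downstream link step (an assumption about how the crsp_block
# and ccm_block outputs are combined; the original merge code is not shown):
def example_link_crsp_ccm(crsp, ccm):
    merged = pd.merge(crsp, ccm, how='inner', on='permno')
    # keep observations whose trade month falls inside the link validity window
    in_window = (merged['jdate'] >= merged['linkdt']) & \
                (merged['jdate'] <= merged['linkenddt'])
    return merged[in_window]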
def test_init_calls_sqlalchemy_create_engine_defaults(self, mock_sa):
    t = wrds.Connection()
    connstring = 'postgresql://{host}:{port}/{dbname}'
    connstring = connstring.format(host=wrds.sql.WRDS_POSTGRES_HOST,
                                   port=wrds.sql.WRDS_POSTGRES_PORT,
                                   dbname=wrds.sql.WRDS_POSTGRES_DB)
    mock_sa.create_engine.assert_called_with(
        connstring, connect_args={'sslmode': 'require'})
def database_wrds(self, main, wrds_file):
    with open(wrds_file, 'r') as file:
        lines = file.read().splitlines(True)
    self.library = lines[0].split("=")[1].strip()
    self.table = lines[1].split("=")[1].strip()
    self.main = main
    self.database = wrds.Connection(wrds_username="******")
    self.title = wrds_file
def setUp(self):
    self.t = wrds.Connection(autoconnect=False)
    self.t._hostname = 'wrds.test.private'
    self.t._port = 12345
    self.t._username = '******'
    self.t._password = '******'
    self.t._dbname = 'testdbname'
    self.t._Connection__get_user_credentials = mock.Mock()
    self.t._Connection__get_user_credentials.return_value = (
        self.t._username, self.t._password)
def test_init_calls_sqlalchemy_create_engine_custom(self, mock_sa):
    username = '******'
    connstring = 'postgresql://{usr}@{host}:{port}/{dbname}'
    connstring = connstring.format(usr=username,
                                   host=wrds.sql.WRDS_POSTGRES_HOST,
                                   port=wrds.sql.WRDS_POSTGRES_PORT,
                                   dbname=wrds.sql.WRDS_POSTGRES_DB)
    t = wrds.Connection(wrds_username=username)
    mock_sa.create_engine.assert_called_with(
        connstring, connect_args={'sslmode': 'require'})
def main():
    import sys
    if len(sys.argv) != 2:
        print("Specify function")
        return
    function = sys.argv[1]
    all_globals = globals()
    if function in all_globals and callable(all_globals[function]):
        db = wrds.Connection()
        all_globals[function](db)
def GetStocksPriceData(params):
    if params.globalWRDS:
        entete = ['gvkey', 'datadate', 'ajexdi', 'cshoc', 'cshtrd', 'prccd',
                  'prchd', 'prcld', 'curcdd', 'isin', 'iid']
    else:
        entete = ['gvkey', 'datadate', 'ajexdi', 'cshoc', 'cshtrd', 'prccd',
                  'prchd', 'prcld', 'curcdd', 'cusip', 'iid']
    db = None
    try:
        order = entete[-2]  # isin (global) or cusip (north america)
        sqlstmt = (
            'select pt.*, B.exrat '
            'FROM (select {cols} FROM {schema}.{table} '
            'ORDER BY {order} LIMIT {limit} OFFSET {offset}) As pt '
            'LEFT JOIN ibes.hdxrati B '
            'ON (pt.datadate = B.anndats AND pt.curcdd = B.curr) '
        ).format(cols=','.join(entete), schema=params.library,
                 table=params.table, order=order,
                 limit=params.observation, offset=params.offset)

        db = wrds.Connection()
        res = db.raw_sql(sqlstmt)
        res = res[res['curcdd'].notnull()]
        res['global'] = params.globalWRDS

        v = np.vectorize(applyChecking)
        res['data'] = v(res[entete[9]], res[entete[0]], res[entete[1]],
                        res[entete[8]], res[entete[3]], res[entete[4]],
                        res[entete[2]], res[entete[5]], res[entete[6]],
                        res[entete[7]], res[entete[10]], res['exrat'],
                        res['global'])
        res = res[['datadate', 'data']]
        v = np.vectorize(convertDateToString)
        res['date'] = v(res['datadate'])
        tab = res.groupby('date').apply(lambda x: applyGroupBy(x))

        ClientDB = motor.motor_tornado.MotorClient(ProdConnectionString)
        tornado.ioloop.IOLoop.current().run_sync(
            StocksMarketDataPrice(ClientDB, "ALL", tab).SetManyStocksPriceInDB)
        ClientDB.close()

        msg = 'lot : [{}, {}] Completed'.format(
            params.offset, params.observation + params.offset)
        print(msg)
        return msg
    except exc.SQLAlchemyError:
        msg = 'lot : [{}, {}] Not Completed'.format(
            params.offset, params.observation + params.offset)
        print(msg)
        return msg
    finally:
        if db is not None:
            db.close()
def get_gvkey():
    aws_param = my.get_credentials(credential='aws')
    with my.postgresql_connect(aws_param) as conn:
        df_companies = my.sql_query(sql="SELECT * FROM reuters.company_list",
                                    conn=conn)
    company_list = [i.split('.')[0]
                    for i in df_companies['constituent_ric'].tolist()]

    wrds_username = my.get_credentials(credential='wrds_credentials')['username']
    wrds_password = my.get_credentials(credential='wrds_credentials')['password']
    wrds_conn = wrds.Connection(wrds_username=wrds_username,
                                wrds_password=wrds_password)

    gvkey_mapping = wrds_conn.raw_sql(
        my.create_wrds_sql_query(table='security',
                                 columns=['tic', 'gvkey'],
                                 distinct=True,
                                 conditions={'tic': company_list},
                                 no_observations=-1))
    gvkey_list = gvkey_mapping['gvkey'].tolist()
    wrds_companies = wrds_conn.raw_sql(
        my.create_wrds_sql_query(
            table='company',
            columns=['gvkey', 'conm', 'conml', 'fic', 'loc', 'weburl'],
            distinct=True,
            conditions={'gvkey': gvkey_list},
            no_observations=-1))

    df_companies['tic'] = company_list
    df_combined = pd.merge(df_companies, gvkey_mapping, on='tic', how='left')
    df_combined = pd.merge(df_combined, wrds_companies, on='gvkey', how='left')

    # fuzzy-match Reuters and Compustat company names
    df_combined['match'] = df_combined.apply(
        lambda x: algorithims.levenshtein(x['tr_indexjlconstituentcomname'],
                                          x['conml']), axis=1)
    df_combined['match'].fillna(0, inplace=True)

    # tiny monotone tie-breaker so every row gets a unique score
    df_combined['row_add'] = [i / 1000000
                              for i in list(range(len(df_combined), 0, -1))]
    df_combined['score'] = df_combined['match'] + df_combined['row_add']
    grouped_df = df_combined.groupby('score')
    maxs = grouped_df.max()
    df_combined = maxs.reset_index()
    df_combined = df_combined[['constituent_ric', 'tic', 'gvkey']]
    df_combined['include_company'] = np.where(df_combined['gvkey'].isna(),
                                              False, True)

    data_dir = my.get_project_directories(key='data_dir')
    output_file_path = os.path.join(data_dir, 'reuters_wrds_mapping.csv')
    df_combined.to_csv(output_file_path, index=False)
def SetStocksInfosRecommendationsInDB(type, connectionstring):
    """
    Stores all the stock recommendation infos in the DB.
    :param type: price_target/consensus
    :param connectionstring: the DB location where the data will be stored.
    """
    if type == type_consensus:
        db = wrds.Connection()
        res = db.raw_sql("select a.cusip, a.ticker from ibes.recddet a "
                         "group by a.cusip, a.ticker")
        db.close()
    elif type == type_price_target:
        db = wrds.Connection()
        res = db.raw_sql("select a.cusip, a.ticker from ibes.ptgdet a "
                         "group by a.cusip, a.ticker")
        db.close()
    else:
        error = "Incorrect argument type. It must be {} or {}."
        raise TypeError(error.format(type_price_target, type_consensus))

    dict_infos = dict()
    for pos in range(res.shape[0]):
        cusip = res['cusip'][pos]
        ticker = res['ticker'][pos]
        if cusip is None:
            cusip = ticker  # fall back to the ticker when no cusip is reported
        dict_infos[(cusip, ticker)] = {'ticker': ticker, 'cusip': cusip}
        # drop the ticker-only fallback entry once a real cusip shows up
        if cusip != ticker:
            if dict_infos.get((ticker, ticker), False):
                del dict_infos[(ticker, ticker)]

    data = []
    for key in dict_infos:
        data.append(dict_infos[key])

    ClientDB = motor.motor_tornado.MotorClient(connectionstring)
    tornado.ioloop.IOLoop.current().run_sync(
        PriceTargetAndconsensusInfosData(ClientDB, type, data).SetInfosInDB)
    ClientDB.close()
def redownload_all_data():
    db = wrds.Connection()
    fetch_and_store_sp500(db)
    recompute_optionsdata(option_type=option_type)
    download_vix_data(db)
    download_treasury_data()
    download_dividends_data(db)
    download_fundamentals_data(db)
    download_names_data(db)
def fetch_and_store_sp500(db):
    if db is None:
        db = wrds.Connection()

    # source historical S&P 500 constituents
    const = db.get_table('compm', 'idxcst_his')

    # source CRSP identifiers
    crsp_id = pd.read_csv(paths['sp500_permnos'])
    crsp_id = crsp_id[crsp_id['ending'] > "1990-12-31"]
    permnos = crsp_id['PERMNO'].values

    print('Loading Price Data')
    permnolist = ", ".join(str(x) for x in permnos)
    prices_raw = db.raw_sql(
        'Select date, permno, cusip, PRC, shrout '
        'from crspa.dsf '
        'where permno in ({}) '
        'and date between \'{}-01-01\' and \'{}-01-01\''.format(
            permnolist, start_year, end_year))
    prices_sp50 = prices_raw
    permnos_m = prices_sp50['permno'].unique()

    # pivot the long price data into one column per permno
    prc_merge = None
    for i in permnos_m:
        x = prices_sp50[prices_sp50['permno'] == i][['date', 'prc']].set_index(
            'date', drop=True)
        x.columns = [i]
        if prc_merge is None:
            prc_merge = x
        else:
            prc_merge = pd.merge(prc_merge, x, how='outer',
                                 left_index=True, right_index=True)
    print('Price Data Loaded')

    with pd.HDFStore(paths['prices_raw']) as store:
        store['Compustat_const'] = const
        store['CRSP_const'] = crsp_id
        store['Prices_raw'] = prices_raw
        store['Prices'] = prc_merge
    return prc_merge, crsp_id
def download_vix_data(db):
    print('Downloading vix data')
    if db is None:
        db = wrds.Connection()
    query = ("select date, vix "
             "from cboe.cboe "
             "where date > '" + str(start_year) + "0101' "
             "and date < '" + str(end_year) + "0101' ")
    vix = db.raw_sql(query)
    store = pd.HDFStore(paths['vix'])
    store['vix'] = vix
    store.close()
def get_indices_d():
    # download daily index data from CRSP with the WRDS api and save as pandas df
    if os.path.isfile(data_input_path + 'crsp/indices_d.pkl'):
        indices_d = pd.read_pickle(data_input_path + 'crsp/indices_d.pkl')
    else:
        db = wrds.Connection()
        indices_d = db.raw_sql('''
                               select *
                               from crsp.dsi
                               ''')
        indices_d.to_pickle(data_input_path + 'crsp/indices_d.pkl')
        db.close()
    return indices_d
def get_crsp_divs():
    # download CRSP dividend data from the WRDS api and save as pandas df
    if os.path.isfile(data_input_path + 'crsp/crsp_divs.pkl'):
        crsp_divs = pd.read_pickle(data_input_path + 'crsp/crsp_divs.pkl')
    else:
        db = wrds.Connection()
        crsp_divs = db.raw_sql('''
                               select a.permno, a.divamt, a.dclrdt, a.exdt, a.paydt
                               from crsp.msedist as a
                               ''')
        crsp_divs.to_pickle(data_input_path + 'crsp/crsp_divs.pkl')
        db.close()
    return crsp_divs
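# get_indices_d, get_crsp_divs, and get_comp_dates (below) repeat the same
# cache-or-download pattern; a hypothetical helper that factors it out:
def cached_raw_sql(cache_path, sql):
    if os.path.isfile(cache_path):
        return pd.read_pickle(cache_path)
    db = wrds.Connection()
    try:
        df = db.raw_sql(sql)
    finally:
        db.close()
    df.to_pickle(cache_path)
    return df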
def main(argv=None):
    # Testing loading 25 BM/OP portfolios:
    # df1, df2 = crsp_comp.load_BM_OP_25_portfolios()
    # crsp_comp.load_ff_12_industry()
    port = pd.read_csv("estimated_data/disaster_sorts/port_sort_const_agg.csv",
                       index_col=[0, 1])
    port = port.reset_index()
    port.form_date = pd.to_datetime(port.form_date)
    db = wrds.Connection()
    port = crsp_comp.get_ff_ind(db, port)
    print(port.head())
def ff_timeseries(upload_type, conn):
    # Download the Fama-French factor time series
    if upload_type == 'from_wrds':
        try:
            _ff = conn.get_table(library='ff', table='factors_monthly')
        except Exception:  # e.g. sqlalchemy OperationalError on a stale connection
            conn = wrds.Connection()
            _ff = conn.get_table(library='ff', table='factors_monthly')
    elif upload_type == 'from_file':
        _ff = pd.read_pickle(r'data/fama_french_ts.pkl')
        if 'Unnamed: 0' in _ff.columns:
            _ff = _ff.drop('Unnamed: 0', axis=1)
    return _ff
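# The reconnect-and-retry pattern above also appears in ccm_block and
# crsp_block; a hypothetical refactoring that captures it once:
def fetch_with_reconnect(conn, fetch):
    # run fetch(conn); on failure, reopen the WRDS session once and retry
    try:
        return fetch(conn)
    except Exception:  # e.g. a stale or expired session
        conn = wrds.Connection()
        return fetch(conn)

# e.g. fetch_with_reconnect(conn, lambda c: c.get_table(library='ff', table='factors_monthly'))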
def make_db_connection():
    """
    Creates a connection to the WRDS database.
    Credentials must be entered at the prompt unless they are set in the environment.
    """
    wrds_uname = os.environ.get('wrds_username')
    wrds_pass = os.environ.get('wrds_password')
    # tries to use pgpass file; see here:
    # https://wrds-www.wharton.upenn.edu/pages/support/accessing-wrds-remotely/troubleshooting-pgpass-file-remotely/
    db = wrds.Connection(wrds_username=wrds_uname, wrds_password=wrds_pass)
    # saves credentials, but pgpass not working:
    # db.create_pgpass_file()
    return db
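# Quick sanity check (hypothetical; relies on wrds_username/wrds_password
# being set in the environment, as read above):
def example_connection_check():
    db = make_db_connection()
    print(db.list_libraries()[:10])  # a few of the schemas this login can see
    db.close()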
def __init__(self, connect: bool = True):
    """Sets up the Downloader to connect to WRDS.

    Will ask for WRDS credentials when connecting for the first time.

    Args:
        connect (bool): if True, open the WRDS connection immediately;
            otherwise store False so a lazy accessor can connect later.
    """
    # database connection
    if connect:
        self._connection = wrds.Connection()
    else:
        self._connection = connect  # i.e. False until first use
def get_comp_dates():
    # download and process the mapping table for Compustat filing dates from
    # the WRDS api and save as pandas df
    if os.path.isfile(data_input_path + 'compustat/as_of_dates.pkl'):
        comp_dates = pd.read_pickle(data_input_path + 'compustat/as_of_dates.pkl')
    else:
        db = wrds.Connection()
        comp_dates = db.raw_sql('''
                                select distinct gvkey, datadate, rdq
                                from comp.fundq
                                ''')
        comp_dates.to_pickle(data_input_path + 'compustat/as_of_dates.pkl')
        db.close()
    return comp_dates
def get_ff_factors():
    db = wrds.Connection()
    # assumes the ff schema is on the search path; otherwise qualify the
    # table as ff.factors_monthly
    ff_query = """
    select date, mktrf, smb, hml, umd, rf
    from factors_monthly
    """
    factors_df = db.raw_sql(ff_query.replace('\n', ' '))
    factors_df['date'] = pd.to_datetime(factors_df['date']) + MonthEnd(0)
    # all factors are available from 1927-01-01 through 2018-05-01
    factors_df = factors_df.dropna()
    return factors_df
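# Worked example (hypothetical `returns` frame with month-end 'date' and 'ret'
# columns): merge the factors and form excess returns for a regression.
def example_excess_returns(returns):
    factors = get_ff_factors()
    panel = pd.merge(returns, factors, on='date', how='inner')
    panel['exret'] = panel['ret'] - panel['rf']  # return over the risk-free rate
    return panel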