def load_to_sql(self, tables_to_load, table_location):
    """ The main function that processes and loads the auxiliary data into
    the database. For each table listed in the tables_to_load list, its CSV
    file is loaded and the data moved into the SQL database. If the table is
    for indices, the CSV data is passed to the find_symbol_id function,
    where the ticker is replaced with its respective symbol_id.

    :param tables_to_load: List of strings
    :param table_location: String of the directory for the load tables
    :return: Nothing. Data is just loaded into the SQL database.
    """

    start_time = time.time()

    for table, query in tables.items():
        if table in tables_to_load:
            try:
                file = os.path.abspath(os.path.join(table_location,
                                                    table + '.csv'))
                table_df = pd.read_csv(file, encoding='ISO-8859-1')
            except Exception as e:
                print('Unable to load the %s CSV load file. Skipping it.'
                      % table)
                print(e)
                continue

            if table in ('indices', 'tickers'):
                # ToDo: Re-implement these tables; need symbol_id
                print('Unable to process the indices and tickers tables '
                      'since there is no system to create a unique '
                      'symbol_id for each item.')
                continue

                # # Removes the column that has the company's name
                # table_df.drop('ticker_name', axis=1, inplace=True)
                # # Finds the tsid for each ticker
                # table_df = self.find_tsid(table_df)
                # if table == 'tickers':
                #     table_df.to_csv('load_tables/tickers_df.csv',
                #                     index=False)

            # Retrieve any existing values for this table
            existing_df = query_load_table(
                database=self.database, user=self.user,
                password=self.password, host=self.host, port=self.port,
                table=table)

            # Find the values that are different between the two DataFrames
            altered_df = self.altered_values(
                existing_df=existing_df, new_df=table_df)

            altered_df.insert(len(altered_df.columns), 'created_date',
                              datetime.now().isoformat())
            altered_df.insert(len(altered_df.columns), 'updated_date',
                              datetime.now().isoformat())

            # Get the id column for the current table (first column)
            id_col_name = list(altered_df.columns.values)[0]

            # Separate out the new and updated values from the altered_df
            new_df = (altered_df[~altered_df[id_col_name].
                      isin(existing_df[id_col_name])])
            updated_df = (altered_df[altered_df[id_col_name].
                          isin(existing_df[id_col_name])])

            # Update all modified values within the database
            update_load_table(database=self.database, user=self.user,
                              password=self.password, host=self.host,
                              port=self.port, values_df=updated_df,
                              table=table)

            # Append all new values to the database
            df_to_sql(database=self.database, user=self.user,
                      password=self.password, host=self.host,
                      port=self.port, df=new_df, sql_table=table,
                      exists='append', item=table)

            print('Loaded %s into the %s database' % (table, self.database))

    load_tables_excluded = [table for table in tables_to_load
                            if table not in tables.keys()]
    if load_tables_excluded:
        print('Unable to load the following tables: %s' %
              (', '.join(load_tables_excluded)))
        print("If the CSV file exists, make sure its name matches the "
              "name in the tables dictionary.")

    print('Finished loading all selected tables taking %0.1f seconds'
          % (time.time() - start_time))
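
# ---------------------------------------------------------------------------
# Illustration (not part of the pipeline): the new-versus-updated split in
# load_to_sql is a plain pandas isin() partition on the table's first (id)
# column. The column names and rows below are invented sample data; this is
# a minimal sketch of the upsert partition only.
# ---------------------------------------------------------------------------
def _example_upsert_partition():
    import pandas as pd

    # As if returned by query_load_table and altered_values, respectively
    existing_df = pd.DataFrame({'exchange_id': [1, 2],
                                'name': ['NYSE', 'AMEX']})
    altered_df = pd.DataFrame({'exchange_id': [2, 3],
                               'name': ['AMEX (NYSE MKT)', 'TSX']})

    # The first column is treated as the id column, as in load_to_sql
    id_col_name = altered_df.columns[0]

    # Ids absent from the existing table are appended via df_to_sql...
    new_df = altered_df[~altered_df[id_col_name].
                        isin(existing_df[id_col_name])]
    # ...and ids that already exist are rewritten via update_load_table
    updated_df = altered_df[altered_df[id_col_name].
                            isin(existing_df[id_col_name])]

    print(new_df)       # exchange_id 3 (TSX)
    print(updated_df)   # exchange_id 2 (AMEX (NYSE MKT))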
def create_symbology(database, user, password, host, port, source_list):
    """ Create the symbology table. Use the CSI numbers as the unique symbol
    identifiers. See if they already exist within the symbology table, and
    if not, add them. For the source, use either 'csi_data' or the data
    vendor ID. For the source_id, use the actual CSI number.

    After the initial unique symbols are created, map the Quandl codes to
    their respective symbol ID. This can be done using the ticker and
    exchange combination, as it will always be unique (for active
    tickers...). For Quandl codes that don't have a match, create a unique
    symbol ID for them in a separate number range (1M - 2M; B#).

    Use this same matching structure for mapping basket items.

    Steps:
    1. Download the CSI stockfacts data and store it in the database
    2. Add the CSI data's number ID to the symbology table if it hasn't
        been done yet
    3. Map Quandl codes to a unique ID

    :param database: String of the database name
    :param user: String of the username used to login to the database
    :param password: String of the password used to login to the database
    :param host: String of the database address (localhost, url, ip, etc.)
    :param port: Integer of the database port number (5432)
    :param source_list: List of strings with the symbology sources to use
    """

    exch_df = query_exchanges(database=database, user=user,
                              password=password, host=host, port=port)

    # ToDo: Add economic_events codes

    for source in source_list:
        source_start = time.time()

        # Retrieve any existing ID values from the symbology table
        existing_symbology_df = query_existing_sid(
            database=database, user=user, password=password, host=host,
            port=port, source=source)

        if source == 'csi_data':
            csi_stock_df = query_csi_stocks(
                database=database, user=user, password=password, host=host,
                port=port, query='all')

            # csi_data is unique where the sid (csi_num) is the source_id
            csi_stock_df['ticker'] = csi_stock_df['sid']

            # Find the values that are different between the two DataFrames
            altered_values_df = altered_values(
                existing_df=existing_symbology_df, new_df=csi_stock_df)

            # Prepare a new DataFrame with all relevant data for these values
            altered_df = pd.DataFrame()
            altered_df.insert(0, 'symbol_id', altered_values_df['sid'])
            altered_df.insert(1, 'source', source)
            altered_df.insert(2, 'source_id', altered_values_df['sid'])
            altered_df.insert(3, 'type', 'stock')
            altered_df.insert(len(altered_df.columns), 'created_date',
                              datetime.now().isoformat())
            altered_df.insert(len(altered_df.columns), 'updated_date',
                              datetime.now().isoformat())

        elif source in ['tsid', 'quandl_wiki', 'quandl_goog',
                        'seeking_alpha', 'yahoo']:
            # These sources share a similar symbology creation process

            if source == 'quandl_wiki':
                # I don't trust that Quandl provides all available WIKI codes
                #   in the downloadable tables, thus I imply plausible WIKI
                #   codes: NYSE and NASDAQ tickers that are active or
                #   recently delisted.

                # DataFrame of main US active tickers, their exchanges and
                #   child exchanges
                csi_stock_df = query_csi_stocks(
                    database=database, user=user, password=password,
                    host=host, port=port, query='main_us')

                # If a ticker has a '.', '+' or '-', change it to an
                #   underscore
                csi_stock_df['ticker'] = csi_stock_df['ticker'].\
                    replace(to_replace=r'[.+-]', value='_', regex=True)

                # Need to add 'WIKI/' before every ticker to make it
                #   compatible with the Quandl WIKI code structure
                csi_stock_df['ticker'] = csi_stock_df['ticker'].\
                    apply(lambda x: 'WIKI/' + x)

            elif source == 'quandl_goog':
                # Imply plausible Quandl codes for their GOOG database. Only
                #   codes for American, Canadian and London exchanges.
                csi_stock_df = query_csi_stocks(
                    database=database, user=user, password=password,
                    host=host, port=port, query='exchanges_only')

                # If a ticker has a '.', '+' or '-', change it to an
                #   underscore
                csi_stock_df['ticker'] = csi_stock_df['ticker'].\
                    replace(to_replace=r'[.+-]', value='_', regex=True)

                def csi_to_quandl_goog(row):
                    # Create the Quandl GOOG symbol combination of:
                    #   GOOG/<goog exchange symbol>_<ticker>
                    ticker = row['ticker']
                    exchange = row['exchange']
                    child_exchange = row['child_exchange']

                    if child_exchange == 'NYSE ARCA':
                        # NYSE ARCA is a special situation where the child
                        #   exchange matches the csi_symbol
                        goog_exch = (exch_df.loc[exch_df['csi_symbol'] ==
                                                 child_exchange,
                                                 'goog_symbol'].values)
                        if goog_exch:
                            return 'GOOG/' + goog_exch[0] + '_' + ticker
                        else:
                            print('Unable to find the goog exchange symbol '
                                  'for the child exchange %s in '
                                  'csi_to_quandl_goog' % child_exchange)
                    else:
                        # For all non NYSE ARCA exchanges, see if there is a
                        #   child exchange and if so, try matching that to
                        #   the csi_symbol (first) or name (second). If no
                        #   child exchange, try matching to the csi_symbol.
                        if child_exchange:
                            # (exch: AMEX | chld_exch: NYSE)
                            goog_exch = (exch_df.loc[exch_df['csi_symbol'] ==
                                                     child_exchange,
                                                     'goog_symbol'].values)
                            if goog_exch:
                                return 'GOOG/' + goog_exch[0] + '_' + ticker
                            else:
                                # (exch: NYSE | chld_exch: OTC Markets QX)
                                # (exch: AMEX | chld_exch: BATS Global
                                #   Markets)
                                goog_exch = (exch_df.loc[exch_df['name'] ==
                                                         child_exchange,
                                                         'goog_symbol'].
                                             values)
                                if goog_exch:
                                    return ('GOOG/' + goog_exch[0] + '_' +
                                            ticker)
                                else:
                                    print('Unable to find the goog exchange '
                                          'symbol for the child exchange %s '
                                          'in csi_to_quandl_goog. Will try '
                                          'to find a match for the exchange '
                                          'now.' % child_exchange)

                        # If there is an exchange, try to match that
                        if exchange:
                            # Either no child exchange or the child exchange
                            #   never found a match
                            goog_exch = (exch_df.loc[exch_df['csi_symbol'] ==
                                                     exchange,
                                                     'goog_symbol'].values)
                            if goog_exch:
                                return 'GOOG/' + goog_exch[0] + '_' + ticker
                            else:
                                print('Unable to find the goog exchange '
                                      'symbol for the exchange %s in '
                                      'csi_to_quandl_goog' % exchange)
                        else:
                            print('Unable to find the goog exchange symbol '
                                  'for either the exchange or child exchange '
                                  'for %s:%s' % (exchange, child_exchange))

                csi_stock_df['ticker'] = csi_stock_df.apply(
                    csi_to_quandl_goog, axis=1)

            elif source == 'seeking_alpha':
                # Use main US tickers that should have Seeking Alpha articles
                csi_stock_df = query_csi_stocks(
                    database=database, user=user, password=password,
                    host=host, port=port, query='main_us')

                # If a ticker has a '.', '+' or '-', change it to an
                #   underscore
                csi_stock_df['ticker'] = csi_stock_df['ticker'].\
                    replace(to_replace=r'[.+-]', value='_', regex=True)

            elif source == 'tsid':
                # Build tsid codes (<ticker>.<exchange>.<position>), albeit
                #   only for American, Canadian and London exchanges.
                csi_stock_df = query_csi_stocks(
                    database=database, user=user, password=password,
                    host=host, port=port, query='exchanges_only')

                # If a ticker has a '.', '+' or '-', change it to an
                #   underscore
                csi_stock_df['ticker'] = csi_stock_df['ticker'].\
                    replace(to_replace=r'[.+-]', value='_', regex=True)

                def csi_to_tsid(row):
                    # Create the tsid symbol combination of:
                    #   <ticker>.<tsid exchange symbol>.<count>
                    ticker = row['ticker']
                    exchange = row['exchange']
                    child_exchange = row['child_exchange']

                    if child_exchange == 'NYSE ARCA':
                        # NYSE ARCA is a special situation where the child
                        #   exchange matches the csi_symbol
                        tsid_exch = (exch_df.loc[exch_df['csi_symbol'] ==
                                                 child_exchange,
                                                 'tsid_symbol'].values)
                        if tsid_exch:
                            return ticker + '.' + tsid_exch[0] + '.0'
                        else:
                            print('Unable to find the tsid exchange symbol '
                                  'for the child exchange %s in csi_to_tsid'
                                  % child_exchange)
                    else:
                        # For all non NYSE ARCA exchanges, see if there is a
                        #   child exchange and if so, try matching that to
                        #   the csi_symbol (first) or name (second). If no
                        #   child exchange, try matching to the csi_symbol.
                        if child_exchange:
                            # (exch: AMEX | chld_exch: NYSE)
                            tsid_exch = (exch_df.loc[exch_df['csi_symbol'] ==
                                                     child_exchange,
                                                     'tsid_symbol'].values)
                            if tsid_exch:
                                return ticker + '.' + tsid_exch[0] + '.0'
                            else:
                                # (exch: NYSE | chld_exch: OTC Markets QX)
                                # (exch: AMEX | chld_exch: BATS Global
                                #   Markets)
                                tsid_exch = (exch_df.loc[exch_df['name'] ==
                                                         child_exchange,
                                                         'tsid_symbol'].
                                             values)
                                if tsid_exch:
                                    return (ticker + '.' + tsid_exch[0] +
                                            '.0')
                                else:
                                    print('Unable to find the tsid exchange '
                                          'symbol for the child exchange %s '
                                          'in csi_to_tsid. Will try to find '
                                          'a match for the exchange now.'
                                          % child_exchange)

                        # If there is an exchange, try to match that
                        if exchange:
                            # Either no child exchange or the child exchange
                            #   never found a match
                            tsid_exch = (exch_df.loc[exch_df['csi_symbol'] ==
                                                     exchange,
                                                     'tsid_symbol'].values)
                            if tsid_exch:
                                return ticker + '.' + tsid_exch[0] + '.0'
                            else:
                                print('Unable to find the tsid exchange '
                                      'symbol for the exchange %s in '
                                      'csi_to_tsid' % exchange)
                        else:
                            print('Unable to find the tsid exchange symbol '
                                  'for either the exchange or child exchange '
                                  'for %s:%s' % (exchange, child_exchange))

                csi_stock_df['ticker'] = csi_stock_df.apply(csi_to_tsid,
                                                            axis=1)

            elif source == 'yahoo':
                # Imply plausible Yahoo codes, albeit only for American,
                #   Canadian and London exchanges.
                csi_stock_df = query_csi_stocks(
                    database=database, user=user, password=password,
                    host=host, port=port, query='exchanges_only')

                # If a ticker has a '.', '+' or '-', change it to an
                #   underscore
                csi_stock_df['ticker'] = csi_stock_df['ticker'].\
                    replace(to_replace=r'[.+-]', value='_', regex=True)

                def csi_to_yahoo(row):
                    # Create the Yahoo symbol combination of:
                    #   <ticker>.<exchange>
                    ticker = row['ticker']
                    exchange = row['exchange']
                    child_exchange = row['child_exchange']

                    us_exchanges = ['AMEX', 'BATS Global Markets',
                                    'Nasdaq Capital Market',
                                    'Nasdaq Global Market',
                                    'Nasdaq Global Select', 'NYSE',
                                    'NYSE ARCA']

                    if exchange in ['AMEX', 'NYSE'] \
                            or child_exchange in us_exchanges:
                        return ticker   # US ticker; no exchange needed
                    elif exchange == 'LSE':
                        return ticker + '.L'    # LSE -> L
                    elif exchange == 'TSX':
                        return ticker + '.TO'   # TSX -> TO
                    elif exchange == 'VSE':
                        return ticker + '.V'    # VSE -> V
                    elif child_exchange == 'OTC Markets Pink Sheets':
                        return ticker + '.PK'   # OTC Pinks -> PK
                    else:
                        print('csi_to_yahoo did not find a match for %s '
                              'with an exchange of %s and a child exchange '
                              'of %s' % (ticker, exchange, child_exchange))

                csi_stock_df['ticker'] = csi_stock_df.apply(csi_to_yahoo,
                                                            axis=1)

            else:
                raise NotImplementedError('%s is not implemented in the '
                                          'create_symbology function of '
                                          'build_symbology.py' % source)

            # Remove post-processed duplicates to prevent database FK errors
            csi_stock_df.drop_duplicates(subset=['ticker'], inplace=True)

            # Find the values that are different between the two DataFrames
            altered_values_df = altered_values(
                existing_df=existing_symbology_df, new_df=csi_stock_df)

            # Prepare a new DataFrame with all relevant data for these values
            altered_df = pd.DataFrame()
            altered_df.insert(0, 'symbol_id', altered_values_df['sid'])
            altered_df.insert(1, 'source', source)
            altered_df.insert(2, 'source_id', altered_values_df['ticker'])
            altered_df.insert(3, 'type', 'stock')
            altered_df.insert(len(altered_df.columns), 'created_date',
                              datetime.now().isoformat())
            altered_df.insert(len(altered_df.columns), 'updated_date',
                              datetime.now().isoformat())

        else:
            raise NotImplementedError('%s is not implemented in the '
                                      'create_symbology function of '
                                      'build_symbology.py' % source)

        # Separate out the updated values from the altered_df
        updated_symbols_df = (altered_df[altered_df['symbol_id'].
                              isin(existing_symbology_df['symbol_id'])])

        # Update all modified symbology values in the database
        update_symbology_values(database=database, user=user,
                                password=password, host=host, port=port,
                                values_df=updated_symbols_df)

        # Separate out the new values from the altered_df
        new_symbols_df = (altered_df[~altered_df['symbol_id'].
                          isin(existing_symbology_df['symbol_id'])])

        # Append the new symbol values to the existing database table
        df_to_sql(database=database, user=user, password=password, host=host,
                  port=port, df=new_symbols_df, sql_table='symbology',
                  exists='append', item=source)

        print('Finished processing the symbology IDs for %s taking '
              '%0.2f seconds' % (source, (time.time() - source_start)))

    print('Added all %i sources to the symbology table.' %
          (len(source_list),))
def validator(self, tsid):
    """ Calculate a weighted consensus price for each period of this tsid,
    using the prices stored from every data vendor, and save the results to
    the database under the pySecMaster_Consensus vendor.

    :param tsid: String of the tsid to cross validate
    """

    tsid_start = time.time()

    # DataFrame of all stored prices for this ticker and interval. This is
    #   a multi-index DataFrame, with date and data_vendor_id in the index.
    tsid_prices_df = query_all_tsid_prices(
        database=self.database, user=self.user, password=self.password,
        host=self.host, port=self.port, table=self.table, tsid=tsid)

    unique_sources = tsid_prices_df.index.\
        get_level_values('data_vendor_id').unique()
    unique_dates = tsid_prices_df.index.get_level_values('date').unique()

    # If a period is provided, limit the unique_dates list to only those
    #   within the past n period days.
    if self.period:
        beg_date = datetime.today() - timedelta(days=self.period)
        unique_dates = unique_dates[unique_dates > beg_date]

    # The consensus_price_df contains the prices from the weighted consensus
    if self.table == 'daily_prices':
        consensus_price_df = pd.DataFrame(
            columns=['date', 'open', 'high', 'low', 'close', 'volume',
                     'ex_dividend', 'split_ratio'])
    elif self.table == 'minute_prices':
        consensus_price_df = pd.DataFrame(
            columns=['date', 'open', 'high', 'low', 'close', 'volume'])
    else:
        raise NotImplementedError('Table %s is not implemented within '
                                  'CrossValidate.validator' % self.table)

    # Set the date as the index
    consensus_price_df.set_index(['date'], inplace=True)

    # Cycle through each period, comparing each data source's prices
    for date in unique_dates:

        # Either add each field's consensus price to a dictionary, which is
        #   entered into the consensus_price_df upon all fields being
        #   processed, or enter each field's consensus price directly into
        #   the consensus_price_df. Right now, this is doing the latter.
        # consensus_prices = {}

        try:
            # Create a DataFrame for the current period, with the source_ids
            #   as the index and the data_columns as the column headers
            period_df = tsid_prices_df.xs(date, level='date')
        except KeyError:
            # Should never happen
            print('Unable to extract the %s period\'s prices from '
                  'the tsid_prices_df for %s' % (date, tsid))
            continue

        # Transpose the period_df DataFrame so the source_ids are columns
        #   and the price fields are the rows
        period_df = period_df.transpose()

        # Cycle through each price field for this period's values
        for field_index, field_data in period_df.iterrows():
            # field_index: string of the index name
            # field_data: pandas Series (always?) of the field data

            # Reset the field consensus for every field processed
            field_consensus = {}

            # Cycle through each source's value in the field_data Series;
            #   each item is a (data_vendor_id, value) pair.
            for vendor_id, value in field_data.items():
                # If the source's id is in the exclude list, don't use its
                #   price when calculating the field consensus.
                if vendor_id not in self.source_id_exclude_list:
                    # Only process the source value if it is not None
                    if value is not None:
                        # Retrieve the consensus weight for this source
                        source_weight = self.source_weights_df.loc[
                            self.source_weights_df['data_vendor_id'] ==
                            vendor_id, 'consensus_weight']
                        try:
                            if field_consensus:
                                # There's already a value for this field
                                if value in field_consensus:
                                    # This source's value has a match in the
                                    #   current consensus. Increase the
                                    #   weight for this price.
                                    field_consensus[value] += \
                                        source_weight.iloc[0]
                                else:
                                    # Data value from the source does not
                                    #   match this field's consensus
                                    field_consensus[value] = \
                                        source_weight.iloc[0]
                            else:
                                # Add the first price to the field_consensus
                                #   dictionary, using the price as the key
                                #   and the source's weight as the item.
                                field_consensus[value] = \
                                    source_weight.iloc[0]
                        except IndexError:
                            # No source_weight was found, probably because
                            #   this data_vendor_id has no consensus weight
                            pass

            # Insert the highest consensus value for this period into the
            #   consensus_price_df (the dictionary key (price) with the
            #   largest value (consensus sum)).
            try:
                consensus_value = max(field_consensus.items(),
                                      key=operator.itemgetter(1))[0]
            except ValueError:
                # None of the sources had any values, thus use -1
                consensus_value = -1

            consensus_price_df.loc[date, field_index] = consensus_value

    # Make the date index into a normal column
    consensus_price_df.reset_index(inplace=True)

    # Convert the datetime object to an ISO date
    consensus_price_df['date'] = consensus_price_df['date'].\
        apply(lambda x: x.isoformat())

    # Add the vendor id of the pySecMaster_Consensus as a column
    validator_id = query_data_vendor_id(
        database=self.database, user=self.user, password=self.password,
        host=self.host, port=self.port, name='pySecMaster_Consensus')

    consensus_price_df.insert(0, 'data_vendor_id', validator_id)
    consensus_price_df.insert(1, 'source', 'tsid')
    consensus_price_df.insert(2, 'source_id', tsid)

    # Add the current date to the last column
    consensus_price_df.insert(len(consensus_price_df.columns),
                              'updated_date', datetime.now().isoformat())

    if validator_id in unique_sources:
        delete_start = time.time()
        # Data from the cross validation process has already been saved to
        #   the database before, thus it must be removed before adding the
        #   newly calculated values.

        if self.period:
            # Only delete prior consensus values for this tsid that are
            #   newer than the beg_date (current date - replace period).
            delete_query = ("""DELETE FROM %s
                            WHERE source_id='%s'
                            AND data_vendor_id='%s'
                            AND date>'%s'""" %
                            (self.table, tsid, validator_id,
                             beg_date.isoformat()))
        else:
            # Delete all existing consensus values for this tsid.
            delete_query = ("""DELETE FROM %s
                            WHERE source_id='%s'
                            AND data_vendor_id='%s'""" %
                            (self.table, tsid, validator_id))

        retry_count = 5
        while retry_count > 0:
            retry_count -= 1

            delete_status = delete_sql_table_rows(
                database=self.database, user=self.user,
                password=self.password, host=self.host, port=self.port,
                query=delete_query, table=self.table, item=tsid)
            if delete_status == 'success':
                # Add the validated values to the relevant price table AFTER
                #   ensuring that the duplicates were deleted successfully
                df_to_sql(database=self.database, user=self.user,
                          password=self.password, host=self.host,
                          port=self.port, df=consensus_price_df,
                          sql_table=self.table, exists='append', item=tsid)
                break

        # print('Data table replacement took %0.2f seconds' %
        #       (time.time() - delete_start))
    else:
        # Add the validated values to the relevant price table
        df_to_sql(database=self.database, user=self.user,
                  password=self.password, host=self.host, port=self.port,
                  df=consensus_price_df, sql_table=self.table,
                  exists='append', item=tsid)

    # For period updates, slow down the process to allow PostgreSQL to
    #   catch up
    if self.period:
        time.sleep(1.5)

    if self.verbose:
        print('%s data cross-validation took %0.2f seconds to complete.'
              % (tsid, time.time() - tsid_start))
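
# ---------------------------------------------------------------------------
# Illustration (not part of the pipeline): the consensus rule in validator
# is a weighted plurality vote per price field. This standalone sketch uses
# invented vendor ids, weights and prices.
# ---------------------------------------------------------------------------
def _example_weighted_consensus():
    import operator

    def weighted_consensus(values_by_vendor, weights, default=-1):
        # Pick the price backed by the largest total vendor weight
        tally = {}
        for vendor_id, price in values_by_vendor.items():
            if price is None or vendor_id not in weights:
                continue
            tally[price] = tally.get(price, 0) + weights[vendor_id]
        try:
            # Dictionary key (price) with the largest summed weight
            return max(tally.items(), key=operator.itemgetter(1))[0]
        except ValueError:
            # No vendor provided a value; validator stores -1 in this case
            return default

    # Two lightly weighted vendors that agree (0.6 + 0.5) outvote one
    #   heavier vendor (1.0)
    weights = {1: 1.0, 2: 0.6, 3: 0.5}
    print(weighted_consensus({1: 101.2, 2: 101.25, 3: 101.25}, weights))
    # 101.25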