def get_lps_share_integral_for_pools(_df, _realtime=None, _exclusions={}):
    LOGGER.debug('get_lps_share_integral_for_pools')
    pools_list = _df.index.get_level_values('pool_address').drop_duplicates()
    lm_shares_df = pd.DataFrame()
    for pool in pools_list:
        LOGGER.info('Computing shares of incentives for pool ' + pool)
        pool_data = _df.loc[pool]
        uneligible_lps = [
            address.lower() for address in _exclusions.get(pool, [])
        ]
        eligible_lps_pool_data = pool_data.query(
            f'lp_address not in {uneligible_lps}')
        if len(uneligible_lps) > 0:
            LOGGER.info(
                f'Total LPs: {len(pool_data.index.get_level_values("lp_address").drop_duplicates())}'
            )
            LOGGER.info(f'Ineligible LPs: {uneligible_lps}')
            LOGGER.info(
                f'Eligible LPs: {len(eligible_lps_pool_data.index.get_level_values("lp_address").drop_duplicates())}'
            )
        lps_shares = get_lps_share_integral_for_pool(
            eligible_lps_pool_data,
            _realtime=_realtime,
            _exclusions=uneligible_lps)
        pool_df = pd.DataFrame(lps_shares)
        pool_df['pool_address'] = pool
        lm_shares_df = lm_shares_df.append(pool_df)
    lm_shares_df = lm_shares_df.reset_index().set_index(
        ['pool_address', 'lp_address'])
    return lm_shares_df
def train(model, epochs, optimizer, grad, train_dataset):
    """ A main training function for training the models. """
    # Keep results for plotting
    train_loss_results = []
    train_accuracy_results = []

    for epoch in tqdm(range(epochs)):
        epoch_loss_avg = tf.keras.metrics.Mean()
        epoch_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

        # Training loop - using batches of 32
        for x, y, timeStep in train_dataset:
            # Optimize the model
            loss_value, grads = grad(model, x, y)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

            # Track progress
            epoch_loss_avg(loss_value)  # Add current batch loss
            # Compare predicted label to actual label
            # training=True is needed only if there are layers with different
            # behavior during training versus inference (e.g. Dropout).
            epoch_accuracy(y, model(x, training=True))

        # End epoch
        train_loss_results.append(epoch_loss_avg.result())
        train_accuracy_results.append(epoch_accuracy.result())

        if epoch % 50 == 0:
            LOGGER.info("Epoch {:03d}: Loss: {:.3f}, Accuracy: {:.3%}".format(
                epoch, epoch_loss_avg.result(), epoch_accuracy.result()))

    return train_loss_results, train_accuracy_results
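# Hedged sketch, not from the source: one way to define the `grad` callable that
# train() above expects, returning (loss_value, gradients). The loss object is an
# assumption; the real project may use a different loss or signature.
import tensorflow as tf

_loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()

def grad(model, x, y):
    # Record the forward pass and differentiate the loss w.r.t. the trainable weights.
    with tf.GradientTape() as tape:
        loss_value = _loss_fn(y, model(x, training=True))
    return loss_value, tape.gradient(loss_value, model.trainable_variables)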
def get_lps_share_integral_for_pool(_df, _realtime=None, _exclusions={}):
    LOGGER.debug('get_lps_share_integral_for_pool')
    df = compute_LM_power_timeseries(_df)
    latest_timestamp = df.index.get_level_values('block_timestamp').max()
    intervals = compute_timestamp_intervals(
        df.index.get_level_values('block_timestamp'), _realtime)
    df = df.join(intervals)
    total_lm_power = compute_total_LM_power_timeseries(df)
    df = df.join(total_lm_power)
    df['lm_share'] = df['lm_power'] / df['total_lm_power']
    df['share_integral'] = df['lm_share'] * df['state_duration']
    latest_share = df.iloc[df.index.get_locs(
        [slice(None), latest_timestamp])]['lm_share']
    latest_share = latest_share.droplevel('block_timestamp')
    total_share_integral = df['share_integral'].sum()
    lp_lm_share = df.groupby(
        'lp_address')['share_integral'].sum() / total_share_integral
    result = latest_share.to_frame().join(lp_lm_share)
    result.columns = ['latest_share', 'share_integral']
    return result
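# Hedged worked example, not from the source: how lm_share and state_duration
# combine into a time-weighted share of incentives. All numbers are made up.
import pandas as pd

_states = pd.DataFrame({
    'lp_address': ['lp_a', 'lp_b', 'lp_a', 'lp_b'],
    'lm_share': [0.5, 0.5, 0.25, 0.75],                   # share of the pool at each state change
    'state_duration': [3600.0, 3600.0, 7200.0, 7200.0],   # seconds the state lasted
})
_states['share_integral'] = _states['lm_share'] * _states['state_duration']
_total = _states['share_integral'].sum()
print(_states.groupby('lp_address')['share_integral'].sum() / _total)
# lp_a: (0.5*3600 + 0.25*7200) / 10800 = 1/3;  lp_b: 2/3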
def get_lm_allocations(_chain_id, _week_number=0, _realtime=None):
    LOGGER.debug('get_lm_allocations')
    week_passed = 3 / 7
    if _realtime:
        _week_number = get_current_lm_week_number()
        week_passed = get_percent_week_passed()
    jsonurl = urlopen(V2_LM_ALLOCATION_URL)
    try:
        week_allocation = json.loads(jsonurl.read())[f'week_{_week_number}']
    except KeyError:
        week_allocation = {}
    for chain_allocation in week_allocation:
        if chain_allocation['chainId'] == _chain_id:
            df = pd.DataFrame()
            for pool, rewards in chain_allocation['pools'].items():
                for r in rewards:
                    if r['tokenAddress'] in EXCLUDED_POOLS_TOKENS.get(pool, []):
                        continue
                    pool_address = pool[:42].lower()
                    df.loc[pool_address,
                           r['tokenAddress']] = r['amount'] * week_passed
            if len(df) == 0:
                LOGGER.info('No incentives for this chain')
                continue
            df.fillna(0, inplace=True)
            df.index.name = 'pool_address'
            return df, week_passed
def clean_financial_data(df):
    """
    Cleans the data for further processing.
    * param: df - Dataframe.
    * returns: Cleaned dataframe.
    """
    LOGGER.info("Cleaning the financial data.")
    df_slice = df[['Identifier (RIC)', 'Company Name', 'Date of Insolvency',
                   'Score', 'Discrimination']]
    df = df.drop(['Identifier (RIC)', 'Company Name', 'Date of Insolvency',
                  'Score', 'Discrimination'], axis=1)
    columns = df.columns

    def convert_string_to_float(x):
        for col in columns:
            # Strip thousands separators, placeholder dashes and percent signs,
            # then rescale to millions.
            x[col] = float(str(x[col]).replace(",", "").replace(
                ' - ', str(0)).replace("%", "")) / 1000000
        return x

    df.replace(to_replace='#DIV/0!', value=0, inplace=True)
    df = df.apply(convert_string_to_float, axis=1)
    return pd.concat([df, df_slice], sort=False)
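# Hedged micro-example, not from the source: the string cleaning performed by
# convert_string_to_float above, applied to a made-up spreadsheet value.
_raw = '1,234,567'
print(float(_raw.replace(",", "").replace(' - ', '0').replace("%", "")) / 1000000)
# -> 1.234567 (value rescaled to millions)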
def __next__(self):
    try:
        name, dataframe = next(self.groupby_iter)
        marketData = read_market_data(name)
        financial_data = clean_financial_data(dataframe)
        return merge_f_m_data(financial_data, marketData)
    except StopIteration:
        # Let the iterator protocol end the loop normally.
        raise
    except Exception as ex:
        LOGGER.error(f"Error occurred. {ex}. Skipping.")
def fit(name: str, model: Estimator, params: t.Mapping[str, float],
        filename: Path, X: pd.DataFrame, y: pd.Series, /) -> None:
    LOGGER.info(f'Run {name}')
    m = (make_pipeline(model) if params is None else GridSearchCV(
        estimator=make_pipeline(model),
        param_grid=params,
        scoring='neg_root_mean_squared_error',
        n_jobs=3,
        cv=5)).fit(X, y)
    joblib.dump(value=m if params is None else m.best_estimator_,
                filename=filename)
def apply_redirects(_data, _realtime=None, _redirects=None):
    if _redirects:
        redirects = _redirects
    else:
        redirects = get_redirects(_realtime)
    result = pd.DataFrame(_data).reset_index()
    result['lp_address'] = result['lp_address'].apply(Web3.toChecksumAddress)
    n = len(result['lp_address'][result['lp_address'].isin(redirects.keys())])
    LOGGER.debug(f'{n} redirectors found amongst the recipients')
    result['lp_address'] = result['lp_address'].apply(
        lambda x: redirects.get(x, x))
    result = result.groupby('lp_address').sum()
    return result, n
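# Hedged sketch, not from the source: how a redirect mapping collapses reward
# recipients before payout. Addresses and amounts below are made up.
import pandas as pd

_redirects = {'0xA1': '0xB2'}  # hypothetical: rewards of 0xA1 are paid to 0xB2
_earned = pd.DataFrame({'lp_address': ['0xA1', '0xB2', '0xC3'],
                        'earned': [1.0, 2.0, 3.0]})
_earned['lp_address'] = _earned['lp_address'].apply(lambda x: _redirects.get(x, x))
print(_earned.groupby('lp_address').sum())
# 0xB2 receives 3.0 (its own 2.0 plus the redirected 1.0); 0xC3 keeps 3.0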
def compute_timestamp_intervals(_blocks, _realtime=None):
    LOGGER.debug('compute_timestamp_intervals')
    blocks = pd.Series(_blocks).drop_duplicates().sort_values().values
    intervals = pd.Series(blocks, index=blocks).diff().shift(-1)
    if _realtime:
        intervals.iloc[-1] = datetime.datetime.utcnow() - intervals.index[-1]
    else:
        intervals.iloc[-1] = intervals.index[0] + \
            datetime.timedelta(days=7) - intervals.index[-1]
    intervals = intervals.dt.total_seconds()
    intervals.name = 'state_duration'
    intervals.index.name = 'block_timestamp'
    return intervals
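# Hedged usage sketch, not from the source: the diff/shift pattern above turns a
# list of state-change timestamps into durations; the last state lasts until the
# end of the 7-day week. The sample timestamps are made up.
import datetime
import pandas as pd

_sample_blocks = pd.Series(pd.to_datetime([
    '2021-06-28 00:00:00',
    '2021-06-28 06:00:00',
    '2021-06-29 00:00:00',
]))
_blocks = _sample_blocks.drop_duplicates().sort_values().values
_intervals = pd.Series(_blocks, index=_blocks).diff().shift(-1)
_intervals.iloc[-1] = _intervals.index[0] + datetime.timedelta(days=7) - _intervals.index[-1]
print(_intervals.dt.total_seconds())  # 21600.0, 64800.0, 518400.0 seconds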
def read_market_data(name):
    """
    Gets the market data from the data/market_data directory.
    param: name - Name of the company for which we want the data.
    return: Dataframe
    """
    parentDirectory = os.path.abspath(os.getcwd())
    try:
        LOGGER.info(f"Getting market data for {name}.")
        df = pd.read_csv(
            os.path.join(parentDirectory, 'data/market_data', f'{name}.csv'))
        return df
    except Exception as ex:
        LOGGER.error(f"An error occurred when fetching data for {name}, {ex}")
def enrich_data(data):
    """
    Enhances the data frame with information on indicators and price patterns.
    Indicators and patterns are aligned so that they represent details for the
    previous minute.
    :param data: DataFrame
    :return: DataFrame
    """
    LOGGER.info("Adding TA-Lib indicators as features...")
    # We specifically do shifting here so that all additional data represents
    # information about past history.
    df = pd.concat(
        (data, get_indicators(data).shift(), get_price_patterns(data).shift()),
        axis=1)
    df = df.fillna(method='ffill')
    return df.dropna()
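# Hedged illustration, not from the source: the shift() above ensures that at
# minute t the features only contain indicator values computed up to minute t-1
# (no lookahead). The moving average below is a stand-in for get_indicators().
import pandas as pd

_close = pd.Series([10.0, 11.0, 12.0, 13.0], name='close')
_sma2_prev = _close.rolling(2).mean().shift().rename('sma2_prev')
print(pd.concat([_close, _sma2_prev], axis=1).dropna())
#    close  sma2_prev
# 2   12.0       10.5
# 3   13.0       11.5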
def train(model, epoch, optimizer, grad, generator):
    """
    A main training function for training the models.
    Arguments:
    * model - Model to train.
    * epoch - current epoch number (the loop over epochs is driven by the caller).
    * optimizer - optimizer to be used.
    * grad - gradient function which computes gradients for the model layers.
    * generator - training data generator on which the model is trained.
    """
    # Keep results for plotting
    train_loss_results = []
    train_accuracy_results = []
    # for epoch in range(epochs):
    epoch_loss_avg = tf.keras.metrics.Mean()
    epoch_accuracy = tf.keras.metrics.BinaryAccuracy()

    # Training loop - using batches of 1
    for i in range(len(generator)):
        x, y, timeStep = generator.get_next_step()
        x = x.reshape(1, 1, x.shape[0])
        y = y.reshape(1, 1)
        timeStep = timeStep.reshape(1, 1)
        # Optimize the model
        loss_value, grads = grad(model, x, y, i, timeStep)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        # Track progress
        epoch_loss_avg(loss_value)  # Add current batch loss
        # Compare predicted label to actual label
        # training=True is needed only if there are layers with different
        # behavior during training versus inference (e.g. Dropout).
        epoch_accuracy(y, model(x, training=True))

    # End epoch
    train_loss_results.append(epoch_loss_avg.result())
    train_accuracy_results.append(epoch_accuracy.result())

    LOGGER.info("Epoch {:03d}: Loss: {:.3f}, Accuracy: {:.3%}".format(
        epoch, epoch_loss_avg.result(), epoch_accuracy.result()))
    return train_loss_results, train_accuracy_results
def run(self, validate=False):
    best_f1 = 0
    for epoch in range(self.epochs):
        start_time = time.time()
        avg_loss = self.run_train_epoch()
        if validate:
            valid_preds, avg_val_loss, val_f1 = self.run_validation_epoch()
            elapsed_time = time.time() - start_time
            LOGGER.info(
                'Epoch {}/{} \t loss={:.4f} \t val_loss={:.4f} \t val_f1={:.4f} \t time={:.2f}s'.format(
                    epoch + 1, self.epochs, avg_loss, avg_val_loss, val_f1,
                    elapsed_time))
            if val_f1 > best_f1:
                self.save_model()
                best_f1 = val_f1
        else:
            elapsed_time = time.time() - start_time
            LOGGER.info('Epoch {}/{} \t loss={:.4f} \t time={:.2f}s'.format(
                epoch + 1, self.epochs, avg_loss, elapsed_time))
            self.save_model()
        self.scheduler.step()

    if not validate or val_f1 != best_f1:
        self.load_model()
        valid_preds, avg_val_loss, val_f1 = self.run_validation_epoch()
        elapsed_time = time.time() - start_time
    LOGGER.info(
        'Fin {} epochs \t loss={:.4f} \t val_loss={:.4f} \t val_f1={:.4f} \t time={:.2f}s'.format(
            self.epochs, avg_loss, avg_val_loss, val_f1, elapsed_time))
    test_preds = self.run_test_epoch()
    return valid_preds, test_preds
def query_gbq(_network, _week_number, _pool_list, _excluded_lps_list=[]):
    LOGGER.debug('query_gbq')
    _excluded_lps_list = list(set(_excluded_lps_list + BASE_LP_EXCLUSION_LIST))
    with open(SQL_FILE_PATH, 'r') as file:
        sql = file.read()
    _days_in_week = '3'
    sql = sql.format(
        week_number=_week_number,
        pool_addresses="','".join(_pool_list),
        blocks_table=TABLES_CONFIGS[_network]['blocks'],
        lm_transfers_table=TABLES_CONFIGS[_network]['lm_transfers'],
        lm_state_table=TABLES_CONFIGS[_network]['lm_state'],
        excluded_lps="','".join(_excluded_lps_list),
        days_in_week=_days_in_week)
    client = bigquery.Client()
    bqstorageclient = bigquery_storage.BigQueryReadClient()
    df = (client.query(sql).result().to_dataframe(
        bqstorage_client=bqstorageclient))
    df = df.groupby(['pool_address', 'lp_address', 'block_timestamp']).sum()
    return df
def update_gbq_api_dataset(_full_export, _week_number):
    # zero previous week's velocity
    prev_week = _week_number - 1
    LOGGER.info(f'Zeroing velocity for week {prev_week}')
    sql = f'''
        UPDATE {PROJECT_ID}.bal_mining_estimates.lp_estimates_multitoken
        SET velocity = '0'
        WHERE week = {prev_week}
    '''
    client = bigquery.Client()
    query = client.query(sql)
    query.result()
    LOGGER.info('Done')

    _full_export.reset_index(inplace=True)
    LOGGER.info('Saving estimates to staging...')
    _full_export.to_gbq('bal_mining_estimates.lp_estimates_multitoken_staging',
                        project_id=PROJECT_ID,
                        if_exists='replace')
    LOGGER.info('Done')

    # merge staging into prod
    sql = '''
        MERGE bal_mining_estimates.lp_estimates_multitoken prod
        USING bal_mining_estimates.lp_estimates_multitoken_staging stage
        ON prod.address = stage.address
            AND prod.week = stage.week
            AND prod.chain_id = stage.chain_id
            AND prod.token_address = stage.token_address
        WHEN MATCHED THEN
            UPDATE SET
                earned = stage.earned,
                velocity = stage.velocity,
                timestamp = stage.timestamp
        WHEN NOT MATCHED BY TARGET THEN
            INSERT (address, week, chain_id, token_address, earned, velocity, timestamp)
            VALUES (address, week, chain_id, token_address, earned, velocity, timestamp)
    '''
    client = bigquery.Client()
    query = client.query(sql)
    query.result()
def save_report(_week_number, _chain_id, _token, _data):
    LOGGER.debug(f'saving {_token} report...')
    network = NETWORKS[_chain_id]
    reports_dir = f'reports/{_week_number}'
    if not os.path.exists(reports_dir):
        os.mkdir(reports_dir)
    filename = f'{reports_dir}/__{network}_{_token}.json'
    export_data = _data[_data > get_claim_threshold(_token)]
    export = export_data.apply(
        lambda x: format(x, f'.{get_claim_precision(_token)}f'))
    export_json = export.to_json()
    parsed_export = json.loads(export_json)
    with open(filename, "w") as write_file:
        json.dump(parsed_export, write_file, indent=4)
    LOGGER.debug(f'saved to {filename}')
    if _chain_id == 1 and _token == '0xba100000625a3754423978a60c9317c58a424e3d':
        filename = f'{reports_dir}/_totals.json'
        with open(filename, "w") as write_file:
            json.dump(parsed_export, write_file, indent=4)
        LOGGER.debug(f'saved to {filename}')
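# Hedged sketch, not from the source: the claim-threshold filter and
# fixed-precision formatting used above, on made-up numbers. The threshold and
# precision come from get_claim_threshold/get_claim_precision in the real code.
import pandas as pd

_claims = pd.Series({'0xA1': 0.123456789, '0xB2': 0.000000001})
_threshold, _precision = 1e-6, 8  # hypothetical values
_export = _claims[_claims > _threshold].apply(lambda x: format(x, f'.{_precision}f'))
print(_export.to_json())  # {"0xA1":"0.12345679"}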
def compute_LM_power_timeseries(_df):
    LOGGER.debug('compute_LM_power_timeseries')
    df = _df.copy()
    df['lm_power'] = df.sort_index().groupby(['lp_address']).cumsum()
    df = df.drop(columns=['delta'])
    df = df.clip(lower=0)
    lp_addresses_list = (df.index.get_level_values(
        'lp_address').drop_duplicates().sort_values().values)
    block_timestamps_list = (df.index.get_level_values(
        'block_timestamp').drop_duplicates().sort_values().values)
    levels = [lp_addresses_list, block_timestamps_list]
    new_index = pd.MultiIndex.from_product(
        levels,
        names=['lp_address', 'block_timestamp'],
    )
    df = df.tz_localize(None, level='block_timestamp')
    LOGGER.debug('reindexing ({})...'.format(len(block_timestamps_list)))
    df = df.reindex(index=new_index)
    df.loc(axis=0)[:, block_timestamps_list[0]] = df.loc(
        axis=0)[:, block_timestamps_list[0]].fillna(0)
    df = df.fillna(method='pad')
    LOGGER.debug('done')
    return df
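# Hedged sketch, not from the source: a simplified version of the
# cumsum / reindex / fill pattern above, on a toy two-LP transfer history.
# The per-group ffill here is a slight variation of the original global pad.
import pandas as pd

_deltas = pd.DataFrame(
    {'delta': [10.0, -4.0, 5.0]},
    index=pd.MultiIndex.from_tuples(
        [('lp_a', pd.Timestamp('2021-06-28')),
         ('lp_a', pd.Timestamp('2021-06-29')),
         ('lp_b', pd.Timestamp('2021-06-29'))],
        names=['lp_address', 'block_timestamp']))
# running balance per LP
_power = (_deltas.sort_index().groupby('lp_address').cumsum()
          .rename(columns={'delta': 'lm_power'}))
# reindex onto the full (lp, timestamp) grid so every LP has a value at every state change
_full = pd.MultiIndex.from_product(
    [_power.index.get_level_values('lp_address').unique(),
     _power.index.get_level_values('block_timestamp').unique()],
    names=['lp_address', 'block_timestamp'])
_power = _power.reindex(_full).groupby('lp_address').ffill().fillna(0)
print(_power)  # lp_a: 10 then 6; lp_b: 0 then 5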
from sklearn.pipeline import make_pipeline
from pandas_profiling import ProfileReport
from pathlib import Path
p = Path(__file__).parents[1]

# To load project modules
import sys; sys.path.append(str(p))
from src.logger import LOGGER
from src import estimators as e
from src.ranker import Ranker

A4_DIMS = (11.7, 8.27)

LOGGER.info('Load data')
df = pd.read_pickle(p.joinpath('data', 'interim', 'research.pkl'))
X = df.drop(labels='loss', axis=1)
y = df['loss'].copy()

LOGGER.info('Process target')
y = pd.Series(data=power_transform(y.values.reshape(-1, 1)).flatten(),
              name='loss',
              index=y.index)

LOGGER.info('Load categorical features to drop')
noVarFeatures = json.load(open(file=p.joinpath('src', 'meta',
                                               'NoVariance.json'),
                               mode='r'))

LOGGER.info('Process categorical features')
catf = pd.DataFrame(
    data=make_pipeline(
        e.CategoricalGrouper(),
        e.CategoricalEncoder()
import pandas as pd
import numpy as np
from sklearn.preprocessing import (power_transform, quantile_transform, scale,
                                   StandardScaler, FunctionTransformer)
from sklearn.pipeline import make_pipeline
from pathlib import Path
p = Path(__file__).parents[1]

# To load project modules
import sys
sys.path.append(str(p))
from src.logger import LOGGER
from src import estimators as e

LOGGER.info('Load data')
df = pd.read_pickle(p.joinpath('data', 'interim', 'research.pkl'))
X = df.drop(labels='loss', axis=1)
y = df['loss'].copy()

LOGGER.info('Process target')
y = pd.Series(data=power_transform(y.values.reshape(-1, 1)).flatten(),
              name='loss',
              index=y.index)

LOGGER.info('Load categorical features to drop')
noVarFeatures = json.load(
    open(file=p.joinpath('src', 'meta', 'NoVariance.json'), mode='r'))

LOGGER.info('Process categorical features')
catf = pd.DataFrame(data=make_pipeline(
            data=pca.components_,
            columns=cols,
            index=[f'Component {c+1}' for c in range(len(cols))]),
        cmap='RdBu',
        annot=True,
        fmt='.2f',
        cbar=False)
    ax.set(title=title)
    plt.tight_layout()
    fig.savefig(fname=p.joinpath('reports', 'figures', out),
                dpi=800,
                format='png')
    plt.close(fig=fig)


LOGGER.info('Load correlated features')
CORRELATED = json.load(
    open(file=p.joinpath('src', 'meta', 'Correlated.json'), mode='r'))

LOGGER.info('Load data')
df = pd.read_pickle(p.joinpath('data', 'interim', 'research.pkl'))
X = df.filter(CORRELATED)
y = df['loss'].copy()

LOGGER.info('Process target')
y = pd.Series(data=power_transform(y.values.reshape(-1, 1)).flatten(),
              name='loss',
              index=y.index)

LOGGER.info('Process categorical features')
catf = pd.DataFrame(data=make_pipeline(e.CategoricalGrouper(),
    '--rt',
    help='pass 1 to run realtime estimator; ignores week parameter',
    type=int)
parser.add_argument(
    '--logs',
    help='{WARNING, DEBUG, INFO}',
    type=str)
args = parser.parse_args()

# initialize the logger
if args.logs:
    log_level = args.logs.upper()
    logger_init(log_level)

if args.rt:  # if realtime = True
    LOGGER.info('Running realtime estimator')
    week_number = this_week
    realtime_estimator = True
    if args.week:
        LOGGER.warning('Running realtime estimator, ignoring week parameter')
elif args.week:
    week_number = args.week
    if week_number > this_week:
        exit(
            f'Error: week {week_number} is in the future, this is week {this_week}')
    if week_number == this_week:
        LOGGER.warning(
            f'Warning: week {week_number} is not over yet. Did you mean to run the realtime estimator?')

# for each network
import pandas as pd
from pandas_profiling import ProfileReport
from pathlib import Path
p = Path(__file__).parents[1]

# To load project modules
import sys
sys.path.append(str(p))
from src.logger import LOGGER

LOGGER.info('Load data')
res = pd.read_pickle(p.joinpath('data', 'interim', 'research.pkl'))
dev = pd.read_pickle(p.joinpath('data', 'interim', 'development.pkl'))

LOGGER.info('Profile on research')
(ProfileReport(df=res,
               config_file=p.joinpath('src', 'ProfileConf.yml'),
               title='Research dataset',
               dataset={
                   'description': r'Research dataset (10% of total data)'
               }).to_file(p.joinpath('reports', 'profiles', 'research.html')))

LOGGER.info('Profile on development')
(ProfileReport(df=dev,
               config_file=p.joinpath('src', 'ProfileConf.yml'),
               title='Development dataset',
               dataset={
                   'description': r'Development dataset (10% of total data)'
               }).to_file(p.joinpath('reports', 'profiles',
                                     'development.html')))
sys.path.append(str(p))
from src.logger import LOGGER
from src import preprocessors as pp
from src.ranker import Ranker

CROSS_VAL_OPTS = {
    'scoring': 'neg_root_mean_squared_error',
    'cv': RepeatedKFold(n_splits=5, n_repeats=20),
    'n_jobs': 3,
    'return_train_score': False,
    'return_estimator': False,
}
r = Ridge(random_state=0, solver='saga', max_iter=1e5)

LOGGER.info('Load data')
df = pd.read_pickle(p.joinpath('data', 'interim', 'research.pkl'))
X = df.drop(labels='loss', axis=1)
y = df['loss'].copy()

LOGGER.info('Process target')
tp = pp.TargetPreprocessor()  # will be needed to convert back to y original units
y = tp.fit_transform(y)

LOGGER.info('Process X')
X = pp.Preprocessor().fit_transform(X, y)

LOGGER.info('Prepare dummy regressor')
dummyRMSE = mean_squared_error(y_true=y,
                               y_pred=DummyRegressor(strategy='mean').fit(
import pandas as pd
import numpy as np
from pathlib import Path
p = Path(__file__).parents[1]

# To load project modules
import sys; sys.path.append(str(p))
from src.logger import LOGGER
from src import preprocessors as pp
from src.ranker import Ranker

LOGGER.info('Load data')
df = pd.read_pickle(p.joinpath('data', 'interim', 'research.pkl'))
X = df.drop(labels='loss', axis=1)
y = df['loss'].copy()

LOGGER.info('Process target')
y = pp.TargetPreprocessor().fit_transform(y)

LOGGER.info('Process X')
X = pp.Preprocessor().fit_transform(X, y)

LOGGER.info('Create ranking')
rnk = Ranker().rank(X, y)
rnk['Rank'] = rnk['Association Strength'].abs().rank(ascending=False)
rnk['Group'] = np.ceil(rnk['Rank'].div(5))
rnk.to_pickle(p.joinpath('src', 'meta', 'Ranking.pkl'))
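# Hedged sketch, not from the source: the rank-then-bucket step above, applied to
# made-up association strengths (features are ranked by absolute strength and
# grouped five at a time).
import numpy as np
import pandas as pd

_rnk = pd.DataFrame({'Association Strength': [0.9, -0.7, 0.2, 0.05, -0.4, 0.6]})
_rnk['Rank'] = _rnk['Association Strength'].abs().rank(ascending=False)
_rnk['Group'] = np.ceil(_rnk['Rank'].div(5))  # buckets of five features per group
print(_rnk.sort_values('Rank'))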
import json
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import VarianceThreshold
from pathlib import Path
p = Path(__file__).parents[1]

# To load project modules
import sys
sys.path.append(str(p))
from src.logger import LOGGER
from src import estimators as e

LOGGER.info('Load data')
df = pd.read_pickle(p.joinpath('data', 'interim', 'research.pkl'))
X = df.filter(like='cat')
y = df['loss'].copy()

LOGGER.info('Process data')
X = pd.DataFrame(data=make_pipeline(e.CategoricalGrouper(),
                                    e.CategoricalEncoder()).fit_transform(X, y),
                 columns=X.columns,
                 index=X.index)

LOGGER.info('Variance threshold analysis')
vt = VarianceThreshold().fit(X)
drop = [col for col in X if col not in X.columns[vt.get_support()]]
# Setup to load modules in `src`
import sys
sys.path.append(str(p))
from src.logger import LOGGER
from src.preprocessors import TargetPreprocessor

CROSS_VAL_OPTS = {
    'scoring': 'neg_root_mean_squared_error',
    'cv': RepeatedKFold(n_splits=5, n_repeats=20),
    'n_jobs': 3,
    'return_train_score': False,
    'return_estimator': False,
}

LOGGER.info('Load data')
df = pd.read_pickle(p.joinpath('data', 'interim', 'development.pkl'))

LOGGER.info('Load selected features')
FEATURES = json.load(
    open(file=p.joinpath('src', 'meta', 'SelectedFeatures.json'), mode='r'))

LOGGER.info('Get X and y')
X = df.filter(FEATURES)
y = df['loss'].copy()

LOGGER.info('Process target')
tp = TargetPreprocessor()  # will be needed to convert back to y original units
y = tp.fit_transform(y)

LOGGER.info('Load models')
def save_model(self):
    with open(self.model_path, "wb") as fout:
        torch.save(self.model.state_dict(), fout)
    LOGGER.info("Model was saved in {}".format(self.model_path))
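# Hedged counterpart sketch, not from the source: reloading weights persisted by
# save_model() above. Assumes `model` is an instance of the same architecture and
# `model_path` points at the file written by save_model().
import torch

def load_model(model, model_path, device='cpu'):
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()  # switch to inference mode (affects Dropout / BatchNorm)
    return model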
import pandas as pd
from pathlib import Path
p = Path(__file__).parents[1]

# To load project modules
import sys
sys.path.append(str(p))
from src.logger import LOGGER
from src import preprocessors as pp
from pandas_profiling import ProfileReport

LOGGER.info('Load data')
df = pd.read_pickle(p.joinpath('data', 'interim', 'research.pkl'))
X = df.drop(labels='loss', axis=1)
y = df['loss'].copy()

LOGGER.info('Process target')
y = pp.TargetPreprocessor().fit_transform(y)

LOGGER.info('Process X')
X = pp.Preprocessor().fit_transform(X, y)

LOGGER.info('Profile final data')
ProfileReport(df=X.join(y),
              config_file=p.joinpath('src', 'ProfileConf.yml'),
              title='Final dataset').to_file(
                  p.joinpath('reports', 'profiles', 'final.html'))
def merge_f_m_data(financial_df, market_df):
    """
    Merges both dataframes to get the final data.
    The shape of the financial df will be (2*586), while the market df will be
    (2000*13).
    Returns the merged data.
    """
    market_df['Date'] = pd.to_datetime(market_df['Date'])
    columns = financial_df.columns
    transposed_df = financial_df.T
    transposed_df['Row names'] = columns
    transposed_df['Date'] = 0
    date_of_solvency = financial_df['Date of Insolvency']
    # here 1 = Defaulted, 0 = Non-default
    default_score = financial_df['Score']
    LOGGER.info('Merging market data and financial data into one.')

    def get_dates(x):
        # Gives date to the FY-X term.
        for key in DATE_MAP.keys():
            if pd.to_datetime(f'1/1/{DATE_MAP[key]}').dayofweek == 5:
                date = f'1/3/{DATE_MAP[key]}'
            elif pd.to_datetime(f'1/1/{DATE_MAP[key]}').dayofweek == 6:
                date = f'1/2/{DATE_MAP[key]}'
            else:
                date = f'1/1/{DATE_MAP[key]}'
            if x['Row names'].find(key) >= 0:
                x['Date'] = date
        return x

    transposed_df = transposed_df.apply(get_dates, axis=1)
    transposed_df['Date'] = transposed_df['Date'].replace(0, '1/1/2019')
    transposed_df['Date'] = pd.to_datetime(transposed_df['Date'])
    transposed_df = transposed_df.fillna(method='bfill')
    transposed_df = transposed_df.loc[:, ~transposed_df.columns.duplicated()]

    required_cols = ['Row names', 'Date']
    transposed_df_cols = transposed_df.columns
    for col in transposed_df_cols:
        if col not in required_cols:
            transposed_df.rename(columns={col: 'Data'}, inplace=True)

    added_cols = []
    final_data = {}
    for index, row in transposed_df.iterrows():
        last_added_col = None if len(added_cols) == 0 else added_cols[-1]
        if len(added_cols) > 0 and row['Row names'].find(last_added_col) >= 0:
            try:
                final_data[last_added_col].append(row['Data'])
                final_data['Date'].append(row['Date'])
            except KeyError:
                final_data[last_added_col] = [row['Data']]
        elif len(added_cols) == 0:
            added_cols.append(row['Row names'])
            final_data[row['Row names']] = [row['Data']]
            final_data['Date'] = [row['Date']]
        elif str(row['Row names']).find(added_cols[-1]) < 0:
            added_cols.append(row['Row names'])
            final_data[row['Row names']] = [row['Data']]

    final_data['Date'] = final_data['Date'][0:15]
    cleaned_f_df = pd.DataFrame(
        dict([(k, pd.Series(v)) for k, v in final_data.items()]))
    merged_df = market_df.merge(cleaned_f_df, on='Date', how='outer')
    merged_df = merged_df.drop(['Identifier (RIC)', 'Company Name',
                                'Date of Insolvency', 'Score',
                                'Discrimination', 'Z Score'], axis=1)
    merged_df = merged_df.sort_values(by='Date', ascending=False)
    merged_df = merged_df.fillna(method='bfill').fillna(method='ffill')
    return (merged_df, date_of_solvency, default_score)
def fit(name: str, model: Estimator, params: t.Mapping[str, float],
        filename: Path, X: pd.DataFrame, y: pd.Series, /) -> None:
    LOGGER.info(f'Run {name}')
    m = (make_pipeline(model) if params is None else GridSearchCV(
        estimator=make_pipeline(model),
        param_grid=params,
        scoring='neg_root_mean_squared_error',
        n_jobs=3,
        cv=5)).fit(X, y)
    joblib.dump(value=m if params is None else m.best_estimator_,
                filename=filename)


LOGGER.info('Load data')
df = pd.read_pickle(p.joinpath('data', 'interim', 'development.pkl'))

LOGGER.info('Load selected features')
FEATURES = json.load(
    open(file=p.joinpath('src', 'meta', 'SelectedFeatures.json'), mode='r'))

LOGGER.info('Get X and y')
X = df.filter(FEATURES)
y = df['loss'].copy()

LOGGER.info('Process target')
y = TargetPreprocessor().fit_transform(y)

# Fitting
for name, model, params in MODELS: