def feature_selection( model_id: str, method: str, split: Optional[float] = 0.7, batch: Optional[str] = None, task_key: Optional[str] = None, sync: Optional[bool] = False, model_service: ModelService = Depends(ModelService), service: FeatureSelectionService = Depends(FeatureSelectionService), tasks: TaskService = Depends(TaskService)):
    """Run feature selection for a model, inline (sync=True) or as a queued task.

    Domain-level failures (MessageException) are mapped to HTTP 400.
    """
    try:
        m = model_service.get_model(model_id)
        search = service.create_features_search(m, split, method, task_key=task_key)
        if sync:
            # Synchronous path: execute in-process and return the result directly.
            return service.feature_selection(m, search, sync=True)
        # Asynchronous path: enqueue a background task and return its handle.
        return tasks.send(
            task_name='featureselection',
            task_args={'model': m.dict(), 'search_parameters': search.dict()},
            name=f"feature_selection-{m.symbol}-{m.pipeline}-{m.dataset}-{m.target}",
            batch=batch,
        )
    except MessageException as e:
        raise HTTPException(status_code=400, detail=e.message)
def grid_search(model_id: str, split: Optional[float] = 0.7, batch: Optional[str] = None, task_key: Optional[str] = None, sync: Optional[bool] = False, model_service: ModelService = Depends(ModelService), service: GridSearchService = Depends(GridSearchService), tasks: TaskService = Depends(TaskService)):
    """Launch a hyperparameter grid search for a model, inline (sync=True) or queued.

    Domain-level failures (MessageException) are mapped to HTTP 400.
    """
    try:
        m = model_service.get_model(model_id)
        search_req = service.create_parameters_search(m, split, task_key=task_key)
        if sync:
            # Synchronous path: block until the search completes.
            return service.grid_search(m, search_req, sync=True)
        # Asynchronous path: hand the work off to the task queue.
        return tasks.send(
            task_name='gridsearch',
            task_args={'model': m.dict(), 'search_parameters': search_req.dict()},
            name=f"grid_search-{m.symbol}-{m.pipeline}-{m.dataset}-{m.target}",
            batch=batch,
        )
    except MessageException as e:
        raise HTTPException(status_code=400, detail=e.message)
def main(dataset: str, target: str, pipeline: str):
    """Export per-test-window SHAP value dataframes for every symbol.

    For each symbol: load the dataset restricted to the features chosen by the
    'importances_shap' selection, load the trained model, then for each test
    window compute SHAP values for each daily estimator and write a single CSV
    per window under data/shap_values/<dataset>/<target>/<pipeline>/.
    """
    ds_service = DatasetService()
    m_service = ModelService()
    for symbol in SYMBOLS:
        print(f"Exporting shap dataframes for symbol {symbol}")
        ds = ds_service.get_dataset(name=dataset, symbol=symbol)
        # Only use the features picked by the stored 'importances_shap' selection.
        fs = DatasetService.get_feature_selection(ds=ds,
                                                  method='importances_shap',
                                                  target=target)
        X_all = ds_service.get_dataset_features(ds=ds, columns=fs.features)
        y_all = ds_service.get_dataset_target(ds=ds, name=target)
        model = m_service.get_model(pipeline=pipeline,
                                    dataset=dataset,
                                    target=target,
                                    symbol=symbol)
        for t in model.tests:
            print(f"Loading estimators for test {t.window}")
            estimators = ModelService.load_test_estimators(model=model, mt=t)
            shaps = []
            print("Calculating shap values...")
            for est in tqdm(estimators):
                # One row of SHAP values per estimator day, labelled with the
                # ground-truth class and the SHAP expected (base) value.
                shap_v, shap_exp = get_shap_values(estimator=est,
                                                   X=X_all.loc[est.day],
                                                   X_train=est.train_x,
                                                   bytes=False)
                df = pd.DataFrame([shap_v],
                                  index=[pd.to_datetime(est.day)],
                                  columns=X_all.columns)
                df['label'] = y_all.loc[est.day]
                df['shap_expected'] = shap_exp
                shaps.append(df)
            print("Exporting dataframe..")
            cdf = pd.concat(shaps, axis='index')
            os.makedirs(f"data/shap_values/{dataset}/{target}/{pipeline}/",
                        exist_ok=True)
            cdf.to_csv(
                f"data/shap_values/{dataset}/{target}/{pipeline}/shap_test_{symbol}_Wdays{t.window['days']}.csv",
                index_label='time')
            print("Exported.")
        print(f"Plotted {symbol}")
def test_model(model_id: str, sync: Optional[bool] = False, test: ModelTest = Body(...), tasks: TaskService = Depends(TaskService), service: ModelService = Depends(ModelService)):
    """Run a model test, inline (sync=True) or as a queued task.

    Domain-level failures (MessageException) are mapped to HTTP 400.
    """
    try:
        m = service.get_model(model_id)
        if sync:
            # Synchronous path: run the test now and return its result.
            return service.test_model(m, test)
        # Asynchronous path: enqueue the test as a background task.
        return tasks.send(
            task_name='testmodel',
            task_args={'model': m.dict(), 'test': test.dict()},
            name=f"model_test-{m.symbol}-{m.pipeline}-{m.dataset}-{m.target}",
        )
    except MessageException as e:
        raise HTTPException(status_code=400, detail=e.message)
class GridSearchService:
    """Hyper-parameter search (grid / halving-grid / randomized) for model pipelines.

    Loads train-split data through DatasetService, runs an sklearn search, and
    persists the winning parameters + CV results via the model repository and
    object storage.
    """

    # Minimum samples each CV fold needs: the lowest reliable training window
    # is 30 samples and each fold splits 80/20, so 30 / 0.8 = 37.5 ~ 40.
    MIN_SAMPLES_PER_FOLD = 40

    def __init__(self):
        self.model_repo = ModelRepository()
        self.model_service = ModelService()
        self.dataset_service = DatasetService()

    def _cv_splits_for(self, ds, permissive: bool) -> int:
        """Pick K for K-fold CV from the dataset's sample count.

        Prefers 5 folds, falls back to 3 when samples per fold drop below
        MIN_SAMPLES_PER_FOLD; raises ValueError when even 3 folds are too
        small, unless `permissive` is set.
        """
        k = 5
        if ds.count / k < self.MIN_SAMPLES_PER_FOLD:
            k = 3
        if ds.count / k < self.MIN_SAMPLES_PER_FOLD and not permissive:
            raise ValueError("Not enough samples to perform cross validation!")
        return k

    def create_parameters_search(self, model: Model, split: float,
                                 **kwargs) -> ModelParameters:
        """Build a ModelParameters request for a CV search over the train split.

        kwargs:
            features: name of a feature-selection method whose stored
                selection supplies the feature list
            target: target used to look up the feature selection (default 'class')
            permissive: skip the minimum-samples-per-fold check
            task_key: search identifier (default: fresh uuid4)

        Raises MessageException when the requested feature selection is missing.
        """
        ds = self.dataset_service.get_dataset(model.dataset, model.symbol)
        splits = DatasetService.get_train_test_split_indices(ds, split)
        # `features` names a feature-selection method; resolve it to the
        # concrete feature list stored with the dataset.
        features = kwargs.get('features')
        if features:
            target = kwargs.get('target', 'class')
            mf = DatasetService.get_feature_selection(ds=ds,
                                                      method=features,
                                                      target=target)
            if not mf:
                raise MessageException(
                    f"Feature selection not found for {model.dataset}.{model.symbol} -> {target}!"
                )
            features = mf.features
        k = self._cv_splits_for(ds, permissive=bool(kwargs.get("permissive")))
        return ModelParameters(cv_interval=splits['train'],
                               cv_splits=k,
                               task_key=kwargs.get('task_key', str(uuid4())),
                               features=features or None)

    def _get_dataset_and_pipeline(self, model: Model, mp: ModelParameters,
                                  **kwargs):
        """Load the search inputs for `model` over mp.cv_interval.

        Returns (pipeline_module, X, y), or None when a search with
        mp.task_key was already executed for this model.

        Raises MessageException when the training target has fewer than
        two distinct classes.
        """
        if not model.id:
            # Persist the model first so results can be attached to it.
            # NOTE(review): the caller's reference is not rebound, so a caller
            # that passed an unsaved model still holds an instance without an
            # id — confirm callers always pass persisted models.
            model = self.model_repo.create(model)
        if self.model_repo.exist_parameters(model.id, mp.task_key):
            logging.info("Model {} Grid search {} already executed!".format(
                model.id, mp.task_key))
            # BUGFIX: previously returned `mp` here, which crashed callers
            # that unpack the result into (pipeline_module, X, y).
            return None
        # Load dataset
        X = self.dataset_service.get_features(model.dataset, model.symbol,
                                              mp.cv_interval.begin,
                                              mp.cv_interval.end,
                                              columns=mp.features)
        y = self.dataset_service.get_target(model.target, model.symbol,
                                            mp.cv_interval.begin,
                                            mp.cv_interval.end)
        unique, counts = np.unique(y, return_counts=True)
        if len(unique) < 2:
            logging.error(
                "[{}-{}-{}-{}]Training data contains less than 2 classes: {}".
                format(model.symbol, model.dataset, model.target,
                       model.pipeline, unique))
            raise MessageException(
                "Training data contains less than 2 classes: {}".format(
                    unique))
        logging.info("Dataset loaded: X {} y {} (unique: {})".format(
            X.shape, y.shape, unique))
        # Load pipeline
        pipeline_module = get_pipeline(model.pipeline)
        return pipeline_module, X, y

    def grid_search(self, model: Model, mp: ModelParameters,
                    **kwargs) -> ModelParameters:
        """Run a (halving) grid search, persist results, return the updated mp.

        kwargs: halving, parameter_grid, verbose, n_jobs, save (default True).
        On fit failure (SplitException / ValueError) mp is returned unsaved.
        """
        loaded = self._get_dataset_and_pipeline(model, mp)
        if loaded is None:
            return mp  # Search already executed; nothing to do.
        pipeline_module, X, y = loaded
        # NOTE(review): mp.parameters is hashed before the search assigns it —
        # confirm dict_hash handles the initial value.
        tag = "{}-{}-{}-{}-{}" \
            .format(model.symbol, model.dataset, model.target, model.pipeline,
                    dict_hash(mp.parameters))
        # Perform search
        if not kwargs.get('halving'):
            gscv = GridSearchCV(
                estimator=pipeline_module.estimator,
                param_grid=kwargs.get('parameter_grid',
                                      pipeline_module.PARAMETER_GRID),
                # cv=BlockingTimeSeriesSplit(n_splits=mp.cv_splits),
                cv=StratifiedKFold(n_splits=mp.cv_splits),
                scoring=get_precision_scorer(),
                verbose=kwargs.get("verbose", 0),
                n_jobs=kwargs.get("n_jobs", None),
                refit=False)
        else:
            gscv = HalvingGridSearchCV(
                estimator=pipeline_module.estimator,
                param_grid=kwargs.get('parameter_grid',
                                      pipeline_module.PARAMETER_GRID),
                factor=2,
                cv=BlockingTimeSeriesSplit(n_splits=mp.cv_splits),
                scoring=get_precision_scorer(),
                verbose=kwargs.get("verbose", 0),
                # BUGFIX: n_jobs must be an int; `/` yields a float in Python 3.
                n_jobs=kwargs.get("n_jobs", cpu_count() // 2),
                refit=False,
                random_state=0)
        try:
            mp.start_at = get_timestamp()  # Log starting timestamp
            gscv.fit(X, y)
            mp.end_at = get_timestamp()  # Log ending timestamp
        except SplitException as e:
            logging.exception(
                "Model {} splitting yields single-class folds!\n{}".format(
                    tag, e.message))
            return mp  # Fit failed, don't save this.
        except ValueError as e:
            logging.exception("Model {} raised ValueError!\n{}".format(tag, e))
            return mp  # Fit failed, don't save this.
        # Collect results
        results_df = pd.DataFrame(gscv.cv_results_)
        # Update search request with results
        mp.parameter_search_method = 'halving_grid_search' if kwargs.get(
            'halving') else 'gridsearch'
        mp.parameters = gscv.best_params_
        mp.cv_results = results_df.to_dict()
        mp.result_file = 'cv_results-{}.csv'.format(tag)
        # Save grid search results on storage
        if kwargs.get('save', True):
            storage_service.upload_json_obj(mp.parameters,
                                            'grid-search-results',
                                            'parameters-{}.json'.format(tag))
            storage_service.save_df(results_df, 'grid-search-results',
                                    mp.result_file)
        # Update model with the new results
        self.model_repo.append_parameters(model.id, mp)
        return mp

    def random_search(self, model: Model, mp: ModelParameters,
                      **kwargs) -> ModelParameters:
        """Run a randomized search, persist results, return the updated mp.

        kwargs: param_distributions, n_iter, verbose, n_jobs, save (default True).
        On fit failure (SplitException / ValueError) mp is returned unsaved.
        """
        loaded = self._get_dataset_and_pipeline(model, mp)
        if loaded is None:
            return mp  # Search already executed; nothing to do.
        pipeline_module, X, y = loaded
        tag = "{}-{}-{}-{}-{}" \
            .format(model.symbol, model.dataset, model.target, model.pipeline,
                    dict_hash(mp.parameters))
        rscv = RandomizedSearchCV(estimator=pipeline_module.estimator,
                                  param_distributions=kwargs.get(
                                      'param_distributions',
                                      pipeline_module.PARAMETER_DISTRIBUTION),
                                  n_iter=kwargs.get('n_iter', 10),
                                  cv=StratifiedKFold(n_splits=mp.cv_splits),
                                  scoring=get_precision_scorer(),
                                  verbose=kwargs.get("verbose", 0),
                                  n_jobs=kwargs.get("n_jobs", None),
                                  refit=False,
                                  random_state=0)
        try:
            mp.start_at = get_timestamp()  # Log starting timestamp
            rscv.fit(X, y)
            mp.end_at = get_timestamp()  # Log ending timestamp
        except SplitException as e:
            logging.exception(
                "Model {} splitting yields single-class folds!\n{}".format(
                    tag, e.message))
            return mp  # Fit failed, don't save this.
        except ValueError as e:
            logging.exception("Model {} raised ValueError!\n{}".format(tag, e))
            return mp  # Fit failed, don't save this.
        # Collect results
        results_df = pd.DataFrame(rscv.cv_results_)
        # Update search request with results
        mp.parameter_search_method = 'randomsearch'
        mp.parameters = rscv.best_params_
        mp.result_file = 'cv_results-{}.csv'.format(tag)
        # Save search results on storage
        if kwargs.get('save', True):
            storage_service.upload_json_obj(mp.parameters,
                                            'random-search-results',
                                            'parameters-{}.json'.format(tag))
            storage_service.save_df(results_df, 'random-search-results',
                                    mp.result_file)
        # Update model with the new results
        self.model_repo.append_parameters(model.id, mp)
        return mp

    def grid_search_new(self, symbol: str, dataset: str, target: str,
                        pipeline: str, split: float,
                        feature_selection_method: str, **kwargs):
        """Grid-search flow that also creates the Model record itself.

        kwargs: replace, save, permissive, parameter_grid, verbose, n_jobs,
        task_key. Raises MessageException when a grid search already exists
        (and replace is not set) or when training data has < 2 classes.
        """
        # Check if a model exists and has same search method
        existing_model = self.model_service.get_model(pipeline=pipeline,
                                                      dataset=dataset,
                                                      target=target,
                                                      symbol=symbol)
        if existing_model:
            mp_exists = ModelService.get_model_parameters(existing_model,
                                                          method='gridsearch')
            if mp_exists:
                if kwargs.get('replace'):
                    self.model_service.remove_parameters(model=existing_model,
                                                         method='gridsearch')
                else:
                    if kwargs.get('save'):
                        raise MessageException(
                            f"Grid search already performed for {pipeline}({dataset}.{symbol}) -> {target}"
                        )
        # Retrieve dataset to use
        ds = self.dataset_service.get_dataset(dataset, symbol)
        # Shared fold-sizing logic (previously duplicated inline, reusing the
        # name `X` for both the sample threshold and the feature frame below).
        cv_splits = self._cv_splits_for(ds,
                                        permissive=bool(
                                            kwargs.get("permissive")))
        # Determine split indices based on dataset
        splits = DatasetService.get_train_test_split_indices(ds, split)
        cv_interval = splits['train']
        # Load dataset features by applying a specified feature selection method
        X = self.dataset_service.get_dataset_features(
            ds=ds,
            begin=cv_interval['begin'],
            end=cv_interval['end'],
            method=feature_selection_method,
            target=target)
        y = self.dataset_service.get_target(
            name=target,
            symbol=symbol,
            begin=cv_interval['begin'],
            end=cv_interval['end'],
        )
        # Check number of samples for each class in training data; fewer than
        # 2 classes cannot be fit at all.
        unique, counts = np.unique(y, return_counts=True)
        if len(unique) < 2:
            logging.error(
                "[{}-{}-{}-{}]Training data contains less than 2 classes: {}".
                format(symbol, dataset, target, pipeline, unique))
            raise MessageException(
                "Training data contains less than 2 classes: {}".format(
                    unique))
        logging.info("Dataset loaded: X {} y {} (unique: {})".format(
            X.shape, y.shape, unique))
        # Load pipeline algorithm and parameter grid
        pipeline_module = get_pipeline(pipeline)
        # Perform search
        gscv = GridSearchCV(
            estimator=pipeline_module.estimator,
            param_grid=kwargs.get('parameter_grid',
                                  pipeline_module.PARAMETER_GRID),
            # cv=BlockingTimeSeriesSplit(n_splits=mp.cv_splits),
            cv=StratifiedKFold(n_splits=cv_splits),
            scoring=get_precision_scorer(),
            verbose=kwargs.get("verbose", 0),
            n_jobs=kwargs.get("n_jobs", None),
            refit=False)
        mp = ModelParameters(cv_interval=splits['train'],
                             cv_splits=cv_splits,
                             task_key=kwargs.get('task_key', str(uuid4())),
                             features=[c for c in X.columns],
                             parameter_search_method='gridsearch')
        mp.start_at = get_timestamp()
        gscv.fit(X, y)
        mp.end_at = get_timestamp()
        # Collect results
        results_df = pd.DataFrame(gscv.cv_results_)
        mp.parameters = gscv.best_params_
        # Drop the unhashable 'params' column before serializing CV results.
        mp.cv_results = results_df.loc[:, results_df.columns !=
                                       'params'].to_dict('records')
        tag = "{}-{}-{}-{}-{}".format(symbol, dataset, target, pipeline,
                                      dict_hash(mp.parameters))
        mp.result_file = 'cv_results-{}.csv'.format(tag)
        # Create a fresh model record carrying this search's parameters.
        model = Model(pipeline=pipeline,
                      dataset=dataset,
                      target=target,
                      symbol=symbol,
                      features=feature_selection_method)
        model.parameters.append(mp)
        self.model_repo.create(model)
        # Save grid search results on storage
        if kwargs.get('save', True):
            storage_service.upload_json_obj(mp.parameters,
                                            'grid-search-results',
                                            'parameters-{}.json'.format(tag))
            storage_service.save_df(results_df, 'grid-search-results',
                                    mp.result_file)
        return mp
def main(dataset: str, target: str):
    """Plot per-class SHAP summaries for every pipeline/symbol test window.

    Builds one figure per (pipeline, symbol, window, class): a rolling
    precision curve, a line plot of top-ranked SHAP features over the window,
    and three beeswarm plots (first/middle/last day), then saves it under
    images/shap-test-final/.
    """
    num_shap_plots = 3
    shap_show_count = 10
    ds_service = DatasetService()
    m_service = ModelService()
    for pipeline in PIPELINES:
        for symbol in SYMBOLS:
            print(
                f"Plotting shap dataframes for pipeline {pipeline} symbol {symbol}"
            )
            ds = ds_service.get_dataset(name=dataset, symbol=symbol)
            fs = DatasetService.get_feature_selection(
                ds=ds, method='importances_shap', target=target)
            X_all = ds_service.get_dataset_features(ds=ds, columns=fs.features)
            y_all = ds_service.get_dataset_target(ds=ds, name=target)
            model = m_service.get_model(pipeline=pipeline,
                                        dataset=dataset,
                                        target=target,
                                        symbol=symbol)
            for t in model.tests:
                placeholder = "{label}"
                csv_name = f"data/shap_values/{dataset}/{target}/{pipeline}/shap_training_window_{symbol}_{placeholder}_Wdays{t.window['days']}_.csv"
                expected_csv_name = csv_name.format(label='SHAP_expected')
                print(f"Loading results for test {t.window}")
                results = ModelService.parse_test_results(test=t)
                # NOTE(review): exp_shap_df is unused below; the read still
                # validates the file exists — confirm and drop if unneeded.
                exp_shap_df = pd.read_csv(expected_csv_name,
                                          index_col='time',
                                          parse_dates=True)
                for cls, label in enumerate(["SELL", "HOLD", "BUY"]):
                    class_csv_name = csv_name.format(label=label)
                    cls_shap_df = pd.read_csv(class_csv_name,
                                              index_col='time',
                                              parse_dates=True)
                    cls_shap_df = cls_shap_df.loc[
                        t.test_interval.begin:t.test_interval.end]
                    x_train = X_all.loc[cls_shap_df.index]
                    fig = plt.figure(constrained_layout=True,
                                     figsize=(100, 50),
                                     dpi=300)
                    # BUGFIX: `gs` was referenced below but its definition was
                    # commented out, raising NameError at runtime. Recreate the
                    # 3 x num_shap_plots grid directly on the figure.
                    gs = fig.add_gridspec(3, num_shap_plots,
                                          wspace=1.5, hspace=0.3)
                    precision_ax = fig.add_subplot(gs[0, :])
                    shap_values_ax = fig.add_subplot(gs[1, :])
                    beeswarms_axs = [
                        fig.add_subplot(gs[2, i]) for i in range(num_shap_plots)
                    ]
                    shap_plot_labels = set()
                    # Pick the first, middle and last test days; timestamps are
                    # flattened (no 'T', ':', '-', '+00:00') for the filenames.
                    first_shap_day = results.iloc[0]['time'].replace(
                        '+00:00', '').replace('T', '').replace(':',
                                                               '').replace(
                                                                   '-', '')
                    middle_shap_day = results.iloc[int(
                        results.shape[0] / 2)]['time'].replace(
                            '+00:00', '').replace('T', '').replace(
                                ':', '').replace('-', '')
                    last_shap_day = results.iloc[-1]['time'].replace(
                        '+00:00', '').replace('T', '').replace(':',
                                                               '').replace(
                                                                   '-', '')
                    for idx, dayname in enumerate(
                            [first_shap_day, middle_shap_day, last_shap_day]):
                        day_csv_name = f"data/shap_values/{dataset}/{target}/{pipeline}/daily/shap_training_window_{symbol}_{label}_Wdays{t.window['days']}_DAY{dayname}.csv"
                        # Plot each section's SHAP values
                        cdf_subset = pd.read_csv(day_csv_name,
                                                 index_col='time',
                                                 parse_dates=True)
                        train_subset = X_all.loc[cdf_subset.index]
                        # Get a rank of feature labels based on this section's shap values
                        abs_mean_shap = cdf_subset.abs().mean(axis='index')
                        abs_mean_rank = abs_mean_shap.sort_values(
                            ascending=False)[:shap_show_count]
                        for l in abs_mean_rank.index:
                            # Save labels for features in the top-N
                            shap_plot_labels.add(l)
                        # Plot this section's SHAP values
                        plt.sca(beeswarms_axs[idx])
                        shap.summary_plot(cdf_subset.values,
                                          train_subset,
                                          max_display=shap_show_count,
                                          show=False,
                                          color_bar=False,
                                          sort=True)
                        min_date = cdf_subset.index.min().to_pydatetime()
                        max_date = cdf_subset.index.max().to_pydatetime(
                        ) + timedelta(days=1)
                        min_date_f = min_date.strftime("%Y/%m/%d")
                        max_date_f = max_date.strftime("%Y/%m/%d")
                        beeswarms_axs[idx].set_xlabel(
                            f"SHAP values\nWindow: {min_date_f} - {max_date_f}",
                            fontsize=8)
                        beeswarms_axs[idx].tick_params(axis='y',
                                                       which='major',
                                                       labelsize=6)
                        beeswarms_axs[idx].tick_params(axis='x',
                                                       which='major',
                                                       labelsize=8)
                    # Plot shap values over the whole window for the features
                    # that made any section's top-N, ranked by value spread.
                    day_csv_name = f"data/shap_values/{dataset}/{target}/{pipeline}/shap_training_window_{symbol}_{label}_Wdays{t.window['days']}_.csv"
                    plot_cls_shap_df = pd.read_csv(day_csv_name,
                                                   index_col='time',
                                                   parse_dates=True)

                    def get_spread(series):
                        return np.abs(series.max() - series.min())

                    plot_rank = plot_cls_shap_df[list(shap_plot_labels)].apply(
                        get_spread, axis='index').sort_values(
                            ascending=False)[:shap_show_count]
                    # (loop variable renamed from `t` to avoid shadowing the
                    # enclosing test variable inside the comprehension)
                    plot_cls_shap_df['xlabel'] = [
                        ts.to_pydatetime().strftime("%Y/%m/%d")
                        for ts in plot_cls_shap_df.index
                    ]
                    shap_ax = plot_cls_shap_df.plot(
                        x='xlabel',
                        y=[c for c in plot_rank.index],
                        kind='line',
                        ax=shap_values_ax,
                        legend=False,
                        xlabel='')
                    patches, labels = shap_ax.get_legend_handles_labels()
                    shap_ax.legend(patches,
                                   labels,
                                   loc='center left',
                                   bbox_to_anchor=(1, 0.5),
                                   prop={'size': 6})
                    shap_ax.tick_params(axis='x', which='major', labelsize=8)
                    shap_ax.set_ylabel('mean(|SHAP|)', fontsize=6)
                    # Get Metrics scores dataframe (7-day rolling mean)
                    cri_df = get_metrics_df(results).rolling(
                        7, min_periods=1).mean()
                    cri_df['xlabel'] = [
                        ts.to_pydatetime().strftime("%Y/%m/%d")
                        for ts in cri_df.index
                    ]
                    cri_ax = cri_df.plot(x='xlabel',
                                         y=f"pre_{cls}",
                                         kind='line',
                                         ax=precision_ax,
                                         legend=False,
                                         xlabel='')
                    patches, labels = cri_ax.get_legend_handles_labels()
                    cri_ax.legend(patches,
                                  labels,
                                  loc='center left',
                                  bbox_to_anchor=(1, 0.5),
                                  prop={'size': 6})
                    cri_ax.set_ylabel('mean(precision)', fontsize=6)
                    cri_ax.tick_params(labelbottom=False, labelleft=True)
                    min_date = cri_df.index.min().to_pydatetime().strftime(
                        "%Y/%m/%d")
                    max_date = cri_df.index.max().to_pydatetime().strftime(
                        "%Y/%m/%d")
                    window = t.window['days']
                    fig.suptitle(
                        f"{symbol}, {pipeline}, W={window}D, Class {label}, From {min_date} to {max_date}"
                    )
                    os.makedirs(f"images/shap-test-final/", exist_ok=True)
                    plt.savefig(
                        f"images/shap-test-final/{pipeline}_W{window}D_{dataset}_{target}_{symbol}_{label}.png",
                        dpi='figure')
                    plt.close()
                    print(f"{label} OK")
            print(f"Exported symbol {symbol}.")
        print(f"Plotted {symbol}")
def main(dataset: str, target: str, pipeline: str):
    """Export per-class, per-day SHAP dataframes for every symbol's tests.

    For each test window: computes SHAP values on each daily estimator's
    training data, writes one CSV per (class, day) under .../daily/, plus one
    abs-mean summary CSV per class and one expected-values CSV per window.
    """
    ds_service = DatasetService()
    m_service = ModelService()
    for symbol in SYMBOLS:
        print(f"Exporting shap dataframes for symbol {symbol}")
        ds = ds_service.get_dataset(name=dataset, symbol=symbol)
        fs = DatasetService.get_feature_selection(ds=ds,
                                                  method='importances_shap',
                                                  target=target)
        X_all = ds_service.get_dataset_features(ds=ds, columns=fs.features)
        y_all = ds_service.get_dataset_target(ds=ds, name=target)
        model = m_service.get_model(pipeline=pipeline,
                                    dataset=dataset,
                                    target=target,
                                    symbol=symbol)
        for t in model.tests:
            os.makedirs(
                f"data/shap_values/{dataset}/{target}/{pipeline}/daily",
                exist_ok=True)
            placeholder = "{label}"
            csv_name = f"data/shap_values/{dataset}/{target}/{pipeline}/shap_training_window_{symbol}_{placeholder}_Wdays{t.window['days']}_.csv"
            day_csv_name = f"data/shap_values/{dataset}/{target}/{pipeline}/daily/shap_training_window_{symbol}_{placeholder}_Wdays{t.window['days']}_"
            print(f"Loading estimators for test {t.window}")
            estimators = ModelService.load_test_estimators(model=model, mt=t)
            # NOTE(review): `results` is unused below; kept in case
            # parse_test_results validates the test — confirm and drop.
            results = ModelService.parse_test_results(test=t)
            # One accumulator per class (SELL / HOLD / BUY).
            shaps = [[], [], []]
            shap_expected = []
            print("Calculating shap values")
            for est in tqdm(estimators):
                training_data = est.train_x.astype(np.float64).fillna(value=0)
                shap_v, shap_exp = get_shap_values(estimator=est.named_steps.c,
                                                   X=training_data,
                                                   X_train=training_data,
                                                   bytes=False)
                # Binary explainers return a single float; pad to 3 classes.
                if isinstance(shap_exp, float):
                    shap_expected.append([est.day] + [0, 0, shap_exp])
                else:
                    shap_expected.append([est.day] + [v for v in shap_exp])
                for cls, label in enumerate(["SELL", "HOLD", "BUY"]):
                    df = pd.DataFrame(shap_v[cls],
                                      index=est.train_x.index,
                                      columns=est.train_x.columns)
                    # Save shap values dataframe for each day; timestamps are
                    # flattened (no 'T', ':', '-', '+00:00') for the filename.
                    dayname = est.day.replace('+00:00',
                                              '').replace('T', '').replace(
                                                  ':', '').replace('-', '')
                    day_class_csv_name = day_csv_name.format(
                        label=label) + f"DAY{dayname}.csv"
                    df.to_csv(day_class_csv_name, index_label='time')
                    # Accumulate the per-day abs-mean SHAP row for this class.
                    df_abs_mean = df.abs().mean().to_dict()
                    df_abs_mean['time'] = est.day
                    shaps[cls].append(df_abs_mean)
            # Merge shap values in an unique dataframe and save to csv for each class
            for cls, label in enumerate(["SELL", "HOLD", "BUY"]):
                class_csv_name = csv_name.format(label=label)
                print(
                    f"Exporting dataframe for class {label} -> {class_csv_name}"
                )
                cdf = pd.DataFrame.from_records(shaps[cls])
                cdf.index = pd.to_datetime(cdf.time)
                cdf = cdf[cdf.columns.difference(['time'])]
                cdf.to_csv(class_csv_name, index_label='time')
            expected_csv_name = csv_name.format(label='SHAP_expected')
            print(
                f"Exporting expected values dataframe -> {expected_csv_name}")
            edf = pd.DataFrame(
                shap_expected,
                columns=[
                    "time", "shap_expected_sell", "shap_expected_hold",
                    "shap_expected_buy"
                ],
            )
            edf.to_csv(expected_csv_name, index_label='time')
        print(f"Exported symbol {symbol}.")
        print(f"Plotted {symbol}")
def main(dataset: str, target: str, pipeline: str):
    """Walk every symbol's SHAP CSV exports against the feature hierarchy.

    Loads the hierarchy and, for each symbol/test/class, reads the stored SHAP
    CSVs (which also validates they exist and align with the dataset index).
    The plotting stage that previously lived here was disabled; the dead
    commented-out code has been removed — see VCS history for the old plots.
    """
    hierarchy = load_hierarchy(f"{dataset}_{target}_feature_hierarchy.yml")
    ds_service = DatasetService()
    m_service = ModelService()
    for symbol in SYMBOLS:
        print(f"Plotting shap dataframes for symbol {symbol}")
        ds = ds_service.get_dataset(name=dataset, symbol=symbol)
        fs = DatasetService.get_feature_selection(ds=ds,
                                                  method='importances_shap',
                                                  target=target)
        X_all = ds_service.get_dataset_features(ds=ds, columns=fs.features)
        y_all = ds_service.get_dataset_target(ds=ds, name=target)
        model = m_service.get_model(pipeline=pipeline,
                                    dataset=dataset,
                                    target=target,
                                    symbol=symbol)
        for t in model.tests:
            os.makedirs(
                f"images/shap-test-hierarchy/{dataset}/{target}/{pipeline}/",
                exist_ok=True)
            placeholder = "{label}"
            csv_name = f"data/shap_values/{dataset}/{target}/{pipeline}/shap_training_window_{symbol}_{placeholder}_Wdays{t.window['days']}_.csv"
            expected_csv_name = csv_name.format(label='SHAP_expected')
            print(f"Loading results for test {t.window}")
            results = ModelService.parse_test_results(test=t)
            # Read kept for its side effect: fails fast if the export is missing.
            exp_shap_df = pd.read_csv(expected_csv_name,
                                      index_col='time',
                                      parse_dates=True)
            for cls, label in enumerate(["SELL", "HOLD", "BUY"]):
                class_csv_name = csv_name.format(label=label)
                cls_shap_df = pd.read_csv(class_csv_name,
                                          index_col='time',
                                          parse_dates=True)
                cls_shap_df = cls_shap_df.loc[
                    t.test_interval.begin:t.test_interval.end]
                # Implicitly validates the SHAP index aligns with the dataset.
                x_train = X_all.loc[cls_shap_df.index]
                print(f"{label} OK")
        print(f"Exported symbol {symbol}.")
        print(f"Plotted {symbol}")