def _validate_k_fold_top(self, model, x_train, y_train, x_test, y_test):
    """Score ``model`` for every combination of topological hyperparameters.

    Each triple drawn from ``self.k_mins``, ``self.k_maxs`` and
    ``self.dist_percentages`` configures a two-step pipeline
    (``SubSpaceExtraction`` followed by ``VietorisRipsPersistence``). The
    features extracted from the pipeline output are appended column-wise to
    ``x_train``, ``model`` is refitted on the widened matrix, and it is then
    scored on the evaluation features produced by
    ``extract_features_for_prediction``.

    Note that ``model`` is fitted in place once per combination, so after
    the call it holds the fit of the last combination tried.

    Args:
        model: Estimator exposing ``fit`` and ``score``.
        x_train: Training features (2-D array; concatenated along axis 1).
        y_train: Training labels.
        x_test: Features of the evaluation split.
        y_test: Labels of the evaluation split.

    Returns:
        list[dict]: One dict per combination with the keys ``"k_min"``,
        ``"k_max"``, ``"dist_percentage"`` and ``"score"``.
    """
    # Lazy generator: the attributes are read in exactly the same order as
    # three nested ``for`` loops would read them.
    grid = (
        (k_min, k_max, dist_percentage)
        for k_min in self.k_mins
        for k_max in self.k_maxs
        for dist_percentage in self.dist_percentages
    )

    results = []
    for k_min, k_max, dist_percentage in grid:
        print(
            f"k_min, k_max, dist_percentage: {k_min}, {k_max}, {dist_percentage}"
        )

        subspace_step = SubSpaceExtraction(
            dist_percentage=dist_percentage,
            k_min=k_min,
            k_max=k_max,
            metric="euclidean",
            n_jobs=-1,
        )
        diagram_step = VietorisRipsPersistence(n_jobs=-1)
        top_pipeline = Pipeline([
            ('extract_subspaces', subspace_step),
            ('compute_diagrams', diagram_step),
        ])

        # Only the first element of the returned pair is used here.
        train_diagrams = top_pipeline.fit_transform_resample(x_train, y_train)[0]
        train_topo_features = extract_topological_features(train_diagrams)
        augmented_train = np.concatenate(
            [x_train, train_topo_features], axis=1)
        model.fit(augmented_train, y_train)

        augmented_test = extract_features_for_prediction(
            x_train, y_train, x_test, y_test, top_pipeline)

        results.append({
            "k_min": k_min,
            "k_max": k_max,
            "dist_percentage": dist_percentage,
            "score": model.score(augmented_test, y_test),
        })

    return results
def cross_validate(self, full_x, full_y, splitting_dates):
    """Tune the classifier and the topological features on date-based splits.

    Rows of ``full_x`` are assigned to a split by their ``date`` column:

    * validation: ``splitting_dates[0] <= date < splitting_dates[1]``
    * test:       ``splitting_dates[1] <= date < splitting_dates[2]``
    * training:   every row before ``splitting_dates[0]`` or from
      ``splitting_dates[2]`` onwards.

    The procedure then runs in three stages:

    1. Select random-forest parameters without topological features
       (train -> validation) and score the refitted model on the test split.
    2. Select the topological parameters with that model
       (train -> validation), refit on train + validation with topological
       features and score on the test split.
    3. Re-select the random-forest parameters on the topologically
       augmented features (train -> validation), refit on train + validation
       and score on the test split.

    Args:
        full_x: ``pandas.DataFrame`` of features containing a ``date``
            column. It is not modified.
        full_y: Labels aligned with ``full_x``; must support boolean-mask
            indexing and expose ``.values``.
        splitting_dates: Indexable holding, in order, the start of the
            validation window, the start of the test window and the end of
            the test window.

    Returns:
        tuple: ``(best_model_params, best_topo, best_model_config_with_topo,
        score, score_top, score_best_topo_and_model)``, i.e. the three
        selected parameter dicts followed by the test score of each stage.
    """
    train_split_date = splitting_dates[0]
    val_split_date = splitting_dates[1]
    end_date = splitting_dates[2]

    # Build each mask once and reuse it for both features and labels.
    dates = full_x.date
    train_mask = (dates < train_split_date) | (dates >= end_date)
    val_mask = (dates >= train_split_date) & (dates < val_split_date)
    test_mask = (dates >= val_split_date) & (dates < end_date)

    # ``drop`` returns a new frame, so no filtered copy is mutated in place.
    train_x = full_x[train_mask].drop(columns="date").values
    val_x = full_x[val_mask].drop(columns="date").values
    test_x = full_x[test_mask].drop(columns="date").values
    train_y = full_y[train_mask].values
    val_y = full_y[val_mask].values
    test_y = full_y[test_mask].values

    # Stage 1: model selection on the raw features.
    print("START VALIDATING MODEL")
    models_cv = self._validate_k_fold_model(train_x, train_y, val_x, val_y)
    best_model_params = best_combination(models_cv)
    best_model_params.pop("score")
    best_model = RandomForestClassifier(**best_model_params)
    best_model.fit(train_x, train_y)
    score = best_model.score(test_x, test_y)
    print(f'score no_top {score}')
    print(f'best model parameters no_top {best_model_params}')

    # Stage 2: selection of the topological parameters.
    print("START VALIDATING PARAMS")
    topo_cv = self._validate_k_fold_top(best_model, train_x, train_y,
                                        val_x, val_y)
    best_topo = best_combination(topo_cv)
    best_topo.pop("score")
    # Use the same fixed settings as during validation so the final pipeline
    # matches the configuration that was scored. Keys already present in
    # ``best_topo`` take precedence.
    subspace_kwargs = {"metric": "euclidean", "n_jobs": -1, **best_topo}
    best_topo_pipeline = Pipeline([
        ('extract_subspaces', SubSpaceExtraction(**subspace_kwargs)),
        ('compute_diagrams', VietorisRipsPersistence(n_jobs=-1)),
    ])

    # The final models are trained on train + validation.
    train_x_for_test = np.concatenate([train_x, val_x], axis=0)
    train_y_for_test = np.concatenate([train_y, val_y], axis=0)
    diagrams_train, _ = best_topo_pipeline.fit_transform_resample(
        train_x_for_test, train_y_for_test)
    print("EXTRACTING TOPOLOGICAL FEATURES TRAIN")
    top_features_train = extract_topological_features(diagrams_train)
    x_train_model = np.concatenate(
        [train_x_for_test, top_features_train], axis=1)
    best_model.fit(x_train_model, train_y_for_test)

    print("EXTRACTING TOPOLOGICAL FEATURES TEST")
    # Pass the raw train + validation features, as every other call to this
    # helper does; the pipeline operates on raw features, not on the matrix
    # that already has topological columns appended.
    x_test_model = extract_features_for_prediction(
        train_x_for_test, train_y_for_test, test_x, test_y,
        best_topo_pipeline)
    score_top = best_model.score(x_test_model, test_y)

    # Stage 3: model selection on the augmented features. Training rows only
    # are used here: ``x_train_model`` also contains the validation rows, so
    # pairing it with ``train_y`` would mismatch the sample counts and leak
    # the validation data into the model being validated.
    diagrams_train_only, _ = best_topo_pipeline.fit_transform_resample(
        train_x, train_y)
    train_x_with_topo = np.concatenate(
        [train_x, extract_topological_features(diagrams_train_only)], axis=1)
    val_x_with_topo = extract_features_for_prediction(
        train_x, train_y, val_x, val_y, best_topo_pipeline)
    print('START VALIDATING MODEL WITH OPTIMAL TOPOLOGY')
    model_config_with_topo = self._validate_k_fold_model(
        train_x_with_topo, train_y, val_x_with_topo, val_y)
    best_model_config_with_topo = best_combination(model_config_with_topo)
    best_model_config_with_topo.pop('score')

    best_model_with_topo = RandomForestClassifier(
        **best_model_config_with_topo)
    best_model_with_topo.fit(x_train_model, train_y_for_test)
    score_best_topo_and_model = best_model_with_topo.score(
        x_test_model, test_y)
    print(f'score best model and topo_feat {score_best_topo_and_model}')

    return (best_model_params, best_topo, best_model_config_with_topo,
            score, score_top, score_best_topo_and_model)