def __reduce_overfitting(symbol, model_container):
    """ Recursive method to reduce Variance & get a better Validation score for the metric """
    print('-- Exploring Model Generalization --')
    print('On Training data...')
    model_container.train_score = ModelEvaluator.evaluate(model_container.model,
                                                          model_container.data.train_X,
                                                          model_container.data.train_y)
    print('On Validation data...')
    model_container.val_score = ModelEvaluator.evaluate(model_container.model,
                                                        model_container.data.val_X,
                                                        model_container.data.val_y)
    print(f'Train score: {model_container.train_score} & Validation score: {model_container.val_score}')

    # Retrain with more dropout while the validation score lags the training score by more than 15%,
    # provided dropout can still be increased and the training score is above the 0.65 threshold.
    if ((model_container.train_score - model_container.val_score) / model_container.train_score > 0.15
            and model_container.hyperparams.dropout < 0.55
            and model_container.train_score > 0.65):
        model_container.hyperparams.dropout += 0.2
        model_container.model = Trainer.train_model(symbol, model_container.data_prep_params,
                                                    model_container.data, model_container.hyperparams)
        return StockatronCore.__reduce_overfitting(symbol, model_container)
    else:
        return model_container
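# Illustrative helper (not part of the original code): the retraining rule above fires on a
# relative generalization gap larger than 15%. The function name and default thresholds below
# are assumptions, mirroring the condition in __reduce_overfitting for clarity.
def generalization_gap_too_large(train_score, val_score, dropout,
                                 max_dropout=0.55, min_train_score=0.65):
    """True when validation lags training by more than 15% and dropout can still be raised."""
    if train_score <= min_train_score or dropout >= max_dropout:
        return False
    return (train_score - val_score) / train_score > 0.15

# Example: train=0.80, val=0.66 gives a relative gap of 0.175 > 0.15, so another training run
# with dropout increased by 0.2 would be triggered.
assert generalization_gap_too_large(0.80, 0.66, dropout=0.2)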
def find_best_single_feature_parameters(self, dataset):
    for feature in dataset.suggested_discretize_features:
        permutations = self.generate_feature_parameters(feature)
        print(permutations)
        best_mean_fcs = self.best_fcs[dataset]
        best_perm = None
        for p, perm in enumerate(permutations):
            logging.error("[Parameters Tester][{}][{}][Perm {:03d}] Current permutation: {}".format(dataset, feature, p + 1, perm))
            dm = DataModel.generate_from_file(dataset, discretize_params=perm)
            classes_list = dm.get_classes_list()
            f_scores = []
            a = 1
            for _ in range(self.best_fold[dataset][1]):
                for train_set, test_set in dm.generate_k_folds_stratified(self.best_fold[dataset][0]):
                    model_evaluator = ModelEvaluator(train_set, test_set, classes_list)
                    model_evaluator.evaluate()
                    f_scores.append(model_evaluator.get_f_score())
                    logging.error("[Parameters Tester][{}][{}][Perm {:03d}][{:03d}] FCS: {}".format(dataset, feature, p + 1, a, f_scores[-1]))
                    a += 1
            f_score_mean = sum(f_scores) / len(f_scores)
            logging.error("[Parameters Tester][{}][{}][Perm {:03d}] Best FCS: {}, Mean FCS {}".format(dataset, feature, p + 1, max(f_scores), f_score_mean))
            if f_score_mean > best_mean_fcs:
                best_perm = perm[0]
                best_mean_fcs = f_score_mean
        if best_perm is not None:
            self.best_discretize_feature_params[dataset].append(best_perm)
            logging.error("[Parameters Tester][{}][{}] Best mean FCS: {}, Best parameters: {}".format(dataset, feature, best_mean_fcs, best_perm))
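# Illustrative only (assumptions): each permutation above is handled as a one-element list
# (note `best_perm = perm[0]`), i.e. a single candidate discretization for the feature under
# test. DiscretizeParamSketch and single_feature_permutations are stand-ins showing that shape;
# the project's real DiscretizeParam and generate_feature_parameters may differ.
from collections import namedtuple

DiscretizeParamSketch = namedtuple('DiscretizeParamSketch',
                                   ['feature_name', 'discretize_function', 'buckets_amount'])

def single_feature_permutations(feature, functions, bin_counts=(2, 3, 4, 5)):
    return [[DiscretizeParamSketch(feature, fn, bins)] for fn in functions for bins in bin_counts]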
def evaluate(self, *, images: np.ndarray = None, folder_path: str = None) -> None:
    """
    Evaluate the model: calculate accuracy, show the confusion matrix and print a classification report.

    Works with either the images provided or the path to those images.
    If neither is provided, the default test images path is used.
    """
    if not self._model_loaded:
        raise ModelNotLoadedError('You have to load the model before evaluating it.')

    evaluator = ModelEvaluator(self._model, images=images, folder_path=folder_path)
    evaluator.evaluate()
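# Hypothetical usage (the class that owns `evaluate` and its model-loading method are not shown
# here, so `Classifier` and `load_model` are placeholder names):
classifier = Classifier()                              # placeholder class
classifier.load_model('model.h5')                      # placeholder loader; evaluate() raises ModelNotLoadedError otherwise
classifier.evaluate(folder_path='data/test_images')    # or: classifier.evaluate(images=test_images_array)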
def find_best_fold(self, dataset):
    dm = DataModel.generate_from_file(dataset)
    classes_list = dm.get_classes_list()
    for fold in FOLDS:
        f_scores = []
        a = 1
        for _ in range(fold[1]):
            for train_set, test_set in dm.generate_k_folds_stratified(fold[0]):
                model_evaluator = ModelEvaluator(train_set, test_set, classes_list)
                model_evaluator.evaluate()
                f_scores.append(model_evaluator.get_f_score())
                logging.error("[Parameters Tester][{}][CV{:02d}][{:03d}] FCS: {}".format(dataset, fold[0], a, f_scores[-1]))
                a += 1
        f_score_mean = sum(f_scores) / len(f_scores)
        logging.error("[Parameters Tester][{}][CV{:02d}] Best FCS: {}, Mean FCS {}".format(dataset, fold[0], max(f_scores), f_score_mean))
        self.append_result({'dataset': dataset.name, 'fold': fold[0], 'f_score': f_score_mean, 'permutation': -1})
        if f_score_mean > self.best_fcs[dataset]:
            self.best_fold[dataset] = fold
            self.best_fcs[dataset] = f_score_mean
    logging.error("[Parameters Tester][{}] Best mean FCS: {}, Best fold: {}".format(dataset, self.best_fcs[dataset], self.best_fold[dataset]))
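# FOLDS is not defined in this file; from its use above (fold[0] is passed to
# generate_k_folds_stratified, fold[1] drives the repeat loop) it is presumably a list of
# (k, repetitions) pairs. An assumed example value, for illustration only:
FOLDS_EXAMPLE = [(3, 10), (5, 6), (10, 3)]  # (number of stratified folds, repeated CV runs)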
def find_best_parameters(self, dataset):
    permutations = self.generate_permutations(dataset)
    for p, perm in enumerate(permutations):
        logging.error("[Parameters Tester][{}][Perm {:08d}] Current permutation: {}".format(dataset, p + 1, perm))
        dm = DataModel.generate_from_file(dataset, discretize_params=perm)
        classes_list = dm.get_classes_list()
        f_scores = []
        a = 1
        for _ in range(self.best_fold[dataset][1]):
            for train_set, test_set in dm.generate_k_folds_stratified(self.best_fold[dataset][0]):
                model_evaluator = ModelEvaluator(train_set, test_set, classes_list)
                model_evaluator.evaluate()
                f_scores.append(model_evaluator.get_f_score())
                logging.error("[Parameters Tester][{}][Perm {:08d}][{:03d}] FCS: {}".format(dataset, p + 1, a, f_scores[-1]))
                a += 1
        f_score_mean = sum(f_scores) / len(f_scores)
        logging.error("[Parameters Tester][{}][Perm {:08d}] Best FCS: {}, Mean FCS {}".format(dataset, p + 1, max(f_scores), f_score_mean))
        for param in perm:
            self.append_result({'dataset': dataset.name,
                                'fold': self.best_fold[dataset][0],
                                'f_score': f_score_mean,
                                'permutation': p + 1,
                                'feature': param.feature_name,
                                'function': param.discretize_function.__name__,
                                'bins': param.buckets_amount})
        if f_score_mean > self.best_fcs[dataset]:
            self.best_discretize_parameters[dataset] = perm
            self.best_fcs[dataset] = f_score_mean
    logging.error("[Parameters Tester][{}] Best mean FCS: {}, Best parameters: {}".format(dataset, self.best_fcs[dataset], self.best_discretize_parameters[dataset]))
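# Hypothetical driver (the class name ParametersTester, its constructor and the datasets
# iterable are assumptions; only the method names come from the code above): pick the best
# fold first, then search discretization parameters with that fold fixed.
if __name__ == '__main__':
    tester = ParametersTester()                   # placeholder constructor
    for ds in datasets_under_test:                # placeholder iterable of dataset descriptors
        tester.find_best_fold(ds)
        tester.find_best_parameters(ds)           # or find_best_single_feature_parameters(ds) for the cheaper per-feature search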
def results(regressors, datasets, epochs=1000, verbose=False):
    '''Evaluates each regressor on each dataset and returns (results, col_names, row_names)'''
    col_names = [''] * len(regressors)
    row_names = [''] * len(datasets)
    results = np.zeros((len(datasets), len(regressors), 2))

    for dataset_i, DatasetInitializer in enumerate(datasets):
        # initialize dataset
        dataset = DatasetInitializer()
        row_names[dataset_i] = dataset.name
        if verbose:
            print(dataset.name)

        for regressor_i, Regressor in enumerate(regressors):
            # initialize model
            regualizer = getattr(dataset.regualizer, Regressor.transform_type)
            regression = Regressor(input_size=dataset.input_size,
                                   output_size=dataset.output_size,
                                   random_state=42,
                                   regualizer=regualizer,
                                   learning_rate=dataset.learning_rate)
            col_names[regressor_i] = regression.name
            if verbose:
                print(' ' + regression.name)

            with regression as model:
                model.reset()
                model.update(dataset.train.inputs, dataset.train.targets,
                             epochs=min(epochs, dataset.epochs))

                divergence = ModelEvaluator.evaluate(model, dataset.test.inputs, dataset.test.targets)
                results[dataset_i, regressor_i, 0] = divergence

                if dataset.multi_class:
                    missrate = np.nan
                else:
                    missrate = model.error(dataset.test.inputs, dataset.test.targets)
                results[dataset_i, regressor_i, 1] = missrate

                if verbose:
                    print(' %f / %f' % (divergence, missrate))

    return (results, col_names, row_names)
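# Hypothetical usage sketch (the regressor and dataset classes named here are placeholders,
# not from the original code): pass lists of dataset-initializer and regressor classes and
# print the divergence / miss-rate table that `results` returns.
table, col_names, row_names = results(regressors=[SoftmaxRegression, LogisticRegression],   # placeholder classes
                                      datasets=[IrisDataset, MnistDataset],                 # placeholder classes
                                      epochs=500, verbose=True)
for i, row_name in enumerate(row_names):
    for j, col_name in enumerate(col_names):
        print('%s / %s: divergence=%f, missrate=%f' % (row_name, col_name, table[i, j, 0], table[i, j, 1]))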
def train_model(self, symbol):
    models = []

    # clean up previous training plots
    for file in glob.glob(f'training_plots/{symbol}/*'):
        os.remove(file)

    df = yf.get_ticker(symbol, start_date=self.start_date)

    num_time_steps_to_try = [30]
    for num_time_steps in num_time_steps_to_try:
        data_prep_params = DataPrepParameters(scaler=StandardScaler(),
                                              num_time_steps=num_time_steps,
                                              features=['change', 'sp500_change'])
        data = self.data_chef.prepare_model_data(df, data_prep_params)
        # More batch sizes can be tried: stateless LSTMs only keep state/context within a batch,
        # so batch size is an important hyperparameter to explore.
        for batch_size in [1, 5]:
            hyperparams = ModelHyperparameters(epochs=100,
                                               number_hidden_layers=2,
                                               number_units_in_hidden_layers=20,
                                               hidden_activation_fn='tanh',
                                               optimizer='adam',
                                               dropout=0,
                                               kernel_initializer="glorot_uniform",
                                               batch_size=batch_size)
            model = Trainer.train_model(symbol, data_prep_params, data, hyperparams)
            model_container = StockatronCore.__reduce_underfitting(symbol, model, hyperparams, data, data_prep_params)
            models.append(model_container)
            if model_container.train_score > 0.85:
                break

    best_fit_model_container = max(models, key=operator.attrgetter("train_score"))
    best_fit_model_container = StockatronCore.__reduce_overfitting(symbol, best_fit_model_container)

    # Only now that the model has been selected, evaluate its worth using the untouched test set
    best_fit_model_container.test_score = ModelEvaluator.evaluate(best_fit_model_container.model,
                                                                  best_fit_model_container.data.test_X,
                                                                  best_fit_model_container.data.test_y)
    print(f'Best Model for {symbol} has train score={best_fit_model_container.train_score} '
          f'validation score={best_fit_model_container.val_score} & test score={best_fit_model_container.test_score}')

    best_fit_model_container.version = f'{symbol}_{date.today().strftime("%Y-%m-%d")}'
    StockatronCore.__save_new_model(best_fit_model_container)
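# Hypothetical usage (StockatronCore's constructor is not shown above, so the start_date
# keyword argument is an assumption about its signature):
if __name__ == '__main__':
    core = StockatronCore(start_date='2015-01-01')   # assumed constructor signature
    for ticker in ['MSFT', 'AAPL']:
        core.train_model(ticker)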
def __reduce_underfitting(symbol, model, hyperparams, data, data_prep_params):
    """ Recursive method to reduce Bias & get a better Training score for the metric """
    print('-- Exploring Model Fit --')
    train_score = ModelEvaluator.evaluate(model, data.train_X, data.train_y)
    if train_score < 0.7 and hyperparams.number_hidden_layers < 3:
        if hyperparams.epochs < 800:  # first run for longer
            hyperparams.epochs += 100
        elif hyperparams.number_hidden_layers < 5:  # if still not meeting the training score threshold then increase complexity of model
            hyperparams.number_hidden_layers += 1
        model = Trainer.train_model(symbol, data_prep_params, data, hyperparams)
        return StockatronCore.__reduce_underfitting(symbol, model, hyperparams, data, data_prep_params)
    else:
        return ModelContainer(model=model,
                              hyperparams=hyperparams,
                              data_prep_params=data_prep_params,
                              data=data,
                              train_score=train_score)
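# Sketch of the ModelContainer shape implied by this file (the real definition lives elsewhere
# in the project and may differ): only the fields the functions above read or write are listed.
from dataclasses import dataclass
from typing import Any, Optional

@dataclass
class ModelContainerSketch:
    model: Any
    hyperparams: Any                      # ModelHyperparameters
    data_prep_params: Any                 # DataPrepParameters
    data: Any                             # exposes train_X/train_y, val_X/val_y, test_X/test_y
    train_score: float
    val_score: Optional[float] = None     # set by __reduce_overfitting
    test_score: Optional[float] = None    # set after model selection in train_model
    version: Optional[str] = None         # set just before __save_new_model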