def find_optimal_featureset(self, master_df, cv_df, features_list):
    """Pick the feature combination that scores best on the validation frame.

    Feature combinations are generated for both ``master_df`` and ``cv_df``
    through ``ModelEvaluator.generate_feature_combinations``.  For each pair a
    fresh ``LinearRegression`` is fitted on the training combination and
    scored once on the matching validation combination with ``model.score``
    (a single hold-out score; no k-fold cross validation is run here).

    Returns a tuple ``(best_training_dataframe, best_score)``.  The first
    combination wins ties.
    """
    evaluator = ModelEvaluator()
    train_candidates = evaluator.generate_feature_combinations(features_list, master_df)
    cv_candidates = evaluator.generate_feature_combinations(features_list, cv_df)
    candidate_pairs = zip(train_candidates, cv_candidates)

    target = master_df['song_hotttnesss']
    cv_target = cv_df['song_hotttnesss']

    scores = []
    for train_features, cv_features in candidate_pairs:
        regressor = LinearRegression()
        regressor.fit(train_features, target)
        scores.append(regressor.score(cv_features, cv_target))

    best_idx = scores.index(max(scores))
    return train_candidates[best_idx], scores[best_idx]
def setUp(self):
    """Load the model and vocabulary fixtures, then build the evaluator under test."""
    self.model = lda_model = Model(20)
    lda_model.load('../testdata/lda_model')

    self.vocabulary = vocab = Vocabulary()
    vocab.load('../testdata/vocabulary.dat')

    self.model_evaluator = ModelEvaluator(lda_model, vocab)
def __reduce_overfitting(symbol, model_container):
    """Raise dropout until the train/validation gap is acceptable.

    Each round re-scores the current model on the training and validation
    splits and stores both scores on ``model_container``.  The model is
    retrained with ``dropout`` increased by 0.2 while ALL of the following
    hold:

    * the training score is above 0.65 (otherwise the model is not worth
      regularising),
    * dropout is still below 0.55,
    * the validation score is more than 15% (relative) below the training
      score.

    Args:
        symbol: identifier forwarded unchanged to ``Trainer.train_model``.
        model_container: object carrying ``model``, ``data``, ``hyperparams``
            and ``data_prep_params``; it is mutated in place.

    Returns:
        The same ``model_container``, holding the last trained model and its
        latest ``train_score`` / ``val_score``.
    """
    # Iterative form of the original tail recursion: same sequence of
    # evaluations, prints and retrainings, without re-entering the function.
    while True:
        print('-- Exploring Model Generalization --')
        print('On Trainng data...')
        model_container.train_score = ModelEvaluator.evaluate(
            model_container.model, model_container.data.train_X,
            model_container.data.train_y)
        print('On Test data...')
        model_container.val_score = ModelEvaluator.evaluate(
            model_container.model, model_container.data.val_X,
            model_container.data.val_y)
        print(
            f'Train score: {model_container.train_score} & Validation score: {model_container.val_score}'
        )

        train_score = model_container.train_score
        val_score = model_container.val_score
        # The cheap guards come first so the relative-gap division is never
        # evaluated for a zero (or otherwise too low) training score.
        should_regularise = (
            train_score > 0.65
            and model_container.hyperparams.dropout < 0.55
            and (train_score - val_score) / train_score > 0.15
        )
        if not should_regularise:
            return model_container

        model_container.hyperparams.dropout += 0.2
        model_container.model = Trainer.train_model(
            symbol, model_container.data_prep_params, model_container.data,
            model_container.hyperparams)
def find_best_single_feature_parameters(self, dataset):
    """Search, one feature at a time, for discretization parameters that beat the baseline.

    For every feature in ``dataset.suggested_discretize_features`` each
    candidate produced by ``generate_feature_parameters`` is evaluated with
    the repeated stratified k-fold setup stored in ``self.best_fold[dataset]``
    (index 0: number of folds, index 1: number of repetitions).  A candidate
    is kept only if its mean F-score is strictly above the best mean seen so
    far, which starts at ``self.best_fcs[dataset]``.  The first element of the
    winning candidate is appended to
    ``self.best_discretize_feature_params[dataset]``.
    """
    for feature in dataset.suggested_discretize_features:
        candidates = self.generate_feature_parameters(feature)
        print(candidates)
        top_mean = self.best_fcs[dataset]
        top_candidate = None

        for position, candidate in enumerate(candidates):
            perm_no = position + 1
            logging.error("[Parameters Tester][{}][{}][Perm {:03d}] Current permutation: {}".format(dataset, feature, perm_no, candidate))
            data_model = DataModel.generate_from_file(dataset, discretize_params=candidate)
            class_labels = data_model.get_classes_list()

            fold_scores = []
            run_no = 1  # running counter over every fold of every repetition
            for _ in range(self.best_fold[dataset][1]):
                for train_part, test_part in data_model.generate_k_folds_stratified(self.best_fold[dataset][0]):
                    evaluator = ModelEvaluator(train_part, test_part, class_labels)
                    evaluator.evaluate()
                    fold_scores.append(evaluator.get_f_score())
                    logging.error("[Parameters Tester][{}][{}][Perm {:03d}][{:03d}] FCS: {}".format(dataset, feature, perm_no, run_no, fold_scores[-1]))
                    run_no += 1

            mean_score = sum(fold_scores) / len(fold_scores)
            logging.error("[Parameters Tester][{}][{}][Perm {:03d}] Best FCS: {}, Mean FCS {}".format(dataset, feature, perm_no, max(fold_scores), mean_score))
            if mean_score > top_mean:
                top_candidate = candidate[0]
                top_mean = mean_score

        if top_candidate is not None:
            self.best_discretize_feature_params[dataset].append(top_candidate)
        logging.error("[Parameters Tester][{}][{}] Best mean FCS: {}, Best parameters: {}".format(dataset, feature, top_mean, top_candidate))
def run_model(model_name, data):
    """Build the named model on ``data`` and print its cross-validated accuracy.

    The data is imported into a ``DataHandler``, targets are created with
    ``create_targets(-1)``, and the model is scored with
    ``n_time_k_cross_fold(10, 5)``; the returned accuracy and standard
    deviation are printed.
    """
    handler = DataHandler()
    handler.import_data(data)
    handler.create_targets(-1)

    evaluator = ModelEvaluator(build_model(model_name, handler))
    accuracy, deviation = evaluator.n_time_k_cross_fold(10, 5)
    print('Accuracy: {}\nStandard Deviation: {}\n'.format(accuracy, deviation))
def hyperparameter(regressors, datasets, regualizer_values, epochs=1000,
                   n_splits=5, verbose=False):
    """Grid-evaluate every regressor on every dataset for each regularizer value.

    Args:
        regressors: regressor classes; each is instantiated per dataset and
            regularizer value and used as a context manager.
        datasets: zero-argument callables returning a dataset object.
        regualizer_values: regularizer strengths to try.
        epochs: upper bound on training epochs (capped by ``dataset.epochs``).
        n_splits: number of folds passed to ``ModelEvaluator.all_folds``.
        verbose: print progress and the mean divergence per setting.

    Returns:
        ``(results, col_names, row_names)`` where ``results`` has shape
        ``(len(datasets), len(regressors), len(regualizer_values), n_splits)``,
        ``col_names`` holds the regressor names and ``row_names`` the dataset
        names.
    """
    row_names = [''] * len(datasets)
    col_names = [''] * len(regressors)
    shape = (len(datasets), len(regressors), len(regualizer_values), n_splits)
    results = np.zeros(shape)

    for d_idx, make_dataset in enumerate(datasets):
        dataset = make_dataset()
        row_names[d_idx] = dataset.name
        if verbose:
            print(dataset.name)

        epoch_budget = min(epochs, dataset.epochs)
        for r_idx, regressor_cls in enumerate(regressors):
            for v_idx, strength in enumerate(regualizer_values):
                regression = regressor_cls(
                    input_size=dataset.input_size,
                    output_size=dataset.output_size,
                    random_state=42,
                    regualizer=strength,
                    learning_rate=dataset.learning_rate)
                col_names[r_idx] = regression.name
                # The regressor name is announced once, on its first setting.
                if v_idx == 0 and verbose:
                    print(' ' + regression.name)

                with regression as model:
                    evaluator = ModelEvaluator(model, dataset.train,
                                               epochs=epoch_budget,
                                               random_state=42)
                    divergence = evaluator.all_folds(
                        n_splits=n_splits, stratified=dataset.stratified)
                    if verbose:
                        print(' %e: %f' % (strength, np.mean(divergence)))
                    results[d_idx, r_idx, v_idx, :] = divergence

    return (results, col_names, row_names)
def evaluate(self, *, images: np.ndarray = None, folder_path: str = None) -> None:
    """Run a ``ModelEvaluator`` over the loaded model.

    ``images`` and ``folder_path`` are keyword-only and forwarded unchanged to
    ``ModelEvaluator``; how they are interpreted (including the behaviour when
    both are ``None``) is decided there.

    Raises:
        ModelNotLoadedError: if no model has been loaded yet.
    """
    if self._model_loaded:
        ModelEvaluator(self._model, images=images,
                       folder_path=folder_path).evaluate()
        return
    raise ModelNotLoadedError(
        'You have to load the model before evaluating it.')
def _check_evaluation(preprocessor, model, metrics: Dict[str, float]):
    """Assert that ``ModelEvaluator.compare`` reports the expected metrics.

    The comparison is run twice with differently named data frames (all of
    them ``preprocessor.train_encoded_df``) to show that the frame names are
    arbitrary.  Every metric in ``metrics`` must be a column of the result and
    match the expected value within an absolute tolerance of 0.05.
    """
    evaluator = ModelEvaluator(metrics_class=BinaryClassificationMetrics)

    # Two naming schemes for the same data: names must not matter to compare().
    for frame_names in (['train', 'test'], ['train1', 'test1']):
        frames = {
            frame_name: preprocessor.train_encoded_df
            for frame_name in frame_names
        }
        comparison = evaluator.compare(data_frames=frames, models=[model])
        assert isinstance(comparison, pandas.DataFrame)

        for metric, expected in metrics.items():
            assert metric in comparison
            for frame_name in frame_names:
                observed = comparison[metric][evaluator.index_key(frame_name, model)]
                assert observed == pytest.approx(expected, abs=0.05)
def find_best_fold(self, dataset):
    """Evaluate every entry of ``FOLDS`` and remember the best-scoring one.

    Each entry is indexed as ``fold[0]`` (number of stratified folds) and
    ``fold[1]`` (number of repetitions).  The mean F-score over all runs is
    recorded with ``append_result``; when it is strictly above
    ``self.best_fcs[dataset]`` both ``self.best_fold[dataset]`` and
    ``self.best_fcs[dataset]`` are updated.
    """
    data_model = DataModel.generate_from_file(dataset)
    class_labels = data_model.get_classes_list()

    for fold in FOLDS:
        fold_count = fold[0]
        run_scores = []
        run_no = 1  # running counter over every fold of every repetition
        for _ in range(fold[1]):
            for train_part, test_part in data_model.generate_k_folds_stratified(fold_count):
                evaluator = ModelEvaluator(train_part, test_part, class_labels)
                evaluator.evaluate()
                run_scores.append(evaluator.get_f_score())
                logging.error("[Parameters Tester][{}][CV{:02d}][{:03d}] FCS: {}".format(dataset, fold_count, run_no, run_scores[-1]))
                run_no += 1

        mean_score = sum(run_scores) / len(run_scores)
        logging.error("[Parameters Tester][{}][CV{:02d}] Best FCS: {}, Mean FCS {}".format(dataset, fold_count, max(run_scores), mean_score))
        self.append_result({
            'dataset': dataset.name,
            'fold': fold_count,
            'f_score': mean_score,
            'permutation': -1,
        })
        if mean_score > self.best_fcs[dataset]:
            self.best_fold[dataset] = fold
            self.best_fcs[dataset] = mean_score

    logging.error("[Parameters Tester][{}] Best mean FCS: {}, Best fold: {}".format(dataset, self.best_fcs[dataset], self.best_fold[dataset]))
def test_model(n_gram_mins):
    """Train one of the best-fit models on the slack dialogue data and score it.

    Features are extracted with n-gram sizes 1-4 and the given ``n_gram_mins``.
    One of the models returned by ``search_initial_best_fit_algorithm`` is
    picked at random, fitted on the training split and scored on the
    validation split.

    Returns:
        ``(accuracy_score, model_array)`` - the validation accuracy and the
        full list of candidate models (not only the one that was used).
    """
    extractor = FeatureExtractor("../dataset/slack_dialogue.txt",
                                 n_grams=[1, 2, 3, 4],
                                 n_gram_mins=n_gram_mins,
                                 debug=False)
    extractor.load()

    evaluator = ModelEvaluator(extractor.headers, extractor.features)
    model_array, _highest_rate = evaluator.search_initial_best_fit_algorithm()

    pick = random.randint(0, len(model_array) - 1)
    builder = ModelBuilder(model_array[pick])

    X_train, X_validation, Y_train, Y_validation = evaluator.split_dataset()
    builder.fit_model(X_train, Y_train)
    accuracy_score = builder.accuracy_score(X_validation, Y_validation)

    print("Got score: " + str(accuracy_score) + " with model: " + str(model_array))
    print("Using : " + str(n_gram_mins))
    return accuracy_score, model_array
def load(self, select_new_best_model=False):
    """Reload the dataset, rebuild the evaluator and refit the chosen model.

    Useful when the dataset file is updated automatically.

    Args:
        select_new_best_model: when true, re-run the best-fit search and pick
            one of the returned models at random before fitting.  When false,
            the previously created ``self.mb`` is reused, so the first call on
            an instance must pass ``True``.

    Side effects:
        Sets ``self.me``, the train/validation splits and
        ``self.accuracy_score``; with ``select_new_best_model`` also
        ``self.model_array``, ``self.highest_rate``, ``self.chosen_model`` and
        ``self.mb``.
    """
    self.fe.load()
    # One evaluator serves both the optional model search and the split below;
    # the original constructed an identical second instance for the search.
    self.me = ModelEvaluator(self.fe.headers, self.fe.features)
    if select_new_best_model:
        self.model_array, self.highest_rate = self.me.search_initial_best_fit_algorithm()
        self.chosen_model = self.model_array[
            random.randint(0, len(self.model_array) - 1)]
        self.mb = ModelBuilder(self.chosen_model)
    self.X_train, self.X_validation, self.Y_train, self.Y_validation = self.me.split_dataset()
    self.mb.fit_model(self.X_train, self.Y_train)
    self.accuracy_score = self.mb.accuracy_score(self.X_validation, self.Y_validation)
def find_best_parameters(self, dataset):
    """Evaluate every discretization permutation and keep the best-scoring one.

    Each permutation from ``generate_permutations`` is scored with the
    repeated stratified k-fold setup in ``self.best_fold[dataset]`` (index 0:
    folds, index 1: repetitions).  One result row per parameter of the
    permutation is recorded with ``append_result``.  A permutation whose mean
    F-score is strictly above ``self.best_fcs[dataset]`` replaces
    ``self.best_discretize_parameters[dataset]``.
    """
    for position, perm in enumerate(self.generate_permutations(dataset)):
        perm_no = position + 1
        logging.error("[Parameters Tester][{}][Perm {:08d}] Current permutation: {}".format(dataset, perm_no, perm))
        data_model = DataModel.generate_from_file(dataset, discretize_params=perm)
        class_labels = data_model.get_classes_list()

        run_scores = []
        run_no = 1  # running counter over every fold of every repetition
        for _ in range(self.best_fold[dataset][1]):
            for train_part, test_part in data_model.generate_k_folds_stratified(self.best_fold[dataset][0]):
                evaluator = ModelEvaluator(train_part, test_part, class_labels)
                evaluator.evaluate()
                run_scores.append(evaluator.get_f_score())
                logging.error("[Parameters Tester][{}][Perm {:08d}][{:03d}] FCS: {}".format(dataset, perm_no, run_no, run_scores[-1]))
                run_no += 1

        mean_score = sum(run_scores) / len(run_scores)
        logging.error("[Parameters Tester][{}][Perm {:08d}] Best FCS: {}, Mean FCS {}".format(dataset, perm_no, max(run_scores), mean_score))

        for param in perm:
            self.append_result({
                'dataset': dataset.name,
                'fold': self.best_fold[dataset][0],
                'f_score': mean_score,
                'permutation': perm_no,
                'feature': param.feature_name,
                'function': param.discretize_function.__name__,
                'bins': param.buckets_amount,
            })

        if mean_score > self.best_fcs[dataset]:
            self.best_discretize_parameters[dataset] = perm
            self.best_fcs[dataset] = mean_score

    logging.error("[Parameters Tester][{}] Best mean FCS: {}, Best parameters: {}".format(dataset, self.best_fcs[dataset], self.best_discretize_parameters[dataset]))
def results(regressors, datasets, epochs=1000, verbose=False):
    """Train every regressor on every dataset and collect its test scores.

    Nothing is written to disk; the scores are returned.

    Args:
        regressors: regressor classes exposing ``transform_type``; instances
            are used as context managers.
        datasets: zero-argument callables returning a dataset object.
        epochs: upper bound on training epochs (capped by ``dataset.epochs``).
        verbose: print progress and the scores per combination.

    Returns:
        ``(results, col_names, row_names)`` where ``results`` has shape
        ``(len(datasets), len(regressors), 2)``: slot 0 is the value returned
        by ``ModelEvaluator.evaluate`` on the test split, slot 1 is
        ``model.error`` on the test split (``nan`` for multi-class datasets).
    """
    row_names = [''] * len(datasets)
    col_names = [''] * len(regressors)
    scores = np.zeros((len(datasets), len(regressors), 2))

    for d_idx, make_dataset in enumerate(datasets):
        dataset = make_dataset()
        row_names[d_idx] = dataset.name
        if verbose:
            print(dataset.name)

        for r_idx, regressor_cls in enumerate(regressors):
            # The regularizer strength is stored per dataset, keyed by the
            # regressor's transform type.
            strength = getattr(dataset.regualizer, regressor_cls.transform_type)
            regression = regressor_cls(
                input_size=dataset.input_size,
                output_size=dataset.output_size,
                random_state=42,
                regualizer=strength,
                learning_rate=dataset.learning_rate)
            col_names[r_idx] = regression.name
            if verbose:
                print(' ' + regression.name)

            with regression as model:
                model.reset()
                model.update(dataset.train.inputs, dataset.train.targets,
                             epochs=min(epochs, dataset.epochs))

                divergence = ModelEvaluator.evaluate(
                    model, dataset.test.inputs, dataset.test.targets)
                scores[d_idx, r_idx, 0] = divergence

                missrate = (
                    np.nan if dataset.multi_class
                    else model.error(dataset.test.inputs, dataset.test.targets)
                )
                scores[d_idx, r_idx, 1] = missrate

                if verbose:
                    print(' %f / %f' % (divergence, missrate))

    return (scores, col_names, row_names)
def train_model(self, symbol):
    """Train, tune and persist the best model for ``symbol``.

    Steps:
      1. Delete the plots left in ``training_plots/<symbol>/`` by earlier runs.
      2. For each time-step setting and batch size, train a model and pass it
         through ``__reduce_underfitting``.
      3. Pick the candidate with the highest training score and pass it
         through ``__reduce_overfitting``.
      4. Score the winner once on the test split, stamp it with a version
         string ``<symbol>_<YYYY-MM-DD>`` and save it.
    """
    for stale_plot in glob.glob(f'training_plots/{symbol}/*'):
        os.remove(stale_plot)

    price_history = yf.get_ticker(symbol, start_date=self.start_date)
    candidates = []

    for num_time_steps in [30]:
        data_prep_params = DataPrepParameters(
            scaler=StandardScaler(),
            num_time_steps=num_time_steps,
            features=['change', 'sp500_change'])
        data = self.data_chef.prepare_model_data(price_history, data_prep_params)

        # Stateless LSTMs only keep state/context within a batch, so the batch
        # size is an important hyperparameter; more sizes can be tried here.
        for batch_size in [1, 5]:
            hyperparams = ModelHyperparameters(
                epochs=100,
                number_hidden_layers=2,
                number_units_in_hidden_layers=20,
                hidden_activation_fn='tanh',
                optimizer='adam',
                dropout=0,
                kernel_initializer="glorot_uniform",
                batch_size=batch_size)
            trained = Trainer.train_model(symbol, data_prep_params, data,
                                          hyperparams)
            candidate = StockatronCore.__reduce_underfitting(
                symbol, trained, hyperparams, data, data_prep_params)
            candidates.append(candidate)
            # Good enough: skip the remaining batch sizes for this setting.
            if candidate.train_score > 0.85:
                break

    winner = max(candidates, key=operator.attrgetter("train_score"))
    winner = StockatronCore.__reduce_overfitting(symbol, winner)

    # Only now that the model has been selected, evaluate its worth using the
    # untouched test set.
    winner.test_score = ModelEvaluator.evaluate(
        winner.model, winner.data.test_X, winner.data.test_y)
    print(
        f'Best Model for {symbol} has train score={winner.train_score} validation score={winner.val_score} & test score={winner.test_score}'
    )
    winner.version = f'{symbol}_{date.today().strftime("%Y-%m-%d")}'
    StockatronCore.__save_new_model(winner)
def __reduce_underfitting(symbol, model, hyperparams, data, data_prep_params):
    """Retrain with more capacity until the training score is acceptable.

    While the training score is below 0.7 and the model has fewer than 3
    hidden layers, the model is retrained: first with 100 more epochs per
    round (while ``epochs`` is below 800), afterwards with one more hidden
    layer per round.  ``hyperparams`` is mutated in place.

    Returns:
        A ``ModelContainer`` wrapping the final model, the hyperparameters,
        the data and the last measured training score.
    """
    # Loop form of the original self-recursion; each round prints the banner
    # and re-evaluates exactly as one recursive call did.
    while True:
        print('-- Exploring Model Fit --')
        train_score = ModelEvaluator.evaluate(model, data.train_X, data.train_y)

        needs_more_fit = train_score < 0.7 and hyperparams.number_hidden_layers < 3
        if not needs_more_fit:
            return ModelContainer(model=model,
                                  hyperparams=hyperparams,
                                  data_prep_params=data_prep_params,
                                  data=data,
                                  train_score=train_score)

        if hyperparams.epochs < 800:
            # First try simply training for longer.
            hyperparams.epochs += 100
        elif hyperparams.number_hidden_layers < 5:
            # Still short of the threshold: make the model more complex.
            hyperparams.number_hidden_layers += 1
        model = Trainer.train_model(symbol, data_prep_params, data, hyperparams)
class ModelEvaluatorTest(unittest.TestCase):
    """Tests for ``ModelEvaluator`` backed by the fixtures under ../testdata."""

    def setUp(self):
        """Load the model and vocabulary fixtures and build the evaluator."""
        self.model = lda_model = Model(20)
        lda_model.load('../testdata/lda_model')

        self.vocabulary = vocab = Vocabulary()
        vocab.load('../testdata/vocabulary.dat')

        self.model_evaluator = ModelEvaluator(lda_model, vocab)

    def test_compute_loglikelihood(self):
        """The log-likelihood of two copies of a fixed document is stable."""
        tokens = [
            'macbook', 'ipad',     # exist in vocabulary and model
            'mac os x', 'chrome',  # only exist in vocabulary
            'nokia', 'null',       # inexistent
        ]
        seeded_rng = random.Random(0)  # fixed seed keeps the expected value reproducible

        document = Document(self.model.num_topics)
        document.parse_from_tokens(tokens, seeded_rng, self.vocabulary, self.model)

        observed = self.model_evaluator.compute_loglikelihood([document, document])
        self.assertEqual(-14.113955684239654, observed)
def evaluate_model_with_mean_absolute_error(self, prediction: np.ndarray):
    """Return the mean absolute error of ``prediction`` against ``self.target``.

    The computation is delegated to
    ``ModelEvaluator(self.target, prediction).evaluate_mean_absolute_error()``.
    """
    evaluator = ModelEvaluator(self.target, prediction)
    return evaluator.evaluate_mean_absolute_error()
class AlphaLayer:
    """Classifier front-end: extracts features, picks a model and parses item requests."""

    def __init__(self, path, debug=False, count_algorithm=StandardPresenceBoolean(), load_spacy=True):
        """Create the feature extractor for ``path`` and train an initial model.

        Args:
            path: dataset file to read (and append to, see ``add_line``).
            debug: print the chosen classifier and its accuracy after loading.
            count_algorithm: counting strategy handed to ``FeatureExtractor``.
            load_spacy: set to ``False`` to skip loading spacy.  This is not
                meant for production and should only be used to speed up
                debug runs.
        """
        self.spacy_loaded = load_spacy
        self.path = path
        self.debug = debug
        self.fe = FeatureExtractor(path, self.debug, count_algorithm=count_algorithm, load_spacy=load_spacy)
        self.load(True)
        if self.debug:
            print("Accuracy score: " + str(
                self.accuracy_score) + " with classifier " + self.chosen_model + " out of " + str(self.model_array))

    def load(self, select_new_best_model=False):
        """Reload the dataset, rebuild the evaluator and refit the chosen model.

        Useful when there are automated updates to datasets.  With
        ``select_new_best_model`` the best-fit search is re-run and one of the
        returned models is picked at random; otherwise the existing
        ``self.mb`` is refitted.
        """
        self.fe.load()
        # One evaluator serves both the optional model search and the split
        # below; the original constructed an identical second instance.
        self.me = ModelEvaluator(self.fe.headers, self.fe.features)
        if select_new_best_model:
            self.model_array, self.highest_rate = self.me.search_initial_best_fit_algorithm()
            self.chosen_model = self.model_array[
                random.randint(0, len(self.model_array) - 1)]
            self.mb = ModelBuilder(self.chosen_model)
        self.X_train, self.X_validation, self.Y_train, self.Y_validation = self.me.split_dataset()
        self.mb.fit_model(self.X_train, self.Y_train)
        self.accuracy_score = self.mb.accuracy_score(self.X_validation, self.Y_validation)

    def change_path(self, path):
        """Change the dataset path on this object and on its feature extractor."""
        self.path = path
        self.fe.path = path

    def add_line(self, line):
        """Append ``line`` to the dataset file.

        Caution: no formatting checks are done in this method.
        """
        with open(self.path, "a") as datafile:
            datafile.write('\n' + line)

    def handle_buy_item(self, sentence):
        """Return the reply for a 'buy' sentence, listing the parsed items."""
        return "Got classifier: buy\nThank you for purchasing " + self.evaluate(sentence)

    def handle_open_shop(self, sentence):
        """Return the fixed reply for a 'shop' sentence."""
        return "Got classifier: shop\nHere you go, take a look at my wares.\n"

    def handle_conversation(self, sentence):
        """Return the fixed reply for a 'convo' sentence."""
        return "Got classifier: convo\nI don't feel like talking to you"

    def handle_undo(self, sentence):
        """Return the fixed reply for an 'undo' sentence."""
        return "Got classifier: undo\nReally mate? You sure you want to take it back?"

    def handle_sell(self, sentence):
        """Return the reply for a 'sell' sentence, listing the parsed items."""
        return "Got classifier: sell\nI'll gladly accept your " + self.evaluate(sentence)

    @staticmethod
    def _write_item(buffer, quantity_words, item_name, terminator=""):
        """Write one ``"<amount> <item>"`` entry, defaulting the amount to 1."""
        quantity_words = quantity_words.strip()
        amount = parse(quantity_words) if quantity_words != "" else 1
        buffer.write(str(amount))
        buffer.write(" ")
        buffer.write(item_name.strip())
        if terminator:
            buffer.write(terminator)

    def evaluate(self, line):
        """Extract ``"<amount> <item>; <amount> <item>"`` from a sentence.

        The sentence is tagged with the spacy parser of the feature extractor.
        Nouns and proper nouns accumulate into the current item name, numerals
        into the current quantity.  A conjunction, comma or semicolon closes
        the current item; whatever is pending at the end of the sentence is
        written without a trailing separator.

        Compared with the earlier implementation, every closed item is handled
        the same way:
          * an item without a quantity gets amount 1 (previously only the last
            item did; earlier ones passed an empty string to ``parse``),
          * the quantity buffer is cleared after each item, so numerals no
            longer leak into the following item,
          * a comma or semicolon with no pending item writes nothing.
        """
        doc = self.fe.parser(unicode(line))
        items = StringIO()
        item_name = ""
        quantity_words = ""
        for word in doc:
            if word.pos_ == u'NOUN' or word.pos_ == u'PROPN':
                # Probably the thing being bought or sold.
                item_name += word.text + " "
            if word.pos_ == u'NUM':
                # This is an amount.
                quantity_words += word.text + " "

            closes_item = word.pos_ == u'CONJ' or (
                word.pos_ == u'PUNCT' and word.text in (u';', u','))
            # Only terminate if an item was actually specified.
            if closes_item and item_name != "":
                self._write_item(items, quantity_words, item_name, "; ")
                item_name = ""
                quantity_words = ""

        if item_name != "":
            self._write_item(items, quantity_words, item_name)
        return items.getvalue()
def main():
    """Run the housing-data regression experiments and print every result.

    Sections, in order: data analysis and Pearson correlations, linear
    regression, ridge regression for three lambdas, 10-fold cross validation
    over six lambdas, three feature-selection strategies, quadratic feature
    expansion, and finally a histogram of all attributes (blocks until the
    plot window is closed).

    Every 7th row of ``data/housing.data`` is held out as the test set; the
    last column (``medv``) is the regression target.
    """
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG if DEBUG else logging.INFO)
    dataFile = "data/housing.data"
    col_names = ["crim", "zn", "indus", "chas", "nox", "rm", "age", "dis", "rad", "tax", "ptratio", "b", "lstat", "medv"]
    train_df = pd.read_csv(dataFile, names=col_names, delim_whitespace=True)
    test_df = train_df.iloc[::7, :]
    train_df.drop(train_df.index[::7], inplace=True)
    train_df_features = train_df.iloc[:, :-1]
    train_df_targets = train_df.iloc[:, -1]
    test_df_features = test_df.iloc[:, :-1]
    test_df_targets = test_df.iloc[:, -1]
    target_col = col_names[-1]

    # ---- Data analysis ----------------------------------------------------
    print("Data analysis:")
    print("No. of attributes: ", len(train_df.iloc[0]))
    print("No. of features usable for classifcation: ", len(train_df.iloc[0])-1)
    print("Size of training data: ", len(train_df))
    print("Size of testing data: ", len(test_df))
    print("Histogram of attributes will be shown at the end of generating all results")
    print("\nPearson correlations:")
    for col in col_names:
        if col.lower() == 'chas':  # categorical. Also, see dtypes
            continue
        print("Correlation of %s with target(%s): %f" % (col, target_col, train_df[[col, target_col]].corr(method='pearson').iloc[0, 1]))

    # The normalizer is built from the training features and applied to both
    # the training and the test features.
    normalizer = DataFrameStdNormalizer(train_df_features)
    train_df_features_normalized = normalizer.get_normalized_data(train_df_features)
    test_df_features_normalized = normalizer.get_normalized_data(test_df_features)

    # ---- Linear regression ------------------------------------------------
    print("\n*********************Linear Regression*******************")
    regmodel = LinearRegression()
    evaluator = ModelEvaluator(regmodel)
    regmodel.train(train_df_features_normalized, train_df_targets)
    trainingError = evaluator.mean_squared_error(train_df_features_normalized, train_df_targets)
    print("Mean squared error on training data: %f" % trainingError)
    print("Mean squared error on test data: %f" % evaluator.mean_squared_error(test_df_features_normalized, test_df_targets))

    # ---- Ridge regression -------------------------------------------------
    print("\n***********Ridge regression with lambda 0.01m 0.1, 1.0***************")
    for lambdaval in (0.01, 0.1, 1.0):
        regmodel = RidgeRegression(lambdaval)
        evaluator = ModelEvaluator(regmodel)
        regmodel.train(train_df_features_normalized, train_df_targets)
        trainingError = evaluator.mean_squared_error(train_df_features_normalized, train_df_targets)
        testingError = evaluator.mean_squared_error(test_df_features_normalized, test_df_targets)
        print("Ridge regression model with lambda = %f" % lambdaval)
        print("Mean squared error on training data = %f" % trainingError)
        print("Mean squared error on test data = %f" % testingError)
        print("")

    # ---- Cross validation -------------------------------------------------
    print("\n*********************Cross Validation*******************")
    # Shuffle the rows, then build features/targets FROM THE SHUFFLED frame
    # (the original sliced the unshuffled frame, discarding the shuffle).
    shuffled_train_df = train_df.reindex(np.random.permutation(train_df.index))
    shuffled_train_df_features = shuffled_train_df.iloc[:, :-1]
    shuffled_train_df_targets = shuffled_train_df.iloc[:, -1]
    shuffled_train_df_features_normalized = (DataFrameStdNormalizer(shuffled_train_df_features)).get_normalized_data(shuffled_train_df_features)

    num_folds = 10
    # Integer division: the value is used as a positional slice bound.
    chunksize = len(train_df) // num_folds
    lambda_error_map = {}
    for exponent in range(0, 6):
        lambdaval = float(10.0) / (10**exponent)
        mean_cv_error = 0
        regmodel = RidgeRegression(lambdaval)
        evaluator = ModelEvaluator(regmodel)
        for fold in range(0, num_folds):
            start = fold*chunksize
            stop = start + chunksize
            held_out_labels = shuffled_train_df_features_normalized.index[start:stop]
            test_df_cv = shuffled_train_df_features_normalized.iloc[start:stop]
            test_df_cv_targets = shuffled_train_df_targets.iloc[start:stop]
            train_df_cv = shuffled_train_df_features_normalized.drop(held_out_labels)
            train_df_cv_targets = shuffled_train_df_targets.drop(shuffled_train_df_targets.index[start:stop])
            regmodel.train(train_df_cv, train_df_cv_targets)
            mean_cv_error += evaluator.mean_squared_error(test_df_cv, test_df_cv_targets)
        mean_cv_error /= num_folds
        print("MSE for lambda %f = %f" % (lambdaval, mean_cv_error))
        lambda_error_map[lambdaval] = mean_cv_error

    lambdabest = min(lambda_error_map, key=lambda_error_map.get)
    print("Lowest MSE for lambda = %f" % lambdabest)
    regmodel = RidgeRegression(lambdabest)
    regmodel.train(train_df_features_normalized, train_df_targets)
    evaluator = ModelEvaluator(regmodel)
    test_meansquarederror = evaluator.mean_squared_error(test_df_features_normalized, test_df_targets)
    print("Test error for model with lambda %f = %f" % (lambdabest, test_meansquarederror))
    print("")

    # ---- Feature selection ------------------------------------------------
    print("\n*********************Feature Selection*******************")
    print("*********************i. Max correlation*******************")
    corr = {}
    for col in col_names:
        if col.lower() == 'chas':  # categorical. Also, see dtypes
            continue
        corr[col] = abs(train_df[[col, target_col]].corr(method='pearson').iloc[0, 1])
    # The target column is part of `corr` too; the [1:] drops the first
    # (highest-correlation) entry so that four columns remain.
    maxcorrcols = heapq.nlargest(5, corr, key=corr.get)[1:]
    print("Selecting the following coluns with max correlation: ")
    print(maxcorrcols)
    regmodel = LinearRegression()
    regmodel.train(train_df_features[maxcorrcols], train_df_targets)
    evaluator = ModelEvaluator(regmodel)
    trainingError = evaluator.mean_squared_error(train_df_features[maxcorrcols], train_df_targets)
    print("Mean squared error on training data: %f" % trainingError)
    print("Mean squared error on test data: %f" % evaluator.mean_squared_error(test_df_features[maxcorrcols], test_df_targets))

    print("*******************ii. Max correlation with residue*****************")
    residue = train_df_targets.copy(deep=True)
    cols = []
    regmodel = LinearRegression()
    evaluator = ModelEvaluator(regmodel)
    for _ in range(0, 4):
        corr = {}
        for col in col_names:
            if col.lower() in ('medv', 'chas') or col in cols:  # target / categorical / already taken
                continue
            corr[col] = abs(pd.concat([train_df[[col]], residue], axis=1).corr(method='pearson').iloc[0, 1])
        maxcorrcol = max(corr, key=corr.get)
        cols.append(maxcorrcol)
        print("Taking cols: %s" % maxcorrcol)
        regmodel.train(train_df_features[cols], train_df_targets)
        # Refresh the residue row by row with the newly fitted model.
        for row in range(0, len(residue)):
            residue.at[residue.index[row]] = train_df_targets.iloc[row] - regmodel.predict(train_df_features[cols].iloc[row])
    print("Mean squared error on train data: %f" % evaluator.mean_squared_error(train_df_features[cols], train_df_targets))
    print("Mean squared error on test data: %f" % evaluator.mean_squared_error(test_df_features[cols], test_df_targets))

    print("*********************iii. All 4 feature combinations*******************")
    bestcols = None
    besttrainmse = 999999
    regmodel = LinearRegression()
    evaluator = ModelEvaluator(regmodel)
    for combo in itertools.combinations(train_df_features_normalized.columns, 4):
        combo_cols = list(combo)
        regmodel.train(train_df_features_normalized[combo_cols], train_df_targets)
        mse_train = evaluator.mean_squared_error(train_df_features_normalized[combo_cols], train_df_targets)
        if mse_train < besttrainmse:
            bestcols = combo_cols
            besttrainmse = mse_train
    print("Best training MSE = %f for columns:" % besttrainmse)
    print(bestcols)
    regmodel.train(train_df_features_normalized[bestcols], train_df_targets)
    # Score the test set on the columns the model was just trained on
    # (the original used the last combination tried instead of the best one).
    print("Testing MSE of this model: %f" % evaluator.mean_squared_error(test_df_features_normalized[bestcols], test_df_targets))

    # ---- Feature expansion ------------------------------------------------
    print("\n*********************Feature Expansion*******************")
    df_train_featuregen = train_df_features_normalized.copy(deep=True)
    df_test_featuregen = test_df_features_normalized.copy(deep=True)
    # Add every pairwise product and every square of the normalized features.
    base_columns = train_df_features_normalized.columns
    column_pairs = [list(pair) for pair in itertools.combinations(base_columns, 2)]
    column_pairs += [[col, col] for col in base_columns]
    for first, second in column_pairs:
        df_train_featuregen[first+second] = df_train_featuregen[first]*df_train_featuregen[second]
        df_test_featuregen[first+second] = df_test_featuregen[first]*df_test_featuregen[second]
    regmodel = LinearRegression()
    regmodel.train(df_train_featuregen, train_df_targets)
    evaluator = ModelEvaluator(regmodel)
    trainingError = evaluator.mean_squared_error(df_train_featuregen, train_df_targets)
    print("Mean squared error on training data: %f" % trainingError)
    print("Mean squared error on test data: %f" % evaluator.mean_squared_error(df_test_featuregen, test_df_targets))

    # ---- Histograms -------------------------------------------------------
    print("\n******************************** Showing histogram of attributes********************************")
    Histogrammer.plot_histgram_of_features(train_df, 3, 5)
    print("\nClose window to terminate")
    plt.show()
    return
# Script entry: configure, build the data iterators and the model, augment the
# data, train, then evaluate on the test iterator.  The statements run in this
# order at module level.

# The only command-line option is the path to the configuration file.
parser = argparse.ArgumentParser()
parser.add_argument("--config_path", default='../config.ini', required=False)
args = parser.parse_args()

cfg = OCTConfig(args.config_path)
oct_logger = OCTLogger(cfg, RUN_TIMESTAMP)
oct_logger.print_cfg()

# Data iterators for the three splits, all resolved from the configuration.
generator_resolver = GeneratorResolver(cfg)
training_data_iterator, test_data_iterator, val_data_iterator = generator_resolver.resolve_data_iterators(
)

model_resolver = ModelResolver(cfg)
model = model_resolver.resolve_model()

# NOTE(review): augmentation runs after the iterators were created;
# presumably it acts on the data they read - confirm against
# AugmentationProcessor.
augmented_image_data_generator = generator_resolver.provide_augmented_image_data_generator(
)
augmentation_processor = AugmentationProcessor(
    cfg, augmented_image_data_generator)
augmentation_processor.perform_data_augmentation()

# Train with the training and validation iterators.
model_trainer = ModelTrainer(cfg, model, training_data_iterator, val_data_iterator, RUN_TIMESTAMP)
model_trainer.train_model()

# Evaluate the same model object on the test iterator.
model_evaluator = ModelEvaluator(cfg, model, test_data_iterator)
model_evaluator.evaluate_model()
def main():
    """Run the bias/variance experiment, then the linear and kernel SVM experiments.

    Part 1 fits six hypothesis classes (g1..g6) to 100 random datasets of
    size 10 and of size 100 and plots histograms of the per-dataset mean
    squared errors.  Part 2 loads the phishing data, one-hot encodes the
    categorical columns and runs 3-fold cross-validated libsvm training for
    linear, polynomial and RBF kernels, then trains and tests the chosen RBF
    model.
    """
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG if DEBUG else logging.INFO)
    # time.clock() was removed in Python 3.8; prefer perf_counter where it
    # exists and fall back to clock on interpreters that lack it.
    timer = getattr(time, "perf_counter", None) or time.clock

    # Feature maps of the hypothesis classes.  g1 (the constant 1) and g2 (a
    # fitted constant) are handled explicitly below; g3..g6 are polynomial
    # regressions of degree 1..4.
    g_x_list = [
        lambda xi: [1],                                  # g1
        lambda xi: [1],                                  # g2
        lambda xi: [1, xi],                              # g3
        lambda xi: [1, xi, xi*xi],                       # g4
        lambda xi: [1, xi, xi*xi, xi*xi*xi],             # g5
        lambda xi: [1, xi, xi*xi, xi*xi*xi, xi*xi*xi*xi],  # g6
    ]

    pretty_print_header("Bias Variance Trade-off")
    pretty_print_header("Generating datasets")
    datasets_10 = []
    datasets_100 = []
    for _ in range(0, 100):
        datasets_10.append(generate_fx_dataset(10))
        datasets_100.append(generate_fx_dataset(100))

    pretty_print_header("(a) and (b)")
    for datasets, title in ((datasets_10, "Part (a)"), (datasets_100, "Part (b)")):
        # avg_ws collects the average weights per hypothesis class; it is
        # gathered for inspection and not printed.
        avg_ws = []
        mses_g = []

        # g_1: always predicts 1.
        mses_g.append([np.mean([(x[1] - 1)**2 for x in dataset]) for dataset in datasets])
        avg_ws.append([1])

        # g_2: the best constant for a dataset is the mean of ITS targets
        # (the original divided by the number of datasets instead).
        mses_g2 = []
        avg_w2 = 0
        for dataset in datasets:
            w_0 = np.mean([x[1] for x in dataset])
            mses_g2.append(np.mean([(x[1] - w_0)**2 for x in dataset]))
            avg_w2 += w_0
        mses_g.append(mses_g2)
        avg_w2 /= len(datasets)
        avg_ws.append([avg_w2])

        # g3-6: unregularized regression on polynomial features.
        for g_x in g_x_list[2:]:
            wmean = [0]*len(g_x(0))
            regmodel = KernelizedLinearRegression(g_x, 0)
            evaluator = ModelEvaluator(regmodel)
            mses = []
            for dataset in datasets:
                xs = [x[0] for x in dataset]
                ys = [x[1] for x in dataset]
                regmodel.train(xs, ys)
                mses.append(evaluator.mean_squared_error(xs, ys))
                wmean = np.add(wmean, regmodel.w)
            mses_g.append(mses)
            avg_ws.append(np.asarray(wmean, dtype=float) / len(datasets))

        plot_histgram(pd.DataFrame(mses_g), 2, 3, "g", title)
        plt.show()

    pretty_print_header("Linear and Kernel SVM")
    train_data = loadmat('data/phishing-train.mat')
    test_data = loadmat('data/phishing-test.mat')
    train_labels = train_data['label']
    test_labels = test_data['label']
    train_df_features = pd.DataFrame(train_data['features'])
    test_df_features = pd.DataFrame(test_data['features'])

    # One-hot encode the categorical columns; the test frame is aligned to
    # the dummy columns produced from the training frame.
    categoricals_feature_columns = [1, 6, 7, 13, 14, 15, 25, 28]
    other_feature_columns = sorted(list(set(range(0, len(train_df_features.columns))) - set(categoricals_feature_columns)))
    df_cat_train = pd.get_dummies(train_df_features[categoricals_feature_columns].applymap(str))
    train_df_features = train_df_features[other_feature_columns].join(df_cat_train.applymap(float))
    df_cat_test = pd.get_dummies(test_df_features[categoricals_feature_columns].applymap(str))
    test_df_features = test_df_features[other_feature_columns].join(df_cat_test[df_cat_train.columns])

    # Inputs in the list form passed to svm_train / svm_predict, built once.
    y_train = train_labels.tolist()[0]
    x_train = train_df_features.values.tolist()

    # Linear kernel, 3-fold cross validation over C = 4^-6 .. 4^2.
    start = timer()
    crange = range(-6, 3)
    for c in crange:
        print("Evaluating svm for c=4^%d" % c)
        svm_train(y_train, x_train, '-c %f -v 3' % math.pow(4, c))
    print("Average training time=%fs" % ((timer() - start)/len(crange)))

    # Polynomial kernel over C = 4^-3 .. 4^7 and degree 1..3.
    crange = range(-3, 8)
    degrees = (1, 2, 3)
    start = timer()
    for c in crange:
        for degree in degrees:
            print("Evaluating svm for c=4^%d" % c)
            print("Degree = %d" % degree)
            svm_train(y_train, x_train, '-t %d -c %f -v 3 -d %d' % (1, math.pow(4, c), degree))
    print("Average training time=%fs" % ((timer() - start)/(len(crange)*len(degrees))))

    # RBF kernel over the same C range and gamma = 4^-7 .. 4^-2.
    gammedegrees = range(-7, -1)
    start = timer()
    for c in crange:
        for degree in gammedegrees:
            print("Evaluating svm for c=4^%d" % c)
            print("Gamme = %f" % math.pow(4, degree))
            svm_train(y_train, x_train, '-t %d -c %f -v 3 -g %f' % (2, math.pow(4, c), math.pow(4, degree)))
    # Average over the number of gamma values actually tried.
    print("Average training time=%fs" % ((timer() - start)/(len(crange)*len(gammedegrees))))

    # Train the selected RBF model on all training data and score the test set.
    m = svm_train(y_train, x_train, '-t %d -c %f -g %f' % (2, math.pow(4, 2), math.pow(4, -2)))
    p_labs, p_acc, p_vals = svm_predict(test_labels.tolist()[0], test_df_features.values.tolist(), m)
    print(p_acc)
    print("Best performing model: rbf-kernel svm with c=4^2, gamma = 4^-2")
from evaluator import ZeroEvaluator from match import Match from mcts import MonteCarloTS from model_evaluator import ModelEvaluator from strategy_factory import StrategyFactory if __name__ == "__main__": modelFactory = StrategyFactory( MonteCarloTS, playerOneEvaluator=ModelEvaluator(1, "../models/6x6eval.tf"), playerTwoEvaluator=ModelEvaluator(-1, "../models/deepeval.tf"), maxSeconds=3, alpha=1) modelFactoryTwo = StrategyFactory( MonteCarloTS, playerOneEvaluator=ModelEvaluator(2, "../models/6x6eval.tf"), playerTwoEvaluator=ModelEvaluator(-1, "../models/deepeval.tf"), maxSeconds=3, alpha=1) won, lost, drawn = 0, 0, 0 for _ in range(100): for _ in range(100): match = Match(modelFactory, modelFactoryTwo) match.play() one, two = match.playerOne.state.score() if one > two: won += 1 elif two > one: lost += 1 else:
), Dense( second_layer, activation='relu', kernel_regularizer=regularizers.l2(0.01) ), Dense(1, activation='sigmoid') ] )) # Create model evaluator. model_evaluator = ModelEvaluator( dataset_extractor=dataset_extractor, seed_list=dataset_split_seeds, models=models, num_validation_runs=20, num_test_runs=100, percent_of_models_tested=0.2, evaluation_number=30 ) # Start timing model evaluation, if requested. if(args.show_evaluation_time): start_time = time.time() # Evaluate models. model_evaluator.evaluate_models() model_evaluator.save_results_as_csv() model_evaluator.print_results() # Print model evaluation time, if requested.