def main_eval():
    print("load specified model")
    model = load_model(args.model, custom_objects=evaluation.get_metrics())
    print("load evaluation image")
    img = dataset.load_image_eval(args.data)
    print("run evaluation on final year")
    y_pred = evaluation.predict_image(model, img, args.area_size)
    visualize.save_image_as(y_pred, "res/out.png")
def main_train_h5():
    print("check for data.h5")
    try:
        open(args.h5data, "r")
    except FileNotFoundError:
        h5dataset.make_dataset(args.h5data)
    print("load remaining data")
    sat_images = dataset.load_sat_images(args.data)
    alt, slp = dataset.load_static_data(args.data)
    print("initialize training generator")
    train_gen = h5dataset.patch_generator_from_h5(args.h5data, sat_images, alt, slp,
                                                  size=args.area_size,
                                                  batch_size=args.batch_size,
                                                  p=args.p_train)
    print("initialize validation generator")
    val_gen = h5dataset.patch_generator_from_h5(args.h5data, sat_images, alt, slp,
                                                size=args.area_size,
                                                batch_size=args.batch_size,
                                                p=args.p_val)
    print("get network")
    model = networks.get_model_by_name(args.model_type)(args)
    print("compile")
    custom_metrics = list(evaluation.get_metrics().values())
    model.compile(optimizer="adam", loss="binary_crossentropy",
                  metrics=["accuracy"] + custom_metrics)
    print(model.summary())
    print("start training")
    model.fit_generator(train_gen,
                        steps_per_epoch=args.steps_per_epoch,
                        epochs=args.epochs,
                        validation_data=val_gen,
                        validation_steps=args.steps_per_val,
                        verbose=True,
                        max_q_size=args.queue_size,
                        workers=1)
    print("store model")
    model.save(args.model)
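# Side note, not part of the original script: fit_generator and max_q_size belong
# to the older Keras API; in TensorFlow 2.x, model.fit() accepts generators
# directly and the argument is max_queue_size. The toy model and generator below
# are illustrative assumptions only, a minimal self-contained sketch of the
# equivalent call.
import numpy as np
import tensorflow as tf

def toy_patch_generator(batch_size=8, size=16):
    """Yield random (batch, H, W, C) patches with binary labels, forever."""
    while True:
        x = np.random.rand(batch_size, size, size, 3).astype("float32")
        y = np.random.randint(0, 2, size=(batch_size, 1)).astype("float32")
        yield x, y

toy_model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(16, 16, 3)),
    tf.keras.layers.Dense(1, activation="sigmoid"),
])
toy_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# model.fit replaces fit_generator; max_q_size is now max_queue_size.
toy_model.fit(toy_patch_generator(),
              steps_per_epoch=10,
              epochs=1,
              validation_data=toy_patch_generator(),
              validation_steps=2,
              max_queue_size=10,
              workers=1)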
def new_model(data1, data2):
    """Train and test a tree on the noisy dataset without the observations found above.

    Results in a 97% accuracy.
    """
    diff_obs = check_signals_only(data1, data2)
    df2 = pd.DataFrame(data2)
    # Appending diff_obs twice and dropping all duplicates removes those rows from df2.
    clean_removed = pd.concat([df2, diff_obs, diff_obs]).drop_duplicates(keep=False)
    clean_removed_dataset = clean_removed.to_numpy()
    np.random.shuffle(clean_removed_dataset)
    split = 0.7
    train = clean_removed_dataset[:int(len(clean_removed_dataset) * split)]
    test = clean_removed_dataset[int(len(clean_removed_dataset) * split):]
    model = trees.binarySearchTree(train)
    print('Max depth is', model.get_max_depth())
    y_pred = model.predict(test[:, :-1])
    cm = ev.confusion_matrix(test[:, -1], y_pred)
    i = ev.get_metrics(cm, printout=True)
    ev.plot_conf_matrix(cm)
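# Illustrative sketch only: ev.confusion_matrix and ev.get_metrics are project
# helpers, so the formulas below are an assumption about the kind of metrics a
# confusion-matrix-based get_metrics(cm) typically derives, not the project's
# actual implementation.
import numpy as np
from sklearn.metrics import confusion_matrix

y_true = np.array([1, 0, 1, 1, 0, 0, 1, 0])
y_pred = np.array([1, 0, 1, 0, 0, 1, 1, 0])

tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
accuracy = (tp + tn) / (tp + tn + fp + fn)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * precision * recall / (precision + recall)
print({'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1})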
def train_eval_all_folds(self, x_val, y_val):
    """Trains and evaluates models over all folds.

    Args:
        x_val [ndarray]: feature matrix
        y_val [ndarray]: label vector

    Returns:
        auc [float]: area under the ROC curve
        mean_fpr [list of floats]: false positive rate averaged over all kfolds
        mean_tpr [list of floats]: true positive rate averaged over all kfolds
        trained_model [sklearn object]: trained object to be pickled
            so that it can be used for scoring
    """
    if self.world_type == "closed":
        # Why we use stratified k-fold here:
        # http://stats.stackexchange.com/questions/49540/understanding-stratified-cross-validation
        cv = cross_validation.StratifiedKFold(y_val, n_folds=self.k, shuffle=True)
    elif self.world_type == "open":
        pass  # TODO

    fpr_arr, tpr_arr, metrics_all_folds = [], [], []

    for i, (train, test) in enumerate(cv):
        fold_timestamp = datetime.datetime.now().isoformat()
        y_train, y_test = y_val[train], y_val[test]

        if self.feature_scaling:
            scaler = preprocessing.StandardScaler().fit(x_val[train])
            x_train = scaler.transform(x_val[train])
            x_test = scaler.transform(x_val[test])
        else:
            x_train, x_test = x_val[train], x_val[test]

        trained_model = self.train_single_fold(x_train, y_train)
        pred_probs = self.score(x_test, trained_model)

        filename_kfold = '{}_{}_undefended_frontpage_{}_model_{}_fold_{}_world.pkl'.format(
            fold_timestamp, self.model_timestamp, self.model_type, i, self.world_type)
        fold_to_save = {'trained_object': trained_model,
                        'y_true': y_test,
                        'y_predicted': pred_probs}
        self.pickle_results(filename_kfold, fold_to_save)

        # Metrics computation: compute ROC curve and area under the ROC curve
        eval_metrics = evaluation.get_metrics(y_test, pred_probs)
        metrics_all_folds.append(eval_metrics)
        fpr_arr.append(eval_metrics['fpr'])
        tpr_arr.append(eval_metrics['tpr'])

        # Save results of metrics in database
        self.db.save_fold_of_model(eval_metrics, self.model_timestamp, fold_timestamp)

    auc = evaluation.plot_allkfolds_ROC(self.model_timestamp, cv, fpr_arr, tpr_arr)
    print("Classifier {} trained! AUC: {}".format(self.model_timestamp, auc))

    avg_metrics = evaluation.get_average_metrics(metrics_all_folds)

    # Save results of experiment (model evaluation averaged over all
    # folds) into the database
    self.db.save_full_model(avg_metrics, self.model_timestamp, self.__dict__)
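# Side note, not from the original class: sklearn.cross_validation was removed
# in scikit-learn 0.20. The equivalent splitter now lives in
# sklearn.model_selection, takes n_splits instead of n_folds, and you iterate
# over .split(X, y) rather than the object itself. Minimal, self-contained sketch:
import numpy as np
from sklearn.model_selection import StratifiedKFold

X = np.random.rand(20, 3)
y = np.array([0, 1] * 10)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for fold, (train_idx, test_idx) in enumerate(cv.split(X, y)):
    print(fold, len(train_idx), len(test_idx))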
print("Loading data...") X, Y = load_dataset() print("Training model") t0 = time() transformer = TfidfVectorizer(ngram_range=(1, 2), max_df=0.5) X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42) X_train = transformer.fit_transform(X_train) X_test = transformer.transform(X_test) model = MultinomialNB() clf = model.fit(X_train, y_train) train_time = time() - t0 print("Finished") print("\t- train time: %0.3fs" % train_time) t0 = time() y_pred = clf.predict(X_test) print(classification_report(y_test, y_pred)) test_time = time() - t0 print("\t- test time: %0.3fs" % test_time) get_metrics(y_test, y_pred) save_model("model/model.pkl", clf)
if limit == '':
    print('No limit entered')
    limit = None
else:
    limit = int(limit)

np.random.shuffle(data)
train = data[:int(len(data) * split)]
test = data[int(len(data) * split):]

model = binarySearchTree(train, limit=limit)
print('Max depth of tree is', model.get_max_depth())
y_pred = model.predict(test[:, :-1])
cm = ev.confusion_matrix(test[:, -1], y_pred)
i = ev.get_metrics(cm, printout=True)
print('To continue, you may need to close the plot windows first')
ev.plot_conf_matrix(cm)
print('Visualising the pruned trees')
model.visualise_tree()
input('\nTo restart, hit enter\n')

if model == '2':
    # re-prompt until a valid split in [0, 1] is entered
    while True:
        split = float(input('Enter training data split value, eg 0.7\n'))
        if split < 0 or split > 1:
            print('Invalid split entered')
        else:
            break
    print('You have entered ' + str(split) + '\n')
def ensemble_learning(directory_name, data, X, y, baseline=-1, model_num=None,
                      resample=0, feature_set=None, feature_importance=0,
                      average_method='macro', path=None):
    """Compute the results for the given configuration and store them in files.

    Arguments:
        directory_name (str): the directory under which the files should be stored
        data (dataframe): the whole dataset
        X (dataframe): examples
        y (dataframe): target/label
        baseline (int): -1 for no baseline, 1 for all predictions as 1,
            0 for all predictions as 0
        model_num (int): classification model (1-6)
        resample (int): -1 for undersampling, 1 for oversampling and 0 for no resampling
        feature_set (list): list of features to be considered
        feature_importance (int): 0 for absent, 1 for present
        average_method: macro by default
        path: the path to the directory where the recordings should be stored
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                        random_state=42)

    # prepare the dictionaries to be written to the files
    data_dict = dict()
    metrics_dict = dict()

    dir_name = path + directory_name + '/'
    os.mkdir(dir_name)

    # open the config file for writing
    config_file = open(dir_name + 'config.json', 'w')
    # open the metrics file for writing
    metrics_file = open(dir_name + 'metrics.json', 'w')

    data_dict.update({'model_num': model_num})
    data_dict.update({'baseline': baseline})
    data_dict.update({'resample': resample})
    data_dict.update({'feature_set': feature_set})
    data_dict.update({'n_features': n_features})  # n_features is assumed to be defined in the enclosing scope
    data_dict.update({'feature_importance': feature_importance})

    '''
    #create test set labels for the baseline if applicable
    if baseline == 0:
        y_test = y_test.replace(1,0)
    elif baseline == 1:
        y_test = y_test.replace(0,1)
    '''

    # resample the training set (if applicable)
    if resample == -1:
        # undersample
        '''NearMiss-3 is a 2-step algorithm: first, for each minority sample,
        their m nearest neighbors will be kept; then, the majority samples
        selected are the ones for which the average distance to the k nearest
        neighbors is the largest.'''
        nm = NearMiss(version=3)
        print(sorted(Counter(y_train).items()))
        X_resampled, y_resampled = nm.fit_resample(X_train, y_train)
        X_train = X_resampled
        y_train = y_resampled
        print(str(sorted(Counter(y_train).items())))
    elif resample == 1:
        # oversample
        X_resampled, y_resampled = SMOTE().fit_resample(X_train, y_train)
        X_train = X_resampled
        y_train = y_resampled
        print(sorted(Counter(y_resampled).items()))

    # write the training dataset class distribution to the file
    file = open(dir_name + 'train_val_dist.csv', 'a')
    file.write(str(sorted(Counter(y_train).items())))
    file.write('\n')
    file.close()

    model = get_model(model_num)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    if baseline == 0:
        y_pred = y_pred.replace(1, 0)
    elif baseline == 1:
        y_pred = y_pred.replace(0, 1)

    plot_lc(model=model,
            cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=777),
            X=X, y=y)

    # evaluation
    metrics = get_metrics(y_test, y_pred)
    for key, value in metrics.items():
        metrics_dict[key] = value

    # correlation
    correlation(data)
    # linearity
    test_for_linearity(X_train, y_train)
    # homoscedasticity
    test_for_homoscedasticity(X_train, y_train, X_test, y_test)

    '''
    #learning curve
    #if model_num == 7:
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    #else:
    #cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=777)
    train_sizes, train_scores, test_scores = learning_curve(
        estimator=model, X=data[feature_set], y=data['label'], cv=cv,
        scoring='f1_macro', train_sizes=np.linspace(.1, 1.0, 10))

    # Create means and standard deviations of training set scores
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    print('scores: ', train_scores, train_mean)

    # Create means and standard deviations of test set scores
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    # Draw lines
    print('Learning Curve')
    plt.plot(train_sizes, train_mean, '--', color="#111111", label="Training score")
    plt.plot(train_sizes, test_mean, color="#111111", label="Cross-validation score")

    # Draw bands
    plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, color="#DDDDDD")
    plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, color="#DDDDDD")

    # Create plot
    plt.title("Learning Curve")
    plt.xlabel("Training Set Size"), plt.ylabel("Macro-F1 Score"), plt.legend(loc="best")
    plt.tight_layout()
    plt.show()
    '''

    plot_learning_curves(X_train, y_train, X_test, y_test, model, scoring='f1_macro')
    plt.show()

    if feature_importance == 1:
        feat_importances = pd.Series(model.feature_importances_, index=feature_set)
        print(feature_set)
        print('Feat: ', feat_importances)
        feat_importances.nlargest(20).plot(kind='barh')
        #plot_importance(model)
        plt.show()

        perm = PermutationImportance(model, random_state=1).fit(X_train, y_train)
        display(eli5.show_weights(perm, feature_names=X_train.columns.tolist()))

        # write the feature importance values to the file
        file = open(dir_name + 'feature_importances.csv', 'a')
        for ind in range(0, len(feature_set)):
            file.write(feature_set[ind] + ',' + str(feat_importances[ind]) + '\n')
        file.close()

        # write the permutation feature importance decrease in error values to the file
        file = open(dir_name + 'permutation_feature_importances.csv', 'a')
        print(perm.feature_importances_)
        for ind in range(0, len(feature_set)):
            file.write(feature_set[ind] + ',' + str(perm.feature_importances_[ind]) + '\n')
        file.close()

    # write the scores to the file
    json.dump(metrics_dict, metrics_file)
    metrics_file.close()

    # write the configuration values to the file
    json.dump(data_dict, config_file)
    config_file.close()
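# Minimal, self-contained sketch of the resampling step above using
# imbalanced-learn's NearMiss-3 (undersampling) and SMOTE (oversampling) on
# synthetic data; the printed class distributions mirror the Counter() prints
# in ensemble_learning. Assumes imbalanced-learn and scikit-learn are installed.
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import NearMiss
from imblearn.over_sampling import SMOTE

X_toy, y_toy = make_classification(n_samples=500, weights=[0.9, 0.1], random_state=0)
print('original:', sorted(Counter(y_toy).items()))

X_under, y_under = NearMiss(version=3).fit_resample(X_toy, y_toy)
print('NearMiss-3:', sorted(Counter(y_under).items()))

X_over, y_over = SMOTE(random_state=0).fit_resample(X_toy, y_toy)
print('SMOTE:', sorted(Counter(y_over).items()))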
""" run eval on a pair of corpus files """ import sys sys.path.append('../../src/style_transfer_baseline') import evaluation import models src_path = sys.argv[1] pred_path = sys.argv[2] tgt_path = sys.argv[3] classifier_path = "../../data/v2/eval_classifier" eval_classifier = models.TextClassifier.from_pickle( "../../data/v2/eval_classifier") src = [x.strip().split() for x in open(src_path)] pred = [x.strip().split() for x in open(pred_path)] tgt = [x.strip().split() for x in open(tgt_path)] print(evaluation.get_metrics(src, pred, tgt, classifier=eval_classifier))
with open('log.txt', 'w') as file:
    file.write('k, knn_frac, min_overlap, map_k, cosine\n')

for i in range(len(k_vals)):
    for j in range(len(knn_frac_vals)):
        for k in range(len(min_overlap_vals)):
            print(song_df.shape)
            tuning_model = ALSpkNN(user_df, song_df,
                                   k_vals[i],
                                   knn_frac_vals[j],
                                   min_overlap_vals[k],
                                   cf_weighting_alpha=1)
            print("Fitting model...")
            tuning_model.fit(train_plays)
            metrics = get_metrics(
                metrics=['MAP@K', 'mean_cosine_list_dissimilarity'],
                N=20,
                model=tuning_model,
                train_user_items=train_plays.transpose(),
                test_user_items=test_plays.transpose(),
                song_df=song_df,
                limit=10)
            mapk = metrics['MAP@K']
            cosdis = metrics['cosine_list_dissimilarity']
            with open('log.txt', 'a') as file:
                file.write(
                    f'{k_vals[i]},{knn_frac_vals[j]},{min_overlap_vals[k]},{mapk},{cosdis}\n')
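# Side note, not from the original script: the triple index-based loop above is
# equivalent to a single loop over itertools.product, which keeps the parameter
# grid and the logging line in one place. The grid values below are hypothetical,
# for illustration only.
import itertools

k_vals_demo = [10, 20]
knn_frac_vals_demo = [0.25, 0.5]
min_overlap_vals_demo = [2, 5]

for k, knn_frac, min_overlap in itertools.product(
        k_vals_demo, knn_frac_vals_demo, min_overlap_vals_demo):
    # build, fit and evaluate the model for this combination here
    print(f'k={k}, knn_frac={knn_frac}, min_overlap={min_overlap}')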
def crossvalidate(directory_name, splits, data, X, y, baseline=-1, model_num=None,
                  resample=0, feature_set=None, feature_importance=0,
                  average_method='macro', path=None):
    """Compute cross-validated results according to the arguments and store them in files.

    Arguments:
        directory_name (str): the directory under which the files should be stored
        splits (int): number of folds
        data (dataframe): the whole dataset
        X (dataframe): examples
        y (dataframe): target/label
        baseline (int): -1 for no baseline, 1 for all predictions as 1,
            0 for all predictions as 0
        model_num (int): classification model (1-6)
        resample (int): -1 for undersampling, 1 for oversampling and 0 for no resampling
        feature_set (list): list of features to be considered
        feature_importance (int): 0 for absent, 1 for present
        average_method: macro by default
        path: the path to the directory where the recordings should be stored
    """
    # prepare the dictionaries to be written to the files
    data_dict = dict()
    metrics_dict = dict()

    dir_name = path + directory_name + '/'
    os.mkdir(dir_name)

    # create a directory for each split
    for fold in range(1, splits + 1):
        os.mkdir(dir_name + str(fold))
        print(dir_name + str(fold))

    # open the config file for writing
    config_file = open(dir_name + 'config.json', 'w')
    # open the metrics file for writing
    metrics_file = open(dir_name + 'metrics.json', 'w')

    data_dict.update({'model_num': model_num})
    data_dict.update({'baseline': baseline})
    data_dict.update({'resample': resample})
    data_dict.update({'feature_set': feature_set})
    data_dict.update({'n_features': n_features})  # n_features is assumed to be defined in the enclosing scope
    data_dict.update({'feature_importance': feature_importance})

    # one list of per-fold values for each metric
    metrics_dict = {m: list() for m in
                    ('f1_macro', 'tpr', 'tnr', 'fpr',
                     'precision', 'recall', 'accuracy', 'f1')}

    model = get_model(model_num)
    kfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=777)
    #if model_num == 3:
    #    kfold = ShuffleSplit(n_splits=splits, test_size=0.2, random_state=0)

    plot_lc(model=model, cv=kfold, X=X, y=y, resample=resample)

    # linearity
    test_for_linearity(X, y)

    i = 0
    for train_index, test_index in kfold.split(X, y):
        # create train-test splits
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]

        '''
        #create test set labels for the baseline if applicable
        if baseline == 0:
            y_test = y_test.replace(1,0)
        elif baseline == 1:
            y_test = y_test.replace(0,1)
        '''

        # resample the training set (if applicable)
        if resample == -1:
            # undersample
            '''NearMiss-3 is a 2-step algorithm: first, for each minority sample,
            their m nearest neighbors will be kept; then, the majority samples
            selected are the ones for which the average distance to the k nearest
            neighbors is the largest.'''
            nm = NearMiss(version=3)
            print(str(sorted(Counter(y_train).items())))
            X_resampled, y_resampled = nm.fit_resample(X_train, y_train)
            X_train = X_resampled
            y_train = y_resampled
            print(sorted(Counter(y_train).items()))
        elif resample == 1:
            # oversample
            X_resampled, y_resampled = SMOTE().fit_resample(X_train, y_train)
            X_train = X_resampled
            y_train = y_resampled
            print(sorted(Counter(y_resampled).items()))

        # write the training dataset class distribution to the file
        file = open(dir_name + str(i + 1) + '/train_val_dist.csv', 'a')
        file.write(str(sorted(Counter(y_train).items())))
        file.write('\n')
        file.close()

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        if baseline == 0:
            y_pred = y_pred.replace(1, 0)
        elif baseline == 1:
            y_pred = y_pred.replace(0, 1)

        metrics = get_metrics(y_test, y_pred)
        for key, value in metrics.items():
            metrics_dict[key].append(value)

        # homoscedasticity
        test_for_homoscedasticity(X_train, y_train, X_test, y_test)
        # correlation
        correlation(data)

        if feature_importance == 1:
            if model_num == 1:
                feat_importances = pd.Series(model.feature_importances_, index=X.columns)
            elif model_num == 3:
                # linear SVM: use the absolute coefficients of the fitted model
                feat_importances = pd.Series(abs(model.coef_[0]), index=X.columns)
            if model_num != 2:
                print('Feat. Imp.: ', feat_importances)
                feat_importances.nlargest(20).plot(kind='barh')
                #plot_importance(model)
                plt.show()

                # write the feature importance values to the file
                file = open(dir_name + str(i + 1) + '/feature_importances.csv', 'a')
                for ind in range(0, len(feature_set)):
                    file.write(feature_set[ind] + ',' + str(feat_importances[ind]) + '\n')
                file.close()

            perm = PermutationImportance(model, random_state=1).fit(X_train, y_train)
            print('PERM: ', perm.feature_importances_)
            display(eli5.show_weights(perm, feature_names=X_train.columns.tolist()))

            # write the permutation feature importance decrease in error values to the file
            file = open(dir_name + str(i + 1) + '/permutation_feature_importances.csv', 'a')
            for ind in range(0, len(feature_set)):
                file.write(feature_set[ind] + ',' + str(perm.feature_importances_[ind]) + '\n')
            file.write('\n')
            file.close()

        i += 1

    # average each metric over the folds
    for key, values in metrics_dict.items():
        metrics_dict[key] = sum(values) / len(values)

    # write the scores to the file
    json.dump(metrics_dict, metrics_file)
    metrics_file.close()

    # write the configuration values to the file
    json.dump(data_dict, config_file)
    config_file.close()
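# Side note, not from the original function: scikit-learn ships its own
# permutation importance (sklearn.inspection.permutation_importance), which
# measures the same "drop in score when a feature is shuffled" quantity that
# eli5's PermutationImportance reports above. Minimal, self-contained sketch on
# synthetic data:
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

X_demo, y_demo = make_classification(n_samples=300, n_features=6,
                                     n_informative=3, random_state=0)
clf_demo = RandomForestClassifier(random_state=0).fit(X_demo, y_demo)

result = permutation_importance(clf_demo, X_demo, y_demo,
                                scoring='f1_macro', n_repeats=10, random_state=1)
for idx in result.importances_mean.argsort()[::-1]:
    print(f'feature_{idx}: {result.importances_mean[idx]:.3f} '
          f'+/- {result.importances_std[idx]:.3f}')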