Example no. 1
0
def main_eval():
    print("load specified model")
    model = load_model(args.model, custom_objects=evaluation.get_metrics())
    print("load evaluation image")
    img = dataset.load_image_eval(args.data)
    print("run evaluation on final year")
    y_pred = evaluation.predict_image(model, img, args.area_size)
    visualize.save_image_as(y_pred, "res/out.png")
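A note on the custom_objects argument above: Keras can only rebuild metrics it knows by name, so the project-specific metrics returned by evaluation.get_metrics() must be handed back in when the saved model is loaded. A minimal sketch of that mechanism, assuming the tf.keras API; the metric name mean_pred and the file path are illustrative, not from the original project.

import tensorflow as tf

def mean_pred(y_true, y_pred):
    # illustrative custom metric, not part of the original project
    return tf.reduce_mean(y_pred)

inputs = tf.keras.Input(shape=(4,))
outputs = tf.keras.layers.Dense(1, activation="sigmoid")(inputs)
model = tf.keras.Model(inputs, outputs)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=[mean_pred])
model.save("custom_metric_demo.h5")

# Loading fails with an unknown-metric error unless the function is supplied here.
restored = tf.keras.models.load_model("custom_metric_demo.h5",
                                      custom_objects={"mean_pred": mean_pred})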
Example no. 2
0
def main_train_h5():
    print("check for data.h5")
    try:
        with open(args.h5data, "r"):
            pass
    except FileNotFoundError:
        h5dataset.make_dataset(args.h5data)
    print("load remaining data")
    sat_images = dataset.load_sat_images(args.data)
    alt, slp = dataset.load_static_data(args.data)
    print("initialize training generator")
    train_gen = h5dataset.patch_generator_from_h5(args.h5data,
                                                  sat_images,
                                                  alt,
                                                  slp,
                                                  size=args.area_size,
                                                  batch_size=args.batch_size,
                                                  p=args.p_train)
    print("initialize validation generator")
    val_gen = h5dataset.patch_generator_from_h5(args.h5data,
                                                sat_images,
                                                alt,
                                                slp,
                                                size=args.area_size,
                                                batch_size=args.batch_size,
                                                p=args.p_val)
    print("get network")
    model = networks.get_model_by_name(args.model_type)(args)
    print("compile")
    custom_metrics = list(evaluation.get_metrics().values())
    model.compile(optimizer="adam",
                  loss="binary_crossentropy",
                  metrics=["accuracy"] + custom_metrics)
    model.summary()
    print("start training")
    model.fit_generator(train_gen,
                        steps_per_epoch=args.steps_per_epoch,
                        epochs=args.epochs,
                        validation_data=val_gen,
                        validation_steps=args.steps_per_val,
                        verbose=True,
                        max_q_size=args.queue_size,
                        workers=1)
    print("store model")
    model.save(args.model)
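fit_generator and its max_q_size argument come from older Keras releases; in later Keras/TensorFlow versions generators are passed straight to fit and the argument is named max_queue_size. A hedged sketch of what the training call above would look like there, keeping the names from the function (model, train_gen, val_gen, args):

# Equivalent call on a newer tf.keras: fit accepts generators directly and
# max_q_size has been renamed to max_queue_size.
model.fit(train_gen,
          steps_per_epoch=args.steps_per_epoch,
          epochs=args.epochs,
          validation_data=val_gen,
          validation_steps=args.steps_per_val,
          verbose=1,
          max_queue_size=args.queue_size,
          workers=1)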
Example no. 3
0
def new_model(data1, data2):
    """
    Train and test a tree on the noisy dataset with those observations removed.
    Results in roughly 97% accuracy.
    """
    diff_obs = check_signals_only(data1, data2)
    df2 = pd.DataFrame(data2)
    clean_removed = pd.concat([df2, diff_obs,
                               diff_obs]).drop_duplicates(keep=False)
    clean_removed_dataset = clean_removed.to_numpy()

    np.random.shuffle(clean_removed_dataset)
    split = 0.7
    train = clean_removed_dataset[:int(len(clean_removed_dataset) * split)]
    test = clean_removed_dataset[int(len(clean_removed_dataset) * split):]

    model = trees.binarySearchTree(train)
    print('Max depth is', model.get_max_depth())
    y_pred = model.predict(test[:, :-1])
    cm = ev.confusion_matrix(test[:, -1], y_pred)
    i = ev.get_metrics(cm, printout=True)
    ev.plot_conf_matrix(cm)
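The pd.concat(...).drop_duplicates(keep=False) call in new_model is a common pandas idiom for subtracting one frame's rows from another: the rows to remove are appended twice so that every copy becomes a duplicate, and keep=False then discards all occurrences. A tiny self-contained illustration with made-up data:

import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3, 4]})
to_remove = pd.DataFrame({'a': [2, 4]})

# Appending to_remove twice makes its rows duplicated everywhere, so
# drop_duplicates(keep=False) drops every occurrence and keeps only 1 and 3.
cleaned = pd.concat([df, to_remove, to_remove]).drop_duplicates(keep=False)
print(cleaned)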
Example no. 4
0
    def train_eval_all_folds(self, x_val, y_val):
        """Trains and evaluates models over all folds.

        Args:
            x_val [ndarray]: feature matrix
            y_val [ndarray]: label vector

        Returns:
            None. The trained object and predictions for every fold are
            pickled, per-fold and averaged metrics are written to the
            database, and the ROC curves over all folds are plotted.
        """

        if self.world_type == "closed":
            # Why we use stratified k-fold here:
            # http://stats.stackexchange.com/questions/49540/understanding-stratified-cross-validation
            cv = cross_validation.StratifiedKFold(y_val, n_folds=self.k,
                                                  shuffle=True)
        elif self.world_type == "open":
            pass  # TODO

        fpr_arr, tpr_arr, metrics_all_folds = [], [], []
        for i, (train, test) in enumerate(cv):

            fold_timestamp = datetime.datetime.now().isoformat()
            y_train, y_test = y_val[train], y_val[test]

            if self.feature_scaling:
                scaler = preprocessing.StandardScaler().fit(x_val[train])
                x_train = scaler.transform(x_val[train])
                x_test = scaler.transform(x_val[test])
            else:
                x_train, x_test = x_val[train], x_val[test]

            trained_model = self.train_single_fold(x_train, y_train)
            pred_probs = self.score(x_test, trained_model)

            filename_kfold = '{}_{}_undefended_frontpage_{}_model_{}_fold_{}_world.pkl'.format(
                fold_timestamp, self.model_timestamp, self.model_type, i, self.world_type)
            fold_to_save = {'trained_object': trained_model,
                            'y_true': y_test, 'y_predicted': pred_probs}
            self.pickle_results(filename_kfold, fold_to_save)

            # Metrics computation
            # Compute ROC curve and area under the ROC curve
            eval_metrics = evaluation.get_metrics(y_test, pred_probs)
            metrics_all_folds.append(eval_metrics)
            fpr_arr.append(eval_metrics['fpr'])
            tpr_arr.append(eval_metrics['tpr'])

            # Save results of metrics in database
            self.db.save_fold_of_model(eval_metrics, self.model_timestamp, fold_timestamp)

        auc = evaluation.plot_allkfolds_ROC(self.model_timestamp, cv,
                                            fpr_arr, tpr_arr)

        print("Classifier {} trained! AUC: {}".format(self.model_timestamp,
                                                      auc))

        avg_metrics = evaluation.get_average_metrics(metrics_all_folds)
        # Save results of experiment (model evaluation averaged over all
        # folds) into the database
        self.db.save_full_model(avg_metrics, self.model_timestamp, self.__dict__)
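One detail worth keeping from the fold loop above: the StandardScaler is fit on the training indices only and then applied to both splits, which keeps test-fold statistics from leaking into the model. A tiny self-contained sketch of that pattern on made-up arrays:

import numpy as np
from sklearn.preprocessing import StandardScaler

X_train = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
X_test = np.array([[4.0, 40.0]])

# Fit on the training fold only...
scaler = StandardScaler().fit(X_train)
# ...then reuse the same statistics for both folds.
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)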
Example no. 5
0
    def train_eval_all_folds(self, x_val, y_val):
        """Trains and evaluates models over all folds.

        Args:
            x_val [ndarray]: feature matrix
            y_val [ndarray]: label vector

        Returns:
            None. The trained object and predictions for every fold are
            pickled, per-fold and averaged metrics are written to the
            database, and the ROC curves over all folds are plotted.
        """

        if self.world_type == "closed":
            # Why we use stratified k-fold here:
            # http://stats.stackexchange.com/questions/49540/understanding-stratified-cross-validation
            cv = cross_validation.StratifiedKFold(y_val,
                                                  n_folds=self.k,
                                                  shuffle=True)
        elif self.world_type == "open":
            pass  # TODO

        fpr_arr, tpr_arr, metrics_all_folds = [], [], []
        for i, (train, test) in enumerate(cv):

            fold_timestamp = datetime.datetime.now().isoformat()
            y_train, y_test = y_val[train], y_val[test]

            if self.feature_scaling:
                scaler = preprocessing.StandardScaler().fit(x_val[train])
                x_train = scaler.transform(x_val[train])
                x_test = scaler.transform(x_val[test])
            else:
                x_train, x_test = x_val[train], x_val[test]

            trained_model = self.train_single_fold(x_train, y_train)
            pred_probs = self.score(x_test, trained_model)

            filename_kfold = '{}_{}_undefended_frontpage_{}_model_{}_fold_{}_world.pkl'.format(
                fold_timestamp, self.model_timestamp, self.model_type, i,
                self.world_type)
            fold_to_save = {
                'trained_object': trained_model,
                'y_true': y_test,
                'y_predicted': pred_probs
            }
            self.pickle_results(filename_kfold, fold_to_save)

            # Metrics computation
            # Compute ROC curve and area under the ROC curve
            eval_metrics = evaluation.get_metrics(y_test, pred_probs)
            metrics_all_folds.append(eval_metrics)
            fpr_arr.append(eval_metrics['fpr'])
            tpr_arr.append(eval_metrics['tpr'])

            # Save results of metrics in database
            self.db.save_fold_of_model(eval_metrics, self.model_timestamp,
                                       fold_timestamp)

        auc = evaluation.plot_allkfolds_ROC(self.model_timestamp, cv, fpr_arr,
                                            tpr_arr)

        print("Classifier {} trained! AUC: {}".format(self.model_timestamp,
                                                      auc))

        avg_metrics = evaluation.get_average_metrics(metrics_all_folds)
        # Save results of experiment (model evaluation averaged over all
        # folds) into the database
        self.db.save_full_model(avg_metrics, self.model_timestamp,
                                self.__dict__)
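Both copies of train_eval_all_folds use the pre-0.18 scikit-learn API, in which sklearn.cross_validation.StratifiedKFold takes the label vector and n_folds and is iterated directly. In current scikit-learn the class lives in sklearn.model_selection and yields indices from split(); a minimal sketch of the equivalent loop, assuming a recent scikit-learn:

import numpy as np
from sklearn.model_selection import StratifiedKFold

X = np.random.rand(20, 3)
y = np.array([0, 1] * 10)

# n_splits replaces n_folds; labels go to split(), not the constructor.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for i, (train_idx, test_idx) in enumerate(cv.split(X, y)):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]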
Example no. 6
0
print("Loading data...")
X, Y = load_dataset()

print("Training model")
t0 = time()
transformer = TfidfVectorizer(ngram_range=(1, 2), max_df=0.5)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=42)

X_train = transformer.fit_transform(X_train)
X_test = transformer.transform(X_test)

model = MultinomialNB()

clf = model.fit(X_train, y_train)
train_time = time() - t0
print("Finished")
print("\t- train time: %0.3fs" % train_time)

t0 = time()
y_pred = clf.predict(X_test)
test_time = time() - t0
print(classification_report(y_test, y_pred))
print("\t- test time: %0.3fs" % test_time)

get_metrics(y_test, y_pred)
save_model("model/model.pkl", clf)
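save_model is a project helper; if it only pickles the classifier, the fitted TfidfVectorizer has to be persisted as well, because the same vocabulary and IDF weights are needed at scoring time. One way to keep the two in sync is a scikit-learn Pipeline saved with joblib; a minimal sketch with made-up documents and file name:

import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# Bundle the transformer and the classifier so they are saved and loaded together.
pipeline = make_pipeline(TfidfVectorizer(ngram_range=(1, 2), max_df=0.5),
                         MultinomialNB())
pipeline.fit(["spam spam spam", "ham and eggs"], [1, 0])

joblib.dump(pipeline, "pipeline.pkl")
restored = joblib.load("pipeline.pkl")
print(restored.predict(["more spam"]))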
Example no. 7
0
            if limit == '':
                print('No limit entered')
                limit = None
            else:
                limit = int(limit)

            np.random.shuffle(data)
            train = data[:int(len(data) * split)]
            test = data[int(len(data) * split):]

            model = binarySearchTree(train, limit=limit)
            print('Max depth of tree is', model.get_max_depth())

            y_pred = model.predict(test[:, :-1])
            cm = ev.confusion_matrix(test[:, -1], y_pred)
            i = ev.get_metrics(cm, printout=True)
            print('To continue, you may need to close the plot windows first')
            ev.plot_conf_matrix(cm)
            print('Visualising the pruned trees')
            model.visualise_tree()

            input('\nTo restart, hit enter\n')

        if model == '2':
            while True:
                split = float(input('Enter training data split value, eg 0.7\n'))
                if split < 0 or split > 1:
                    print('Invalid split entered')
                else:
                    break
            print('You have entered ' + str(split) + '\n')
Example no. 8
0
def ensemble_learning(directory_name,
                      data,
                      X,
                      y,
                      baseline=-1,
                      model_num=None,
                      resample=0,
                      feature_set=None,
                      feature_importance=0,
                      average_method='macro',
                      path=None):
    """
    Compute the results according to the arguments and store them in a file.
    Arguments:
    directory_name (str): the directory under which the files should be stored
    data (dataframe): the whole dataset
    X (dataframe): examples
    y (dataframe): target/label
    baseline (int): -1 for no baseline, 1 for all predictions as 1, 0 for all predictions as 0
    model_num (int): classification model
    1: 
    2:
    3:
    4:
    5:
    6:
    resample (int): -1 for undersampling, 1 for oversampling and 0 for no resampling
    feature_set (list): list of features to be considered
    feature_importance (int): 0 for absent, 1 for present
    average_method: macro by default
    path: the path to the directory where the recordings should be stored
    """
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=42)

    #prepare the dictionary to be written to the file
    data_dict = dict()
    metrics_dict = dict()

    dir_name = path + directory_name + '/'
    os.mkdir(dir_name)

    #open the config file for writing
    config_file = open(dir_name + 'config.json', 'w')
    #open the metrics file for writing
    metrics_file = open(dir_name + 'metrics.json', 'w')

    data_dict = {'model_num': model_num}
    data_dict.update({'baseline': baseline})
    data_dict.update({'resample': resample})
    data_dict.update({'feature_set': feature_set})
    # number of features considered, derived from the supplied feature set
    data_dict.update({'n_features': len(feature_set) if feature_set else None})
    data_dict.update({'feature_importance': feature_importance})
    '''
    #create test set labels for the baseline if applicable
    if baseline == 0:
        y_test = y_test.replace(1,0)
    elif baseline == 1:
        y_test = y_test.replace(0,1)
    '''

    #resample the training set (if applicable)
    if resample == -1:
        #undersample
        '''NearMiss-3 is a two-step algorithm: first, for each minority sample,
        its m nearest neighbors are kept; then, the majority samples selected are the
        ones for which the average distance to the k nearest neighbors is the largest.'''
        nm = NearMiss(version=3)
        print(sorted(Counter(y_train).items()))
        X_resampled, y_resampled = nm.fit_resample(X_train, y_train)
        X_train = X_resampled
        y_train = y_resampled
        print(str(sorted(Counter(y_train).items())))
    elif resample == 1:
        #oversample
        X_resampled, y_resampled = SMOTE().fit_resample(X_train, y_train)
        X_train = X_resampled
        y_train = y_resampled
        print(sorted(Counter(y_resampled).items()))
    #write the training dataset class distribution to the file
    file = open(dir_name + 'train_val_dist.csv', 'a')
    file.write(str(sorted(Counter(y_train).items())))
    file.write('\n')
    file.close()

    model = get_model(model_num)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    if baseline == 0:
        # baseline: overwrite every prediction with the constant class 0
        y_pred = np.zeros_like(y_pred)
    elif baseline == 1:
        y_pred = np.ones_like(y_pred)

    plot_lc(model=model,
            cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=777),
            X=X,
            y=y)

    #evaluation
    metrics = get_metrics(y_test, y_pred)
    for key, value in metrics.items():
        metrics_dict[key] = value

    #correlation
    correlation(data)

    #linearity
    test_for_linearity(X_train, y_train)

    #homoscedasticity
    test_for_homoscedasticity(X_train, y_train, X_test, y_test)
    '''
    #learning curve
    #if model_num == 7:
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    #else:
    #cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=777)
    train_sizes, train_scores, test_scores = learning_curve(estimator = model, X = data[feature_set], y = data['label'], cv = cv, scoring = 'f1_macro', train_sizes=np.linspace(.1, 1.0, 10))
    # Create means and standard deviations of training set scores
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    print('scores: ', train_scores, train_mean)
    # Create means and standard deviations of test set scores
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    # Draw lines
    print('Learning Curve')
    plt.plot(train_sizes, train_mean, '--', color="#111111",  label="Training score")
    plt.plot(train_sizes, test_mean, color="#111111", label="Cross-validation score")
    # Draw bands
    plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, color="#DDDDDD")
    plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, color="#DDDDDD")

    # Create plot
    plt.title("Learning Curve")
    plt.xlabel("Training Set Size"), plt.ylabel("Macro-F1 Score"), plt.legend(loc="best")
    plt.tight_layout()
    plt.show()
    '''
    plot_learning_curves(X_train,
                         y_train,
                         X_test,
                         y_test,
                         model,
                         scoring='f1_macro')
    plt.show()

    if feature_importance == 1:
        feat_importances = pd.Series(model.feature_importances_,
                                     index=feature_set)
        print(feature_set)
        print('Feat: ', feat_importances)
        feat_importances.nlargest(20).plot(kind='barh')
        #plot_importance(model)
        plt.show()

        perm = PermutationImportance(model,
                                     random_state=1).fit(X_train, y_train)
        display(eli5.show_weights(perm,
                                  feature_names=X_train.columns.tolist()))

        #write the training dataset class distribution to the file
        file = open(dir_name + 'feature_importances.csv', 'a')
        for ind in range(0, len(feature_set)):
            file.write(feature_set[ind] + ',' + str(feat_importances[ind]) +
                       '\n')
        file.close()

        #write the permutation feature importance decrease in error values to the file
        file = open(dir_name + 'permutation_feature_importances.csv', 'a')
        print(perm.feature_importances_)
        for ind in range(0, len(feature_set)):
            file.write(feature_set[ind] + ',' +
                       str(perm.feature_importances_[ind]) + '\n')
        file.close()

    #write the scores to the file
    json.dump(metrics_dict, metrics_file)
    metrics_file.close()

    #write the configuration values to the file
    json.dump(data_dict, config_file)
    config_file.close()
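The resampling branch is the part of ensemble_learning that handles class imbalance, so here is a self-contained sketch of the same two imbalanced-learn calls on toy data; everything except NearMiss(version=3) and SMOTE() is illustrative:

from collections import Counter

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=300, weights=[0.9, 0.1], random_state=0)
print("original:", sorted(Counter(y).items()))

# NearMiss-3 undersamples the majority class based on nearest-neighbor distances.
X_under, y_under = NearMiss(version=3).fit_resample(X, y)
print("undersampled:", sorted(Counter(y_under).items()))

# SMOTE oversamples the minority class with synthetic interpolated samples.
X_over, y_over = SMOTE(random_state=0).fit_resample(X, y)
print("oversampled:", sorted(Counter(y_over).items()))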
Example no. 9
0
"""
Run evaluation on source, prediction and target corpus files.

"""
import sys
sys.path.append('../../src/style_transfer_baseline')
import evaluation
import models

src_path = sys.argv[1]
pred_path = sys.argv[2]
tgt_path = sys.argv[3]
classifier_path = "../../data/v2/eval_classifier"

eval_classifier = models.TextClassifier.from_pickle(classifier_path)

src = [x.strip().split() for x in open(src_path)]
pred = [x.strip().split() for x in open(pred_path)]
tgt = [x.strip().split() for x in open(tgt_path)]

print(evaluation.get_metrics(src, pred, tgt, classifier=eval_classifier))
Example no. 10
0
with open('log.txt', 'w') as file:
    file.write('k, knn_frac, min_overlap, map_k, cosine\n')

for i in range(len(k_vals)):
    for j in range(len(knn_frac_vals)):
        for k in range(len(min_overlap_vals)):
            print(song_df.shape)
            tuning_model = ALSpkNN(user_df,
                                   song_df,
                                   k_vals[i],
                                   knn_frac_vals[j],
                                   min_overlap_vals[k],
                                   cf_weighting_alpha=1)
            print("Fitting model...")
            tuning_model.fit(train_plays)
            metrics = get_metrics(
                metrics=['MAP@K', 'mean_cosine_list_dissimilarity'],
                N=20,
                model=tuning_model,
                train_user_items=train_plays.transpose(),
                test_user_items=test_plays.transpose(),
                song_df=song_df,
                limit=10)

            mapk = metrics['MAP@K']
            cosdis = metrics['cosine_list_dissimilarity']

            with open('log.txt', 'a') as file:
                file.write(
                    f'{k_vals[i]},{knn_frac_vals[j]},{min_overlap_vals[k]},{mapk},{cosdis}\n'
                )
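The three nested index loops can be flattened with itertools.product, which yields every parameter combination in one pass and lets the log line read directly from the loop variables. A stripped-down, self-contained sketch of that shape; the value lists are placeholders and the model fitting is omitted:

from itertools import product

k_vals = [5, 10]
knn_frac_vals = [0.25, 0.5]
min_overlap_vals = [1, 2]

with open('log.txt', 'w') as file:
    file.write('k, knn_frac, min_overlap\n')
    # product() replaces the three nested range(len(...)) loops.
    for k, knn_frac, min_overlap in product(k_vals, knn_frac_vals, min_overlap_vals):
        file.write(f'{k},{knn_frac},{min_overlap}\n')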
Example no. 11
0
def crossvalidate(directory_name,
                  splits,
                  data,
                  X,
                  y,
                  baseline=-1,
                  model_num=None,
                  resample=0,
                  feature_set=None,
                  feature_importance=0,
                  average_method='macro',
                  path=None):
    """
    Compute the results according to the arguments and store them in a file.
    Arguments:
    directory_name (str): the directory under which the files should be stored
    splits (int): number of folds
    data (dataframe): the whole dataset
    X (dataframe): examples
    y (dataframe): target/label
    baseline (int): -1 for no baseline, 1 for all predictions as 1, 0 for all predictions as 0
    model_num (int): classification model
    1: 
    2:
    3:
    4:
    5:
    6:
    resample (int): -1 for undersampling, 1 for oversampling and 0 for no resampling
    feature_set (list): list of features to be considered
    feature_importance (int): 0 for absent, 1 for present
    average_method: macro by default
    path: the path to the directory where the recordings should be stored
    """

    #prepare the dictionary to be written to the file
    data_dict = dict()
    metrics_dict = dict()

    dir_name = path + directory_name + '/'
    os.mkdir(dir_name)
    #create a directory for each split
    for fold in range(1, splits + 1):
        os.mkdir(dir_name + str(fold))
        print(dir_name + str(fold))
    #open the config file for writing
    config_file = open(dir_name + 'config.json', 'w')
    #open the metrics file for writing
    metrics_file = open(dir_name + 'metrics.json', 'w')

    data_dict = {'model_num': model_num}
    data_dict.update({'baseline': baseline})
    data_dict.update({'resample': resample})
    data_dict.update({'feature_set': feature_set})
    # number of features considered, derived from the supplied feature set
    data_dict.update({'n_features': len(feature_set) if feature_set else None})
    data_dict.update({'feature_importance': feature_importance})

    metrics_dict = dict()
    metrics_dict['f1_macro'] = list()
    metrics_dict['tpr'] = list()
    metrics_dict['tnr'] = list()
    metrics_dict['fpr'] = list()
    metrics_dict['precision'] = list()
    metrics_dict['recall'] = list()
    metrics_dict['accuracy'] = list()
    metrics_dict['f1'] = list()

    model = get_model(model_num)
    kfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=777)
    #if model_num == 3:
    #kfold = ShuffleSplit(n_splits=splits, test_size=0.2, random_state=0)

    plot_lc(model=model, cv=kfold, X=X, y=y, resample=resample)
    #linearity
    test_for_linearity(X, y)

    i = 0
    for train_index, test_index in kfold.split(X, y):
        #create train-test splits
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]
        '''
        #create test set labels for the baseline if applicable
        if baseline == 0:
            y_test = y_test.replace(1,0)
        elif baseline == 1:
            y_test = y_test.replace(0,1)
        '''
        #resample the training set (if applicable)
        if resample == -1:
            #undersample
            '''NearMiss-3 is a two-step algorithm: first, for each minority sample,
            its m nearest neighbors are kept; then, the majority samples selected are the
            ones for which the average distance to the k nearest neighbors is the largest.'''
            nm = NearMiss(version=3)
            print(str(sorted(Counter(y_train).items())))
            X_resampled, y_resampled = nm.fit_resample(X_train, y_train)
            X_train = X_resampled
            y_train = y_resampled
            print(sorted(Counter(y_train).items()))
        elif resample == 1:
            #oversample
            X_resampled, y_resampled = SMOTE().fit_resample(X_train, y_train)
            X_train = X_resampled
            y_train = y_resampled
            print(sorted(Counter(y_resampled).items()))
        #write the training dataset class distribution to the file
        file = open(dir_name + str(i + 1) + '/train_val_dist.csv', 'a')
        file.write(str(sorted(Counter(y_train).items())))
        file.write('\n')
        file.close()

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        if baseline == 0:
            # baseline: overwrite every prediction with the constant class 0
            y_pred = np.zeros_like(y_pred)
        elif baseline == 1:
            y_pred = np.ones_like(y_pred)

        metrics = get_metrics(y_test, y_pred)
        for key, value in metrics.items():
            metrics_dict[key].append(value)

        #homoscedasticity
        test_for_homoscedasticity(X_train, y_train, X_test, y_test)

        #correlation
        correlation(data)

        if feature_importance == 1:
            if model_num == 1:
                feat_importances = pd.Series(model.feature_importances_,
                                             index=X.columns)
            elif model_num == 3:
                feat_importances = pd.Series(abs(model.coef_[0]),
                                             index=X.columns)
            if model_num != 2:
                print('Feat. Imp.: ', feat_importances)
                feat_importances.nlargest(20).plot(kind='barh')
                #plot_importance(model)
                plt.show()

                #write the feature importance values to the file
                file = open(dir_name + str(i + 1) + '/feature_importances.csv',
                            'a')
                for ind in range(0, len(feature_set)):
                    file.write(feature_set[ind] + ',' +
                               str(feat_importances[ind]) + '\n')
                file.close()

            perm = PermutationImportance(model,
                                         random_state=1).fit(X_train, y_train)
            print('PERM: ', perm.feature_importances_)
            display(
                eli5.show_weights(perm,
                                  feature_names=X_train.columns.tolist()))

            #write the permutation feature importance decrease in error values to the file
            file = open(
                dir_name + str(i + 1) + '/permutation_feature_importances.csv',
                'a')
            for ind in range(0, len(feature_set)):
                file.write(feature_set[ind] + ',' +
                           str(perm.feature_importances_[ind]) + '\n')
            file.write('\n')
            file.close()

        i += 1
    for key, values in metrics_dict.items():
        metrics_dict[key] = sum(values) / len(values)

    #write the scores to the file
    json.dump(metrics_dict, metrics_file)
    metrics_file.close()

    #write the configuration values to the file
    json.dump(data_dict, config_file)
    config_file.close()
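The per-fold accumulation and averaging at the end of crossvalidate can also be written with collections.defaultdict, which removes the need to pre-declare every metric key. A small self-contained sketch of that pattern; the fold results are made-up numbers standing in for get_metrics output:

from collections import defaultdict

fold_results = [
    {'accuracy': 0.91, 'f1': 0.88},
    {'accuracy': 0.89, 'f1': 0.90},
    {'accuracy': 0.93, 'f1': 0.87},
]

metrics_dict = defaultdict(list)
for metrics in fold_results:
    for key, value in metrics.items():
        metrics_dict[key].append(value)  # a list is created on first use

averages = {key: sum(values) / len(values) for key, values in metrics_dict.items()}
print(averages)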