def evaluate_classifier(featxs, datasets):
    """Extract features and print a 5-fold cross-validation report.

    Both arguments are forwarded unchanged to ``feature_extraction``; the two
    feature collections it returns are then scored by ``cross_validation``
    with the ``'decision_tree'`` classifier.
    """
    posfeats, negfeats = feature_extraction(featxs, datasets)

    # The header used to read "MLP" although the classifier that actually
    # runs is 'decision_tree'; the label now names what is evaluated.
    # NOTE(review): if an MLP run was intended, change the ``classifier``
    # argument instead and restore the old header.
    # Parenthesised single-argument prints give the same output under
    # Python 2 and are also valid Python 3.
    print('\ncross validation DT')
    print(cross_validation(posfeats,
                           negfeats,
                           folds=5,
                           classifier='decision_tree'))
def evaluate_classifier(featxs, datasets):
    """Extract features and print a 5-fold cross-validation report for KNN.

    Both arguments are forwarded unchanged to ``feature_extraction``; the two
    feature collections it returns are then scored by ``cross_validation``
    with the ``'k_neighbors'`` classifier.
    """
    posfeats, negfeats = feature_extraction(featxs, datasets)

    # Parenthesised single-argument prints give the same output under
    # Python 2 and are also valid Python 3.
    print('\ncross validation KNN')
    print(cross_validation(posfeats,
                           negfeats,
                           folds=5,
                           classifier='k_neighbors'))
def evaluate_classifier(featxs, datasets):
    """Extract features and print a 5-fold cross-validation report for NB.

    Both arguments are forwarded to ``feature_extraction`` together with
    ``punctuation=False``; the two feature collections it returns are then
    scored by ``cross_validation`` with the ``'naive_bayes'`` classifier.
    """
    posfeats, negfeats = feature_extraction(featxs,
                                            datasets,
                                            punctuation=False)

    # Parenthesised single-argument prints give the same output under
    # Python 2 and are also valid Python 3.
    print('\ncross validation NB')
    print(cross_validation(posfeats,
                           negfeats,
                           folds=5,
                           classifier='naive_bayes'))
Ejemplo n.º 4
0
    def evaluation_function(y, x):
        """Cross-validate ``train_model`` on the samples and reduce the scores.

        The enclosing scope supplies ``train_model`` and ``number_of_folds``.
        The accuracies and F-scores returned by
        ``validation.cross_validation`` are handed to
        ``cross_validation_score_reducer``, whose result is returned (expected
        to be a single number).

        Parameters
        ----------
        y : np.ndarray
            Labels
        x : np.ndarray
            Features
        """
        fold_scores = validation.cross_validation(y, x, train_model,
                                                  number_of_folds)
        # Exactly two sequences are expected back: accuracies and F-scores.
        accs, f_scores = fold_scores
        return cross_validation_score_reducer(accs, f_scores)
Ejemplo n.º 5
0
# Regularisation strength passed as ``alpha`` to every SGDClassifier below.
reg = 0.0001


def single_train(data, label):
    """Fit one L2-regularised hinge-loss SGDClassifier and return it.

    ``data`` and ``label`` are handed straight to ``SGDClassifier.fit``.
    """
    return SGDClassifier(loss='hinge', penalty='l2', alpha=reg).fit(data, label)


def train_both(data, label):
    """Fit two independent SGD classifiers, one per column of ``label``.

    Returns a 2-tuple: the classifier fitted on ``label[:, 0]`` followed by
    the one fitted on ``label[:, 1]``. Both are trained on the same ``data``.
    """
    # Both estimators are created up front, then fitted in column order.
    models = [SGDClassifier(loss='hinge', penalty='l2', alpha=reg)
              for _ in range(2)]
    return tuple(model.fit(data, label[:, column])
                 for column, model in enumerate(models))

if __name__ == '__main__':
    # Load the review file and pass it through vld.data_reshuffle; the result
    # is indexed as (features, aspect labels, rating labels).
    total_data = loadFile.file2mat_bag_of_wordvec('./data/final_review_set.csv')
    shuffled_data = vld.data_reshuffle(total_data)
    # rbf_feature = RBFSampler(gamma=10)
    # train_mat = rbf_feature.fit_transform(shuffled_data[0])
    train_mat = shuffled_data[0]
    aspect_label = shuffled_data[1]
    rating_label = shuffled_data[2]

    # Two-column label matrix consumed by train_both (column 0: aspect,
    # column 1: rating), plus a single combined label per sample.
    label_mat = np.vstack((aspect_label, rating_label)).T
    single_label = aspect_label * len(loadFile.aspect_dic) + rating_label

    # (labels, training function, test function) for each run, in the order
    # the results are printed.
    runs = (
        (aspect_label, single_train, vld.test_single),
        (rating_label, single_train, vld.test_single),
        (single_label, single_train, vld.test_single),
        (single_label, single_train, vld.test_aspect),
        (single_label, single_train, vld.test_rating),
        (label_mat, train_both, vld.test_mat),
    )
    for labels, trainer, tester in runs:
        # Single-argument print(...) gives the same output under Python 2 and
        # is also valid Python 3, unlike the former print statements.
        print(vld.cross_validation(train_mat, labels, trainer, tester))
Ejemplo n.º 6
0
def main():
    """Cross-validate the first embeddings file found and save the scores as JSON.

    Relies on module-level settings defined outside this function
    (``models_path``, ``results_path``, ``data_path``, ``validation``,
    ``level``, ``test``, ``n_trials``, ``clf_type``).

    Raises
    ------
    FileNotFoundError
        If ``models_path`` contains no ``*.mat`` file.
    """
    ######################
    # Prepare filesystem #
    ######################

    directory_exists(models_path)
    mkdir(results_path)

    ###################
    # Load embeddings #
    ###################

    mat_files = glob.glob(os.path.join(models_path, '*.mat'))
    if not mat_files:
        # Previously this surfaced as a bare IndexError from ``[0]``.
        raise FileNotFoundError(
            f'No *.mat embeddings file found in {models_path!r}')
    # NOTE(review): glob order is not guaranteed; with several *.mat files the
    # one picked here may vary between runs or machines.
    embeddings_file = mat_files[0]
    model_name = os.path.splitext(os.path.basename(embeddings_file))[0]
    print(model_name)
    stdout('Loading embeddings', embeddings_file)
    embeddings = load_embeddings(embeddings_file)
    embeddings = minmax_scale(embeddings)

    #######################
    # Load GO annotations #
    #######################

    annotation_dir = os.path.join(data_path, 'annotations')
    if validation == 'cerevisiae':
        annotation_file = os.path.join(annotation_dir,
                                       'cerevisiae_annotations.mat')
    else:
        annotation_file = os.path.join(annotation_dir, 'yeast_annotations.mat')
    stdout('Loading GO annotations', annotation_file)

    GO = sio.loadmat(annotation_file)

    ####################
    # Train classifier #
    ####################

    stdout('Running cross-validation for', level)

    annotations = GO[level]

    # Silence certain warning messages during cross-validation
    for w in (sklearn.exceptions.UndefinedMetricWarning, UserWarning,
              RuntimeWarning):
        warnings.filterwarnings("ignore", category=w)

    # Only use a subset of the data for testing purposes: keep the first
    # ``test`` rows of both arrays so they stay aligned.
    embeddings = embeddings[:test]
    annotations = annotations[:test]

    # Fuller call kept for reference:
    # performance = cross_validation(
    #     embeddings,
    #     annotations,
    #     n_trials=n_trials,
    #     n_jobs=n_jobs,
    #     n_threads=n_threads,
    #     random_state=random_state,
    #     clf_type=clf_type,
    #     max_depth=max_depth[level])

    performance = cross_validation(embeddings, annotations, n_trials=n_trials)

    # Record which annotation level these scores belong to.
    performance['my_level'] = level

    pprint(performance)

    fout = f'{model_name}_{level}_{clf_type}_performance.json'

    with open(os.path.join(results_path, fout), 'w') as f:
        json.dump(performance, f)
Ejemplo n.º 7
0
"""

# Author: Jael Zela <*****@*****.**>

from feature_extraction import feature_extraction, bag_of_words, bigram_feats, tf_idf, part_of_speech
from validation import cross_validation
from datasets import g2crowd

if __name__ == "__main__":
    # Features are built once and shared by every classifier run below.
    posfeats, negfeats = feature_extraction([tf_idf], [g2crowd],
                                            stopwords=False,
                                            punctuation=False)

    # (header line, classifier name) for each enabled 5-fold run.
    runs = (
        ('\ncross validation NB', 'naive_bayes'),
        ('\ncross validation SVM', 'svm'),
        ('\ncross validation ME', 'maximum_entropy'),
        # Disabled in the original script:
        # ('\ncross validation DT', 'decision_tree'),
        # ('\ncross validation RF', 'random_forest'),
        # ('\ncross validation MLP', 'mlp_nn'),
        # A KNN run was disabled as well; its classifier name is not visible
        # in this script.
    )
    for header, classifier_name in runs:
        # Single-argument print(...) gives the same output under Python 2 and
        # is also valid Python 3, unlike the former print statements.
        print(header)
        print(cross_validation(posfeats,
                               negfeats,
                               folds=5,
                               classifier=classifier_name))
Ejemplo n.º 8
0
def evaluate_classifier(featxs, datasets):
    """Extract features and print a 5-fold cross-validation report for SVM.

    Both arguments are forwarded unchanged to ``feature_extraction``; the two
    feature collections it returns are then scored by ``cross_validation``
    with the ``'svm'`` classifier.
    """
    posfeats, negfeats = feature_extraction(featxs, datasets)

    # Parenthesised single-argument prints give the same output under
    # Python 2 and are also valid Python 3.
    print('\ncross validation SVM')
    print(cross_validation(posfeats, negfeats, folds=5, classifier='svm'))
Ejemplo n.º 9
0
# Features / labels for the stemmed and the lemmatized variants of the data.
X_stem = stemmed_df["cleaned"]
y_stem = stemmed_df["label"]
X_lemma = lemmatized_df["cleaned"]
y_lemma = lemmatized_df["label"]

# Model
# ``fit_intercept`` is a boolean switch. The former value 0.4 only worked
# because it is truthy, and scikit-learn versions that validate parameter
# types reject it. ``True`` keeps the behavior the old value had.
# NOTE(review): the grid below tunes ``intercept_scaling`` around 0.4, so
# ``intercept_scaling=0.4`` may be what was originally intended - confirm.
log_reg = LogisticRegression(C=4.5,
                             penalty="l2",
                             multi_class='ovr',
                             solver='liblinear',
                             max_iter=300,
                             dual=False,
                             warm_start=True,
                             fit_intercept=True)

# Model parameters (grid for the commented-out grid search below; the
# ``clf__`` prefix presumably addresses a pipeline step named "clf").
params_log_reg = {
    'clf__max_iter': (150, 250, 350),
    'clf__intercept_scaling': (0.3, 0.4, 0.5)
    # 'clf__multi_class': ('ovr', 'multinomial'), # one vs all or multinomial, hence one vs all is better for logistic regression
    # 'clf__solver': ('newton-cg', 'sag', 'lbfgs','saga'),
}

# Number of cross validation folds
folds = 5

# Perform cross validation
print(cross_validation(model=log_reg, X=X_stem, y=y_stem, folds=folds))

# Perform Grid Search CV
# print(grid_search_cv(model=log_reg,X=X_stem, y=y_stem,params=params_log_reg, folds=folds))
Ejemplo n.º 10
0
def evaluate_classifier(featxs, datasets):
    """Extract features and print a 5-fold cross-validation report for RF.

    Both arguments are forwarded unchanged to ``feature_extraction``; the two
    feature collections it returns are then scored by ``cross_validation``
    with the ``'random_forest'`` classifier.
    """
    posfeats, negfeats = feature_extraction(featxs, datasets)

    # Parenthesised single-argument prints give the same output under
    # Python 2 and are also valid Python 3.
    print('\ncross validation RF')
    print(cross_validation(posfeats,
                           negfeats,
                           folds=5,
                           classifier='random_forest'))
Ejemplo n.º 11
0
# Load the preprocessed training file and pick the label / text columns.
df = pd.read_csv("preprocessed_reddit_train_SnowballStemmer.csv")
y_train, X_train = df["label"], df["cleaned"]

# AdaBoost ensemble built on depth-4 decision trees.
clf = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(max_depth=4),
    n_estimators=1000,
    algorithm="SAMME.R",
    learning_rate=0.1,
)

# Search grid for the (currently disabled) grid search at the bottom.
params = dict(
    clf__n_estimators=(50, 100, 200),
    clf__learning_rate=(0.5, 1.0, 1.5),
    clf__algorithm=("SAMME", "SAMME.R"),
)

# How many folds the cross-validation below uses.
folds = 2
""" 
Results 
AdaBoost(base_estimator=DecisionTreeClassifier(max_depth=4), n_estimators=1000, algorithm="SAMME.R", learning_rate=0.1)

"""

# Validate the model with cross-validation and print whatever it returns.
cv_result = cross_validation(model=clf, X=X_train, y=y_train, folds=folds)
print(cv_result)

# Perform Grid Search CV to find the best parameters
# best_scores, best_params, best_estimator_params = grid_search_cv(model=clf, X=X_train, y=y_train, params=params, folds=folds)
Ejemplo n.º 12
0
def _dump_pickle(obj, path):
    """Pickle ``obj`` to ``path`` and close the file handle afterwards."""
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)


def main(annot_fname, ont, model_name, data_folder, tax_ids, alpha, test_goid_fname, test_annot_fname=None,
        results_path='./results/test_results', block_matrix_folder='block_matrix_files/',
        network_folder='network_files/', use_orig_feats=False, use_nn=False,
        num_hyperparam_sets=None, arch_set=None, n_trials=5, save_only=False, load_fname=None,
        isorank_diag=False, subsample=False, lm_feat_path=None, lm_only=False):
    """Build or load the feature/label matrices, cross-validate, and pickle the scores.

    Parameters
    ----------
    annot_fname, ont, model_name, data_folder, tax_ids, alpha, test_goid_fname
        Forwarded to ``process_and_align_matrices`` when ``load_fname`` is
        None. ``model_name`` and ``ont`` are also used in output file names.
    test_annot_fname : optional
        When given, the "one species" data layout (``X_rest`` /
        ``X_test_species`` ...) is used instead of a single ``X`` / ``Y`` pair.
    results_path : str
        Prefix for every pickle written; it is joined to the file name by
        plain string concatenation, not as a directory.
    block_matrix_folder, network_folder, use_orig_feats, isorank_diag, lm_feat_path, lm_only
        Forwarded to ``process_and_align_matrices``.
    use_nn : bool
        Selects the NN routines (``one_spec_cross_val`` /
        ``cross_validation_nn``) instead of ``cross_validation``.
    num_hyperparam_sets, arch_set
        Forwarded to the NN routines.
    n_trials : int
        Number of trials passed to whichever cross-validation routine runs.
    save_only, subsample
        Forwarded to ``one_spec_cross_val`` only.
    load_fname : optional
        Pickle file to read the matrices from instead of recomputing them.
    """
    one_species = test_annot_fname is not None

    if load_fname is None:
        aligned = process_and_align_matrices(annot_fname,
            ont, model_name, data_folder, tax_ids, alpha, test_goid_fname,
            results_path=results_path, block_matrix_folder=block_matrix_folder,
            network_folder=network_folder, use_orig_feats=use_orig_feats,
            use_nn=use_nn, test_annot_fname=test_annot_fname,
            isorank_diag=isorank_diag, lm_feat_path=lm_feat_path, lm_only=lm_only)
        if one_species:
            # The eighth returned value is not needed here.
            (X_rest, Y_rest, rest_prot_names, test_goids, X_test_species,
                Y_test_species, test_species_aligned_net_prots, _) = aligned
        else:
            X, Y, aligned_net_prots, test_goids = aligned
    else:
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with open(load_fname, 'rb') as handle:
            load_file = pickle.load(handle)
        if one_species:
            X_rest = load_file['X_rest']
            X_test_species = load_file['X_test_species']
            Y_rest = load_file['y_rest']
            Y_test_species = load_file['y_test_species']
            rest_prot_names = load_file['rest_prot_names']
            test_species_aligned_net_prots = load_file['test_species_prots']
            test_goids = load_file['test_goids']
        else:
            X = load_file['X']
            Y = load_file['y']
            aligned_net_prots = load_file['prot_names']
            test_goids = load_file['test_goids']

    # TODO: save the X and Y matrices so a DataGenerator can train the maxout
    # nns without loading the whole dataset in memory (not urgent for now).

    #output_projection_files(X, Y, model_name, ont, list(test_goids))
    # 5 fold cross val
    if use_nn:
        if one_species:
            perf, y_score_trials, pred_file = one_spec_cross_val(X_test_species,
                    Y_test_species, test_species_aligned_net_prots, X_rest, Y_rest,
                    rest_prot_names, test_goids, model_name, ont, n_trials=n_trials,
                    num_hyperparam_sets=num_hyperparam_sets, arch_set=arch_set, save_only=save_only,
                    subsample=subsample)
            _dump_pickle(pred_file, results_path
                    + model_name + '_one_spec_cv_use_nn_'
                    + ont + '_pred_file.pckl')
        else:
            perf, y_score_trials, pred_file = cross_validation_nn(X, Y,
                aligned_net_prots, test_goids, model_name, ont, n_trials=n_trials,
                num_hyperparam_sets=num_hyperparam_sets, arch_set=arch_set)
            _dump_pickle(pred_file, results_path
                    + model_name + '_cv_use_nn_'
                    + ont + '_pred_file.pckl')
    else:
        # NOTE(review): X and Y only exist when test_annot_fname is None, so
        # this branch fails with a NameError for the one-species layout.
        # The trial count used to be hard-coded to 5 here, which ignored the
        # caller's ``n_trials``; the default is still 5.
        perf, y_score_trials, _ = cross_validation(X, Y,
                n_trials=n_trials, X_pred=None)

    print('aupr[micro], aupr[macro], F_max, accuracy\n')
    n_rows = len(perf['F1'])
    avg_micro = 0.0
    for ii in range(n_rows):
        print('%0.5f %0.5f %0.5f %0.5f'
                % (perf['pr_micro'][ii], perf['pr_macro'][ii], perf['F1'][ii], perf['acc'][ii]))
        avg_micro += perf['pr_micro'][ii]
    avg_micro /= n_rows
    print ("### Average (over trials): m-AUPR = %0.3f" % (avg_micro))
    # A bare ``print`` is a no-op expression on Python 3; call it so the
    # blank separator line is actually emitted.
    print()
    val_type = 'nn' if use_nn else 'svm'
    _dump_pickle(y_score_trials,
            results_path + model_name
                + "_goterm_" + ont + '_' + val_type + "_perf.pckl")
Ejemplo n.º 13
0
            constraints_ml[idx] = np.floor(temp_mustlink /
                                           (FLAGS.net_nums - 1))
            constraints_cl[idx] = np.floor(temp_cannotlink /
                                           (FLAGS.net_nums - 1))
            print(
                len(constraints_ml[idx].nonzero()[0]) / 2,
                len(constraints_cl[idx].nonzero()[0]) / 2)

    input_dim = FLAGS.hidden_dim[idx_layer]
    yeast_fusions = emb

# Write one embedding file per network; the file name combines organism,
# network name, optimizer and the first learning rate.
str_nets = [
    'coexpression',
    'cooccurence',
    'database',
    'experimental',
    'fusion',
    'neighborhood',
]
for idxx in range(FLAGS.net_nums):
    name_parts = [
        FLAGS.org,
        str_nets[idxx],
        FLAGS.optimizer,
        str(FLAGS.learning_rate[0]),
    ]
    temp_path = './emb/' + '_'.join(name_parts) + '_new.txt'
    write_encoded_file(emb[idxx], temp_path)

perf = cross_validation(emb, labels)

# Mean of each reported metric, in the order the format string expects them.
mean_scores = tuple(
    np.mean(perf[metric]) for metric in ('pr_micro', 'pr_macro', 'fmax', 'acc'))
print(
    "Average (over trials) of DeepMNE: m-AUPR = %0.3f, M-AUPR = %0.3f, F1 = %0.3f, Acc = %0.3f"
    % mean_scores)
print

# Echo the training configuration the results belong to.
print(FLAGS.layers_num, FLAGS.optimizer, FLAGS.learning_rate, FLAGS.batch_size)
Ejemplo n.º 14
0

def train_both(data, label):
    """Fit two independent MultinomialNB models, one per column of ``label``.

    Returns a 2-tuple: the model fitted on ``label[:, 0]`` followed by the
    one fitted on ``label[:, 1]``. Both are trained on the same ``data``.
    """
    # Both estimators are created up front, then fitted in column order.
    models = [MultinomialNB() for _ in range(2)]
    return tuple(model.fit(data, label[:, column])
                 for column, model in enumerate(models))


if __name__ == '__main__':
    total_data = loadFile.file2mat('./data/final_review_set.csv')
    shuffled_data = vld.data_reshuffle(total_data)
    train_mat = shuffled_data[0]
    aspect_label = shuffled_data[1]
    rating_label = shuffled_data[2]
    label_mat = np.vstack((aspect_label, rating_label)).T
    single_label = aspect_label * len(loadFile.aspect_dic) + rating_label
    print "SAS, aspect:\t", vld.cross_validation(train_mat, aspect_label,
                                                 single_train, vld.test_single)
    print "SAS, rating:\t", vld.cross_validation(train_mat, rating_label,
                                                 single_train, vld.test_single)
    print "SAS, both:\t", vld.cross_validation(train_mat, single_label,
                                               single_train, vld.test_single)
    print "JMAS, aspect:\t", vld.cross_validation(train_mat, single_label,
                                                  single_train,
                                                  vld.test_aspect)
    print "JMAS, rating:\t", vld.cross_validation(train_mat, single_label,
                                                  single_train,
                                                  vld.test_rating)
    print "JMAS, both:\t", vld.cross_validation(train_mat, label_mat,
                                                train_both, vld.test_mat)
Ejemplo n.º 15
0
                       max_iter=2000,
                       C=1.1,
                       tol=0.00005)
# Linear-kernel SVC with probability estimates enabled, as needed by the
# soft-voting ensemble below.
svc_clf = SVC(
    kernel="linear",
    probability=True,
    decision_function_shape="ovr",
    C=1.1,
    tol=0.00005,
    max_iter=2000,
)

# Voting ensembles: every variant shares the logistic-regression and
# naive-Bayes members and differs in the SVM member and the voting mode.
shared_estimators = [('lr', log_reg), ("nb", multi_NB)]
clf = VotingClassifier(
    estimators=shared_estimators + [("svc", svc_clf)],
    voting="soft",
)
clf2 = VotingClassifier(
    estimators=shared_estimators + [("svc", linear_svc)],
    voting="hard",
)
clf3 = VotingClassifier(estimators=list(shared_estimators), voting="soft")

# How many folds the cross-validation below uses.
folds = 5

# Cross-validate the soft-voting ensemble with the SVC member.
print(cross_validation(model=clf, X=X_stem, y=y_stem, folds=folds))

# Perform Grid Search CV
# print(grid_search_cv(model=log_reg,X=X_stem, y=y_stem,params=params_log_reg, folds=folds))

# Predict on test set
# classify(clf)
Ejemplo n.º 16
0
Stemmed 

Lemmatized 
"""

# Read DataFrame
stemmed_df = pd.read_csv("preprocessed_reddit_train_SnowballStemmer.csv")
# lemmatized_df = pd.read_csv("preprocessed_reddit_train_WordNetLemmatizer.csv")

# Separate X and Y
X_stem = stemmed_df["cleaned"]
y_stem = stemmed_df["label"]
# X_lemma = lemmatized_df["cleaned"]
# y_lemma = lemmatized_df["label"]

# Estimators
multi_NB = MultinomialNB(alpha=0.225)

# Model parameters (grid for the commented-out grid search below)
params = {
    'clf__alpha': (0.225, 0.25, 0.275),
}

# Number of folds for Cross Validation
# ``folds`` was used below without being assigned in this script, which
# raises a NameError. 5 matches the value in the commented-out grid search.
folds = 5

# Perform Cross-Validation to validate model
print(cross_validation(model=multi_NB, X=X_stem, y=y_stem, folds=folds))

# Perform Grid Search CV to find the best parameters
# best_scores, best_params, best_estimator_params = grid_search_cv(model=multi_NB, X=X_stem, y=y_stem, params=params, folds=5)
Ejemplo n.º 17
0
 if Path.isfile(models_path + model_name):
     mid_model = load_model(models_path + model_name)
 else:
     print(
         "### Model % s does not exist. Check the 'models_path' directory.\n"
         % (model_name))
     break
 mid_model = load_model(models_path + model_name)
 features = mid_model.predict(Nets)
 features = minmax_scale(features)
 sio.savemat(models_path + model_name.split('.')[0] + '_features.mat',
             {'features': features})
 for level in annot:
     print("### Running for level: %s" % (level))
     if valid_type == 'cv':
         perf = cross_validation(features, GO[level], n_trials=n_trials)
     else:
         perf = temporal_holdout(features, Annot['GO'][level].tolist(),
                                 Annot['indx'][level].tolist(),
                                 Annot['labels'][level].tolist())
         fout.write('### %s goterms:\n' % (level))
         fout.write('GO_id, AUPRs\n')
         for goid in perf['pr_goterms']:
             fout.write('%s' % (goid))
             for pr in perf['pr_goterms'][goid]:
                 fout.write(' %0.5f' % (pr))
             fout.write('\n')
         fout.write('\n')
     fout.write('### %s trials:\n' % (level))
     fout.write('aupr[micro], aupr[macro], F_max, accuracy\n')
     avg_micro = 0.0
Ejemplo n.º 18
0
import loadFile
import numpy as np

# Regularisation strength passed as ``alpha`` to every SGDClassifier below.
reg = 0.0001


def single_train(data, label):
    """Fit one L2-regularised hinge-loss SGDClassifier and return it.

    ``data`` and ``label`` are handed straight to ``SGDClassifier.fit``.
    """
    return SGDClassifier(loss='hinge', penalty='l2', alpha=reg).fit(data, label)


def train_both(data, label):
    """Fit two independent SGD classifiers, one per column of ``label``.

    Returns a 2-tuple: the classifier fitted on ``label[:, 0]`` followed by
    the one fitted on ``label[:, 1]``. Both are trained on the same ``data``.
    """
    # Both estimators are created up front, then fitted in column order.
    models = [SGDClassifier(loss='hinge', penalty='l2', alpha=reg)
              for _ in range(2)]
    return tuple(model.fit(data, label[:, column])
                 for column, model in enumerate(models))

if __name__ == '__main__':
    total_data = loadFile.file2mat('./data/final_review_set.csv')
    shuffled_data = vld.data_reshuffle(total_data)
    train_mat = shuffled_data[0]
    aspect_label = shuffled_data[1]
    rating_label = shuffled_data[2]
    label_mat = np.vstack((aspect_label, rating_label)).T
    single_label = aspect_label * len(loadFile.aspect_dic) + rating_label
    print "SAS, aspect:\t", vld.cross_validation(train_mat, aspect_label, single_train, vld.test_single)
    print "SAS, rating:\t", vld.cross_validation(train_mat, rating_label, single_train, vld.test_single)
    print "SAS, both:\t", vld.cross_validation(train_mat, single_label, single_train, vld.test_single)
    print "JMAS, aspect:\t", vld.cross_validation(train_mat, single_label, single_train, vld.test_aspect)
    print "JMAS, rating:\t", vld.cross_validation(train_mat, single_label, single_train, vld.test_rating)
    print "JMAS, both:\t", vld.cross_validation(train_mat, label_mat, train_both, vld.test_mat)
Ejemplo n.º 19
0

def train_both(data, label):
    """Fit two independent SGD classifiers, one per column of ``label``.

    Returns a 2-tuple: the classifier fitted on ``label[:, 0]`` followed by
    the one fitted on ``label[:, 1]``. Both are trained on the same ``data``.
    """
    # Both estimators are created up front, then fitted in column order.
    models = [SGDClassifier(loss='hinge', penalty='l2', alpha=reg)
              for _ in range(2)]
    return tuple(model.fit(data, label[:, column])
                 for column, model in enumerate(models))


if __name__ == '__main__':
    # Load the review file and pass it through vld.data_reshuffle; the result
    # is indexed as (features, aspect labels, rating labels).
    total_data = loadFile.file2mat_bag_of_wordvec(
        './data/final_review_set.csv')
    shuffled_data = vld.data_reshuffle(total_data)
    # rbf_feature = RBFSampler(gamma=10)
    # train_mat = rbf_feature.fit_transform(shuffled_data[0])
    train_mat = shuffled_data[0]
    aspect_label = shuffled_data[1]
    rating_label = shuffled_data[2]

    # Two-column label matrix consumed by train_both (column 0: aspect,
    # column 1: rating), plus a single combined label per sample.
    label_mat = np.vstack((aspect_label, rating_label)).T
    single_label = aspect_label * len(loadFile.aspect_dic) + rating_label

    # (labels, training function, test function) for each run, in the order
    # the results are printed.
    runs = (
        (aspect_label, single_train, vld.test_single),
        (rating_label, single_train, vld.test_single),
        (single_label, single_train, vld.test_single),
        (single_label, single_train, vld.test_aspect),
        (single_label, single_train, vld.test_rating),
        (label_mat, train_both, vld.test_mat),
    )
    for labels, trainer, tester in runs:
        # Single-argument print(...) gives the same output under Python 2 and
        # is also valid Python 3, unlike the former print statements.
        print(vld.cross_validation(train_mat, labels, trainer, tester))