def evaluate_classifier(featxs, datasets):
    """Extract positive/negative features and print 5-fold
    cross-validation results for the decision-tree classifier.

    Parameters
    ----------
    featxs : feature-extractor callables forwarded to feature_extraction
    datasets : dataset objects forwarded to feature_extraction
    """
    posfeats, negfeats = feature_extraction(featxs, datasets)
    # BUG FIX: the header said "MLP" but the classifier below is a decision
    # tree (elsewhere in this project 'DT' pairs with classifier='decision_tree'
    # and 'MLP' with classifier='mlp_nn').
    print('\ncross validation DT')
    print(cross_validation(posfeats, negfeats, folds=5, classifier='decision_tree'))
def evaluate_classifier(featxs, datasets):
    """Run 5-fold cross-validation with a k-nearest-neighbors classifier
    over features extracted from the given datasets, printing the result."""
    pos, neg = feature_extraction(featxs, datasets)
    print('\ncross validation KNN')
    print(cross_validation(pos, neg, folds=5, classifier='k_neighbors'))
def evaluate_classifier(featxs, datasets):
    """Run 5-fold cross-validation with a naive-Bayes classifier.

    Punctuation is dropped during feature extraction for this variant.
    """
    pos, neg = feature_extraction(featxs, datasets, punctuation=False)
    print('\ncross validation NB')
    print(cross_validation(pos, neg, folds=5, classifier='naive_bayes'))
def evaluation_function(y, x):
    """Score one (labels, features) pair with a single number.

    Runs k-fold cross-validation using the module-level ``train_model``
    and ``number_of_folds``, then reduces the per-fold accuracies and
    F-scores into one scalar via ``cross_validation_score_reducer``.

    Parameters
    ----------
    y : np.ndarray
        Labels
    x : np.ndarray
        Features
    """
    fold_scores = validation.cross_validation(y, x, train_model, number_of_folds)
    # fold_scores is (accuracies, f_scores); forward both to the reducer.
    return cross_validation_score_reducer(*fold_scores)
# L2 regularization strength shared by every SGD classifier below.
reg = 0.0001


def single_train(data, label):
    """Fit one hinge-loss (linear-SVM-style) SGD classifier."""
    model = SGDClassifier(loss='hinge', penalty='l2', alpha=reg)
    return model.fit(data, label)


def train_both(data, label):
    """Fit one SGD classifier per label column (aspect, rating)."""
    aspect_model = SGDClassifier(loss='hinge', penalty='l2', alpha=reg)
    rating_model = SGDClassifier(loss='hinge', penalty='l2', alpha=reg)
    return aspect_model.fit(data, label[:, 0]), rating_model.fit(data, label[:, 1])


if __name__ == '__main__':
    dataset = loadFile.file2mat_bag_of_wordvec('./data/final_review_set.csv')
    shuffled = vld.data_reshuffle(dataset)
    # rbf_feature = RBFSampler(gamma=10)
    # train_mat = rbf_feature.fit_transform(shuffled[0])
    train_mat = shuffled[0]
    aspect_label = shuffled[1]
    rating_label = shuffled[2]
    label_mat = np.vstack((aspect_label, rating_label)).T
    # Fold (aspect, rating) into one combined class id for joint training.
    single_label = aspect_label * len(loadFile.aspect_dic) + rating_label
    print(vld.cross_validation(train_mat, aspect_label, single_train, vld.test_single))
    print(vld.cross_validation(train_mat, rating_label, single_train, vld.test_single))
    print(vld.cross_validation(train_mat, single_label, single_train, vld.test_single))
    print(vld.cross_validation(train_mat, single_label, single_train, vld.test_aspect))
    print(vld.cross_validation(train_mat, single_label, single_train, vld.test_rating))
    print(vld.cross_validation(train_mat, label_mat, train_both, vld.test_mat))
def main():
    """End-to-end evaluation: load protein embeddings and GO annotations,
    run cross-validation, and write the performance metrics to JSON."""
    # Prepare filesystem.
    directory_exists(models_path)
    mkdir(results_path)

    # Load the (first) .mat embeddings file found in models_path.
    embeddings_file = glob.glob(os.path.join(models_path, '*.mat'))[0]
    model_name = os.path.splitext(os.path.basename(embeddings_file))[0]
    print(model_name)
    stdout('Loading embeddings', embeddings_file)
    embeddings = minmax_scale(load_embeddings(embeddings_file))

    # Load GO annotations for the selected validation organism.
    annotation_dir = os.path.join(data_path, 'annotations')
    annot_name = ('cerevisiae_annotations.mat' if validation == 'cerevisiae'
                  else 'yeast_annotations.mat')
    annotation_file = os.path.join(annotation_dir, annot_name)
    stdout('Loading GO annotations', annotation_file)
    GO = sio.loadmat(annotation_file)

    # Train classifier via cross-validation at the requested GO level.
    stdout('Running cross-validation for', level)
    annotations = GO[level]

    # Silence certain warning messages during cross-validation.
    for warning_cls in (sklearn.exceptions.UndefinedMetricWarning,
                        UserWarning,
                        RuntimeWarning):
        warnings.filterwarnings("ignore", category=warning_cls)

    # Only use a subset of the data for testing purposes.
    embeddings = embeddings[:test]
    annotations = annotations[:test]

    performance = cross_validation(embeddings, annotations, n_trials=n_trials)
    performance['my_level'] = level
    pprint(performance)

    out_name = f'{model_name}_{level}_{clf_type}_performance.json'
    with open(os.path.join(results_path, out_name), 'w') as fh:
        json.dump(performance, fh)
""" # Author: Jael Zela <*****@*****.**> from feature_extraction import feature_extraction, bag_of_words, bigram_feats, tf_idf, part_of_speech from validation import cross_validation from datasets import g2crowd if __name__ == "__main__": posfeats, negfeats = feature_extraction([tf_idf], [g2crowd], stopwords=False, punctuation=False) print '\ncross validation NB' print cross_validation(posfeats, negfeats, folds=5, classifier='naive_bayes') print '\ncross validation SVM' print cross_validation(posfeats, negfeats, folds=5, classifier='svm') print '\ncross validation ME' print cross_validation(posfeats, negfeats, folds=5, classifier='maximum_entropy') #print '\ncross validation DT' #print cross_validation(posfeats, negfeats, folds=5, classifier='decision_tree') #print '\ncross validation RF' #print cross_validation(posfeats, negfeats, folds=5, classifier='random_forest') #print '\ncross validation MLP' #print cross_validation(posfeats, negfeats, folds=5, classifier='mlp_nn') #print '\ncross validation KNN'
def evaluate_classifier(featxs, datasets):
    """Run 5-fold cross-validation with a support-vector-machine
    classifier over features extracted from the given datasets."""
    pos, neg = feature_extraction(featxs, datasets)
    print('\ncross validation SVM')
    print(cross_validation(pos, neg, folds=5, classifier='svm'))
# Split features/labels for both preprocessing variants.
X_stem = stemmed_df["cleaned"]
y_stem = stemmed_df["label"]
X_lemma = lemmatized_df["cleaned"]
y_lemma = lemmatized_df["label"]

# Model
# BUG FIX: LogisticRegression's `fit_intercept` is a boolean flag; passing
# 0.4 to it merely acted as True. The 0.4 was clearly intended for
# `intercept_scaling` — the grid below tunes clf__intercept_scaling over
# (0.3, 0.4, 0.5).
log_reg = LogisticRegression(C=4.5, penalty="l2", multi_class='ovr',
                             solver='liblinear', max_iter=300, dual=False,
                             warm_start=True, fit_intercept=True,
                             intercept_scaling=0.4)

# Model parameters (grid for the optional grid search below)
params_log_reg = {
    'clf__max_iter': (150, 250, 350),
    'clf__intercept_scaling': (0.3, 0.4, 0.5)
    # 'clf__multi_class': ('ovr', 'multinomial'), # one vs all or multinomial, hence one vs all is better for logistic regression
    # 'clf__solver': ('newton-cg', 'sag', 'lbfgs','saga'),
}

# Number of cross validation folds
folds = 5

# Perform cross validation
print(cross_validation(model=log_reg, X=X_stem, y=y_stem, folds=folds))

# Perform Grid Search CV
# print(grid_search_cv(model=log_reg,X=X_stem, y=y_stem,params=params_log_reg, folds=folds))
def evaluate_classifier(featxs, datasets):
    """Run 5-fold cross-validation with a random-forest classifier
    over features extracted from the given datasets."""
    pos, neg = feature_extraction(featxs, datasets)
    print('\ncross validation RF')
    print(cross_validation(pos, neg, folds=5, classifier='random_forest'))
# Read data
df = pd.read_csv("preprocessed_reddit_train_SnowballStemmer.csv")
y_train = df["label"]
X_train = df["cleaned"]

# Create Ada Boosting classifier: boosted shallow decision trees.
clf = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(max_depth=4),
    n_estimators=1000,
    algorithm="SAMME.R",
    learning_rate=0.1,
)

# Parameter grid for the optional grid search below.
params = {
    'clf__n_estimators': (50, 100, 200),
    'clf__learning_rate': (0.5, 1.0, 1.5),
    'clf__algorithm': ("SAMME", "SAMME.R"),
}

# Number of cross validation folds
folds = 2

# Results obtained with:
#   AdaBoost(base_estimator=DecisionTreeClassifier(max_depth=4),
#            n_estimators=1000, algorithm="SAMME.R", learning_rate=0.1)

# Perform Cross-Validation to validate model
print(cross_validation(model=clf, X=X_train, y=y_train, folds=folds))

# Perform Grid Search CV to find the best parameters
# best_scores, best_params, best_estimator_params = grid_search_cv(model=clf, X=X_train, y=y_train, params=params, folds=folds)
def main(annot_fname, ont, model_name, data_folder, tax_ids, alpha,
         test_goid_fname, test_annot_fname=None,
         results_path='./results/test_results',
         block_matrix_folder='block_matrix_files/',
         network_folder='network_files/', use_orig_feats=False,
         use_nn=False, num_hyperparam_sets=None, arch_set=None, n_trials=5,
         save_only=False, load_fname=None, isorank_diag=False,
         subsample=False, lm_feat_path=None, lm_only=False):
    # Build (or load from `load_fname`) feature/label matrices, run
    # cross-validation (NN path or SVM path), print per-trial metrics,
    # and pickle the per-GO-term scores.
    #
    # Two data layouts exist:
    #  * test_annot_fname is None  -> single matrix pair (X, Y)
    #  * test_annot_fname is given -> held-out species split
    #    (X_test_species/Y_test_species vs X_rest/Y_rest)
    if load_fname is None:
        # Compute and align matrices from the raw annotation/network files.
        if test_annot_fname is None:
            X, Y, aligned_net_prots, test_goids = process_and_align_matrices(
                annot_fname, ont, model_name, data_folder, tax_ids, alpha,
                test_goid_fname, results_path=results_path,
                block_matrix_folder=block_matrix_folder,
                network_folder=network_folder,
                use_orig_feats=use_orig_feats, use_nn=use_nn,
                test_annot_fname=test_annot_fname,
                isorank_diag=isorank_diag, lm_feat_path=lm_feat_path,
                lm_only=lm_only)
        else:
            (X_rest, Y_rest, rest_prot_names, test_goids, X_test_species,
             Y_test_species, test_species_aligned_net_prots,
             all_string_prots) = process_and_align_matrices(
                annot_fname, ont, model_name, data_folder, tax_ids, alpha,
                test_goid_fname, results_path=results_path,
                block_matrix_folder=block_matrix_folder,
                network_folder=network_folder,
                use_orig_feats=use_orig_feats, use_nn=use_nn,
                test_annot_fname=test_annot_fname,
                isorank_diag=isorank_diag, lm_feat_path=lm_feat_path,
                lm_only=lm_only)
    else:
        # Reload previously pickled matrices instead of recomputing.
        load_file = pickle.load(open(load_fname, 'rb'))
        if test_annot_fname is None:
            X = load_file['X']
            Y = load_file['y']
            aligned_net_prots = load_file['prot_names']
            test_goids = load_file['test_goids']
        else:
            X_rest = load_file['X_rest']
            X_test_species = load_file['X_test_species']
            Y_rest = load_file['y_rest']
            Y_test_species = load_file['y_test_species']
            rest_prot_names = load_file['rest_prot_names']
            test_species_aligned_net_prots = load_file['test_species_prots']
            test_goids = load_file['test_goids']
    #print("Saving X and Y matrices")
    # TODO so I can use a DataGenerator in order to train the maxout nns without loading whole dataset in memory
    # But honestly, not that bad for now
    # NOTE(review): the triple-quoted block below is dead code (a no-op
    # string expression) that once dumped the matrices to disk and exited.
    '''
    trial_file = {}
    if test_annot_fname is None:
        trial_file['X'] = X
        trial_file['Y'] = Y
        trial_file['aligned_net_prots'] = aligned_net_prots
        trial_file['test_goids'] = test_goids
        pickle.dump(trial_file, open('./train_test_data/' + model_name + '_' + ont + '_train_test_data_file.pckl', 'wb'), protocol=4)
    else:
        trial_file['X_rest'] = X_rest
        trial_file['Y_rest'] = Y_rest
        trial_file['rest_prot_names'] = rest_prot_names
        trial_file['test_goids'] = test_goids
        trial_file['X_test_species'] = X_test_species
        trial_file['Y_test_species'] = Y_test_species
        trial_file['test_species_aligned_net_prots'] = test_species_aligned_net_prots
        pickle.dump(trial_file, open('./train_test_data/' + model_name + '_' + ont + '_one_spec_train_test_data_file.pckl', 'wb'), protocol=4)
    print(test_goids)
    exit()
    '''
    #output_projection_files(X, Y, model_name, ont, list(test_goids))
    # 5 fold cross val
    if use_nn:
        if test_annot_fname is not None:
            # Held-out-species evaluation with the neural-network models.
            perf, y_score_trials, pred_file = one_spec_cross_val(
                X_test_species, Y_test_species,
                test_species_aligned_net_prots, X_rest, Y_rest,
                rest_prot_names, test_goids, model_name, ont,
                n_trials=n_trials, num_hyperparam_sets=num_hyperparam_sets,
                arch_set=arch_set, save_only=save_only, subsample=subsample)
            pickle.dump(pred_file, open(results_path + model_name + '_one_spec_cv_use_nn_' + ont + '_pred_file.pckl', 'wb'))
        else:
            # Standard cross-validation with the neural-network models.
            perf, y_score_trials, pred_file = cross_validation_nn(
                X, Y, aligned_net_prots, test_goids, model_name, ont,
                n_trials=n_trials, num_hyperparam_sets=num_hyperparam_sets,
                arch_set=arch_set)
            pickle.dump(pred_file, open(results_path + model_name + '_cv_use_nn_' + ont + '_pred_file.pckl', 'wb'))
    else:
        # Non-NN path. NOTE(review): n_trials is hard-coded to 5 here,
        # ignoring the function's n_trials parameter — confirm intent.
        perf, y_score_trials, y_score_pred = cross_validation(X, Y, n_trials=5, X_pred=None)
    # Report per-trial metrics and the average micro-AUPR over trials.
    print('aupr[micro], aupr[macro], F_max, accuracy\n')
    avg_micro = 0.0
    for ii in range(0, len(perf['F1'])):
        print('%0.5f %0.5f %0.5f %0.5f' % (perf['pr_micro'][ii], perf['pr_macro'][ii], perf['F1'][ii], perf['acc'][ii]))
        avg_micro += perf['pr_micro'][ii]
    avg_micro /= len(perf['F1'])
    print ("### Average (over trials): m-AUPR = %0.3f" % (avg_micro))
    print
    # Tag the output file with which validation path produced it.
    if use_nn:
        val_type = 'nn'
    else:
        val_type = 'svm'
    pickle.dump(y_score_trials, open(results_path + model_name + "_goterm_" + ont + '_' + val_type + "_perf.pckl", "wb"))
# NOTE(review): this chunk starts mid-script; `idx`, `temp_mustlink`,
# `temp_cannotlink`, `idx_layer`, `emb` and `labels` are defined earlier
# (outside this view), and the first statements may originally sit inside
# an enclosing loop — confirm nesting against the full file.
# Average the accumulated must-link / cannot-link constraint counts over
# the other networks.
constraints_ml[idx] = np.floor(temp_mustlink / (FLAGS.net_nums - 1))
constraints_cl[idx] = np.floor(temp_cannotlink / (FLAGS.net_nums - 1))
# Each link appears twice in a symmetric matrix, hence the division by 2.
print(
    len(constraints_ml[idx].nonzero()[0]) / 2,
    len(constraints_cl[idx].nonzero()[0]) / 2)
input_dim = FLAGS.hidden_dim[idx_layer]
yeast_fusions = emb  # output embedding
# STRING channel names used to label the per-network embedding files.
str_nets = [
    'coexpression', 'cooccurence', 'database', 'experimental', 'fusion',
    'neighborhood'
]
# Write one embedding file per input network.
for idxx in range(FLAGS.net_nums):
    temp_path = './emb/' + FLAGS.org + '_' + str_nets[
        idxx] + '_' + FLAGS.optimizer + '_' + str(
            FLAGS.learning_rate[0]) + '_new.txt'
    write_encoded_file(emb[idxx], temp_path)
# Evaluate the learned embedding with cross-validation and report the
# averaged metrics over trials.
perf = cross_validation(emb, labels)
print(
    "Average (over trials) of DeepMNE: m-AUPR = %0.3f, M-AUPR = %0.3f, F1 = %0.3f, Acc = %0.3f"
    % (np.mean(perf['pr_micro']), np.mean(
        perf['pr_macro']), np.mean(perf['fmax']), np.mean(perf['acc'])))
# NOTE(review): bare `print` is a Python 2 statement (prints a blank line);
# under Python 3 it would be a no-op expression.
print
print(FLAGS.layers_num, FLAGS.optimizer, FLAGS.learning_rate,
      FLAGS.batch_size)
def train_both(data, label): clf1 = MultinomialNB() clf2 = MultinomialNB() return clf1.fit(data, label[:, 0]), clf2.fit(data, label[:, 1]) if __name__ == '__main__': total_data = loadFile.file2mat('./data/final_review_set.csv') shuffled_data = vld.data_reshuffle(total_data) train_mat = shuffled_data[0] aspect_label = shuffled_data[1] rating_label = shuffled_data[2] label_mat = np.vstack((aspect_label, rating_label)).T single_label = aspect_label * len(loadFile.aspect_dic) + rating_label print "SAS, aspect:\t", vld.cross_validation(train_mat, aspect_label, single_train, vld.test_single) print "SAS, rating:\t", vld.cross_validation(train_mat, rating_label, single_train, vld.test_single) print "SAS, both:\t", vld.cross_validation(train_mat, single_label, single_train, vld.test_single) print "JMAS, aspect:\t", vld.cross_validation(train_mat, single_label, single_train, vld.test_aspect) print "JMAS, rating:\t", vld.cross_validation(train_mat, single_label, single_train, vld.test_rating) print "JMAS, both:\t", vld.cross_validation(train_mat, label_mat, train_both, vld.test_mat)
max_iter=2000, C=1.1, tol=0.00005)
# NOTE(review): the line above closes a constructor call opened before this
# chunk (presumably LinearSVC/LogisticRegression) — confirm in the full file.
svc_clf = SVC(probability=True, kernel="linear",
              decision_function_shape="ovr", max_iter=2000, C=1.1,
              tol=0.00005)
# Voting Classifier ensembles: soft voting needs probability estimates,
# hence svc_clf (probability=True) in clf; clf2 uses hard majority voting.
clf = VotingClassifier(estimators=[('lr', log_reg), ("nb", multi_NB), ("svc", svc_clf)], voting="soft")
clf2 = VotingClassifier(estimators=[('lr', log_reg), ("nb", multi_NB), ("svc", linear_svc)], voting="hard")
clf3 = VotingClassifier(estimators=[('lr', log_reg), ("nb", multi_NB)], voting="soft")
# Number of cross validation folds
folds = 5
# Perform cross validation on the soft-voting ensemble only; clf2/clf3 are
# alternatives kept for experimentation.
print(cross_validation(model=clf, X=X_stem, y=y_stem, folds=folds))
# Perform Grid Search CV
# print(grid_search_cv(model=log_reg,X=X_stem, y=y_stem,params=params_log_reg, folds=folds))
# Predict on test set
# classify(clf)
Stemmed Lemmatized """ # Read DataFrame stemmed_df = pd.read_csv("preprocessed_reddit_train_SnowballStemmer.csv") # lemmatized_df = pd.read_csv("preprocessed_reddit_train_WordNetLemmatizer.csv") # Separate X and Y X_stem = stemmed_df["cleaned"] y_stem = stemmed_df["label"] # X_lemma = lemmatized_df["cleaned"] # y_lemma = lemmatized_df["label"] # Estimators multi_NB = MultinomialNB(alpha=0.225) # Model parameters params = { 'clf__alpha': (0.225, 0.25, 0.275), } # Number of folds for Cross Validation # Perform Cross-Validation to validate model print(cross_validation(model=multi_NB, X=X_stem, y=y_stem, folds=folds)) # Perform Grid Search CV to find the best parameters # best_scores, best_params, best_estimator_params = grid_search_cv(model=multi_NB, X=X_stem, y=y_stem, params=params, folds=5)
if Path.isfile(models_path + model_name): mid_model = load_model(models_path + model_name) else: print( "### Model % s does not exist. Check the 'models_path' directory.\n" % (model_name)) break mid_model = load_model(models_path + model_name) features = mid_model.predict(Nets) features = minmax_scale(features) sio.savemat(models_path + model_name.split('.')[0] + '_features.mat', {'features': features}) for level in annot: print("### Running for level: %s" % (level)) if valid_type == 'cv': perf = cross_validation(features, GO[level], n_trials=n_trials) else: perf = temporal_holdout(features, Annot['GO'][level].tolist(), Annot['indx'][level].tolist(), Annot['labels'][level].tolist()) fout.write('### %s goterms:\n' % (level)) fout.write('GO_id, AUPRs\n') for goid in perf['pr_goterms']: fout.write('%s' % (goid)) for pr in perf['pr_goterms'][goid]: fout.write(' %0.5f' % (pr)) fout.write('\n') fout.write('\n') fout.write('### %s trials:\n' % (level)) fout.write('aupr[micro], aupr[macro], F_max, accuracy\n') avg_micro = 0.0
import loadFile import numpy as np reg = 0.0001 def single_train(data, label): clf = SGDClassifier(loss='hinge', penalty='l2', alpha=reg) return clf.fit(data, label) def train_both(data, label): clf1 = SGDClassifier(loss='hinge', penalty='l2', alpha=reg) clf2 = SGDClassifier(loss='hinge', penalty='l2', alpha=reg) return clf1.fit(data, label[:, 0]), clf2.fit(data, label[:, 1]) if __name__ == '__main__': total_data = loadFile.file2mat('./data/final_review_set.csv') shuffled_data = vld.data_reshuffle(total_data) train_mat = shuffled_data[0] aspect_label = shuffled_data[1] rating_label = shuffled_data[2] label_mat = np.vstack((aspect_label, rating_label)).T single_label = aspect_label * len(loadFile.aspect_dic) + rating_label print "SAS, aspect:\t", vld.cross_validation(train_mat, aspect_label, single_train, vld.test_single) print "SAS, rating:\t", vld.cross_validation(train_mat, rating_label, single_train, vld.test_single) print "SAS, both:\t", vld.cross_validation(train_mat, single_label, single_train, vld.test_single) print "JMAS, aspect:\t", vld.cross_validation(train_mat, single_label, single_train, vld.test_aspect) print "JMAS, rating:\t", vld.cross_validation(train_mat, single_label, single_train, vld.test_rating) print "JMAS, both:\t", vld.cross_validation(train_mat, label_mat, train_both, vld.test_mat)
def train_both(data, label):
    """Fit one SGD classifier per label column (aspect, rating)."""
    aspect_model = SGDClassifier(loss='hinge', penalty='l2', alpha=reg)
    rating_model = SGDClassifier(loss='hinge', penalty='l2', alpha=reg)
    return aspect_model.fit(data, label[:, 0]), rating_model.fit(data, label[:, 1])


if __name__ == '__main__':
    dataset = loadFile.file2mat_bag_of_wordvec(
        './data/final_review_set.csv')
    shuffled = vld.data_reshuffle(dataset)
    # rbf_feature = RBFSampler(gamma=10)
    # train_mat = rbf_feature.fit_transform(shuffled[0])
    train_mat = shuffled[0]
    aspect_label = shuffled[1]
    rating_label = shuffled[2]
    label_mat = np.vstack((aspect_label, rating_label)).T
    # Fold (aspect, rating) into one combined class id for joint training.
    single_label = aspect_label * len(loadFile.aspect_dic) + rating_label
    print(vld.cross_validation(train_mat, aspect_label, single_train, vld.test_single))
    print(vld.cross_validation(train_mat, rating_label, single_train, vld.test_single))
    print(vld.cross_validation(train_mat, single_label, single_train, vld.test_single))
    print(vld.cross_validation(train_mat, single_label, single_train, vld.test_aspect))
    print(vld.cross_validation(train_mat, single_label, single_train, vld.test_rating))
    print(vld.cross_validation(train_mat, label_mat, train_both, vld.test_mat))