class StateWikiClassifier():
    DATABASE = "us_twitter.db"

    def __init__(self):
        db_mgr = DataManager(self.DATABASE)
        self.train_tweets, self.train_labels = db_mgr.select_wikipedia_train()
        self.vectorizer = get_vectorizer("tfidf", min_df=1)
        self.nb = Classifier(classifier="nb")
        self.train_data = self.vectorizer.fit_transform(self.train_tweets)
        self.nb.fit(self.train_data, self.train_labels)

    def predict(self, text):
        text = text.lower()
        results = self.nb.predict(self.vectorizer.transform([text]))
        return results[0], FIPS_DEFINITIONS[results[0]]
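# Hypothetical usage sketch for StateWikiClassifier: DataManager, get_vectorizer,
# FIPS_DEFINITIONS and the "us_twitter.db" database are assumed to be provided by the
# surrounding project, and the example tweet is illustrative only.
if __name__ == "__main__":
    clf = StateWikiClassifier()
    fips_code, fips_definition = clf.predict("Stuck in traffic near Denver again this morning")
    print(fips_code, fips_definition)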
def main(argv):
    image_path = ''
    clsf = 'knn'
    try:
        # '--help' takes no value, so it must not carry a trailing '='
        opts, args = getopt.getopt(argv, 'hi:c:',
                                   ['help', 'image=', 'classifier='])
    except getopt.GetoptError:
        print '\nusage: python sum_digits.py -i <image>\n'
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print '\nusage: python sum_digits.py -i <image>'
            print '\nOptions:'
            print '\t-h [--help], : Show help'
            print '\t-i [--image], : Path to the image'
            print '\t-c [--classifier], : Specify classifier. KNN (default) or CNN'
            sys.exit()
        elif opt in ('-i', '--image'):
            image_path = arg
        else:
            clsf = arg.lower()

    print '\n=> Initializing classifier ...'
    # Initialize classifier, default classifier is kNN
    classifier = Classifier(clsf)

    print '=> Calculating sum ...'
    print '\nSum: %.1f' % sum_digits(image_path, classifier)
class SurrogateModel(object):

    def __init__(self, fitness, configuration, controller):
        self.fitness = fitness
        self.configuration = configuration
        self.classifier = Classifier()
        self.regressor = Regressor(controller)

    def train(self, pop):
        raise NotImplementedError('SurrogateModel is an abstract class, this '
                                  'should not be called.')

    def model_particles(self, particles):
        MU, S2 = self.regressor.predict(particles)
        return self.classifier.predict(particles), MU, S2

    def add_training_instance(self, part, code, fitness):
        pass

    def __getstate__(self):
        # Don't pickle fitness and configuration
        d = dict(self.__dict__)
        del d['fitness']
        del d['configuration']
        return d
def train(dataSetTrain, dataSetTest, dataSetTest2, dataSetTest3):
    # Start an interactive session
    sess = tf.InteractiveSession()

    if FLAGS.hidden3 == -1:
        classifier = Classifier(num_class=10,
                                num_features=784,
                                fc_hidden_units=[FLAGS.hidden1, FLAGS.hidden2],
                                apply_dropout=True,
                                checkpoint_path=FLAGS.checkpoints_dir)
    else:
        classifier = Classifier(
            num_class=10,
            num_features=784,
            fc_hidden_units=[FLAGS.hidden1, FLAGS.hidden2, FLAGS.hidden3],
            apply_dropout=True,
            checkpoint_path=FLAGS.checkpoints_dir)

    print('\nTraining on DataSet started...')
    print('____________________________________________________________')
    print(time.strftime('%X %x %Z'))
    print("Total updates: %s " % ((55000 // FLAGS.batch_size) * FLAGS.epochs))

    testdatalist = [dataSetTest, dataSetTest2, dataSetTest3]
    if FLAGS.test2_classes is None and FLAGS.test3_classes is None:
        testdatalist = [dataSetTest]

    Classifier.train_mod(
        classifier,
        sess=sess,
        model_name=FLAGS.save_model if FLAGS.save_model is not None else "",
        model_init_name=FLAGS.load_model,
        dataset=dataSetTrain,
        num_updates=(FLAGS.max_steps * FLAGS.batch_size * FLAGS.epochs //
                     FLAGS.batch_size) * FLAGS.epochs,
        dataset_lagged=[0],
        mini_batch_size=FLAGS.batch_size,
        log_frequency=LOG_FREQUENCY,
        fisher_multiplier=1.0 / FLAGS.learning_rate,
        learning_rate=FLAGS.learning_rate,
        testing_data_sets=testdatalist,
        plot_files=[FLAGS.plot_file, FLAGS.plot_file2, FLAGS.plot_file3],
        start_at_step=FLAGS.start_at_step)
def kfold(orig_data, orig_labels, test_data, clf: classifiers.Classifier):
    results = []
    predictions = []
    best_k = 0
    for k in range(len(orig_data)):
        print("Iteration", k + 1, "over", len(orig_data))
        data, labels = orig_data[:], orig_labels[:]
        val_data = data.pop(k)
        val_labels = labels.pop(k)
        train_data = np.concatenate(data)
        train_labels = np.concatenate(labels)

        print("Fitting...")
        clf.fit(train_data, train_labels)

        print("Evaluating...")
        results.append(clf.evaluate(val_data, val_labels))
        print(results[k])

        print("Predicting...")
        predictions.append(clf.predict(test_data))
        print(predictions[k])

        if results[k]["Accuracy"] > results[best_k]["Accuracy"]:
            best_k = k
    return results[best_k], predictions[best_k]
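# Hypothetical usage sketch for kfold(): orig_data and orig_labels are expected to be
# lists with one numpy array per fold, since the function pops one fold out as the
# validation set on each iteration. Constructing classifiers.Classifier() with no
# arguments is an assumption; any object exposing fit/evaluate/predict as used above works.
import numpy as np
import classifiers

folds_data = [np.random.rand(20, 5) for _ in range(3)]          # 3 folds, 5 features
folds_labels = [np.random.randint(0, 2, 20) for _ in range(3)]  # binary labels per fold
held_out = np.random.rand(8, 5)                                 # unseen test samples

clf = classifiers.Classifier()
best_result, best_prediction = kfold(folds_data, folds_labels, held_out, clf)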
def train_and_test(df, preds, seed):
    '''
    Run a single trial:
        Shuffle df and split it into training and testing subsets
        Train a new model based on the training set
        Test the model with the testing set
        Add prediction data into the preds array
    :param df: dataframe with the full set of all available samples
        columns: id, cat1 (primary class), cat2 (secondary), title, titlen (cleaned title)
    :param preds: an array of predictions; each prediction is a dictionary
        cat: true category, pred: predicted category,
        conf: model confidence in its prediction (< 1.0),
        title: actual title of the chapter/sample
    :return: (classifier key, testing accuracy, training dataframe)
    '''
    ret = {}

    # PREPS
    # randomly split the dataset
    df = utils.split_dataset(
        df,
        settings.CAT_DEPTH,
        settings.TRAIN_PER_CLASS_MIN,
        settings.TEST_PER_CLASS,
        settings.VALID_PER_CLASS,
    )

    # TRAIN
    classifier = Classifier.from_name(settings.CLASSIFIER, seed)
    classifier.set_datasets(df, titles_out_path)
    classifier.train()

    df_test = classifier.df_test

    if settings.EVALUATE_TRAINING_SET:
        evaluate_model(classifier, classifier.df_train,
                       display_prefix='TRAIN = ')
    accuracy = evaluate_model(classifier, df_test, preds,
                              display_prefix='TEST = ')

    classifier_key = utils.get_exp_key(classifier)

    classifier.release_resources()

    return classifier_key, accuracy, classifier.df_train
def train_classification_models(X, y, ground_truth) -> None:
    """
    :param X:
    :param y:
    :param ground_truth:
    :return:
    """
    # Start recording time.
    start_time = time.time()

    # Create classifier model instance.
    Classifier(config.model, X, y, ground_truth)

    # Print training runtime.
    print_runtime(round(time.time() - start_time, 2))
class DummySurrogateModel(SurrogateModel):
    ## TODO - add dummy regressor/classifier

    def __init__(self, configuration, controller, fitness):
        # The parent constructor expects (fitness, configuration, controller)
        super(DummySurrogateModel, self).__init__(fitness, configuration,
                                                  controller)
        self.controller = controller
        self.regressor = Regressor(controller, configuration)
        self.classifier = Classifier()

    def get_regressor(self):
        return self.regressor

    def get_classifier(self):
        return self.classifier

    def predict(self, particles):
        MU, S2 = self.regressor.predict(particles)
        return self.classifier.predict(particles), MU, S2

    def train(self, hypercube):
        self.was_trained = True
        return True

    def model_particle(self, particle):
        return 0, 0, 0

    def contains_training_instance(self, part):
        return False

    def model_failed(self, part):
        return False

    def get_state_dictionary(self):
        return {}

    def set_state_dictionary(self, dict):
        pass

    def get_copy(self):
        # The constructor needs all three arguments to build a copy
        model_copy = DummySurrogateModel(self.configuration, self.controller,
                                         self.fitness)
        return model_copy
def prepare_dataset():
    '''Convert input .txt or .csv into a .csv file with all the necessary
    columns for training and testing classification models.'''
    # # experimental work done on first, small dataset.
    # utils.extract_transcripts_from_pdfs()
    # utils.learn_embeddings_from_transcipts()

    # load titles file into dataframe
    df_all = pd.DataFrame()
    for fileinfo in settings.DATASET_FILES:
        if not (fileinfo['can_train'] or fileinfo['can_test']):
            continue
        titles_path = utils.get_data_path('in', fileinfo['filename'])
        if not os.path.exists(titles_path):
            utils.log_error(
                'The training file ({0}) is missing. See README.md for more info.'
                .format(titles_path))
        df = utils.read_df_from_titles(titles_path,
                                       use_full_text=settings.FULL_TEXT)
        for flag in ['can_train', 'can_test']:
            df[flag] = fileinfo[flag]
        df_all = df_all.append(df, ignore_index=True)

    # save that as a csv
    df_all.to_csv(
        titles_out_path,
        columns=['id', 'cat1', 'cat2', 'title', 'can_train', 'can_test'],
        index=False)

    # normalise the title
    classifier = Classifier.from_name(settings.CLASSIFIER, None)
    df_all['titlen'] = df_all['title'].apply(lambda v: classifier.tokenise(v))
    classifier.release_resources()

    return df_all
def run_fold(data, i, results):
    """
    Used in main function to run each fold
    :param data: the dictionary with all the configurations
    :param i: the fold number
    :param results: the dictionary containing the results for each classifier
    :return: the results dictionary
    """
    print("Running fold: ", i)
    start_time = utils.get_datetime()
    fold = "fold" + str(i)

    print("Reading and converting arff files")
    # use a converter for arff files to get pandas DataFrames
    train_df, test_df = dfc.convert_arff_to_dataframe(data, fold)
    print("Got train and test dataframes")

    print("Creating numpy arrays")
    # separate labels from features and replace labels with numbers
    train_labels, train_features, train_labels_dict = dfc.get_features_labels_arrays(
        train_df)
    test_labels, test_features, test_labels_dict = dfc.get_features_labels_arrays(
        test_df)
    num_classes = len(train_labels_dict)
    print("Got labels and features for train and test datasets")

    print("Classifying")
    for classifier in data["classifiers"]:
        # for each classifier specified in the configuration file, execute the
        # classification task and return the confusion matrix
        confusion_matrix = Classifier.classify(data, classifier, num_classes,
                                               train_labels, train_features,
                                               test_labels, test_features)
        # get micro/macro precision, recall and F-Measure for the current fold
        results = write_results_to_file(data, fold, classifier,
                                        confusion_matrix, test_labels_dict,
                                        results)

    time_needed = utils.elapsed_str(start_time, up_to=None)
    print("Time needed to run fold ", str(i), " is ", time_needed)
    return results
def data_split_examine(clf):
    '''
    The function calculates evaluation metrics like f1_score, accuracy,
    precision and recall for various test data sizes.

    Parameters:
        clf : name of a Classifier method used to train a model

    Return:
        void
    '''
    model = Classifier()
    for index in range(len(test_sizes)):
        x, y = get_x_y()
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=test_sizes[index])
        classifier = getattr(model, clf)(x_train, y_train)
        accuracy, precision, recall, f_score, _ = evaluate(
            classifier, x_test, y_test)
        train = round((1 - test_sizes[index]) * 100)
        test = round(test_sizes[index] * 100)
        df.loc[index + 1] = [train, test, accuracy * 100,
                             precision, recall, f_score]
    display(df)
import secrets
import json
import threading
import concurrent.futures
from logging import DEBUG

import numpy as np
import pandas as pd
from flask import Flask
from sklearn.preprocessing import MinMaxScaler

from extract_graph_features import GraphFeatures
from preprocessing import Preprocess
from classifiers import Classifier
from sample import Sample
from modelEvaluation import ModelEvaluation

gf = GraphFeatures
pre = Preprocess()
classifier = Classifier()
sample = Sample()
me = ModelEvaluation()

app = Flask(__name__)
app.logger.setLevel(DEBUG)
app.config['SECRET_KEY'] = secrets.token_urlsafe(16)


def classification(cid, mid, ml, x_train, x_test, y_train, y_test, features):
    # Classify here
    clf = {'lr': classifier.logistic_regression,
           'dt': classifier.decision_tree_classifier,
           'rf': classifier.random_forest,
           'svm': classifier.svm,
           'xgb': classifier.xg_boost,
           'nn': classifier.neural_net}

    # removing customer id before classification; unwanted
    # x_train_ip = x_train.drop(['customer'], axis = 1)

    model = clf[ml](x_train, y_train)
def results(X_train, y_train, X_test, y_test, features="binary", D_in=200):
    print("\n > Logistic Regression: ")
    # performs logistic regression
    log_reg = Classifier(X_train, y_train, model="log_reg")
    # determines the parameters used in the grid search
    hyperparams = {'C': [0.01, 1, 100], 'penalty': ['l1', 'l2']}
    # picks the best possible model using grid search
    log_reg.grid_search(hyperparams)
    # fully train the best model
    log_reg.fit()
    # tests the accuracy of the model
    log_reg.score(X_test, y_test)

    print("\n > Linear SVM: ")
    # performs linear SVM
    Linear_SVM = Classifier(X_train, y_train, model="Linear_SVM")
    # determines the parameters used in the grid search
    hyperparams = {'C': [0.01, 1, 100]}
    # picks the best possible model using grid search
    Linear_SVM.grid_search(hyperparams)
    # fully train the best model
    Linear_SVM.fit()
    # tests the accuracy of the model
    Linear_SVM.score(X_test, y_test)

    if features == "binary":
        print("\n > Bernoulli Naive Bayes SVM: ")
        # performs Bernoulli Naive Bayes SVM
        Bernoulli_NBSVM = Classifier(X_train, y_train, model="Bernoulli_NBSVM")
        # determines the parameters used in the grid search
        hyperparams = {'C': [0.01, 1, 100], 'beta': [0.25, 0.5, 0.75]}
        # picks the best possible model using grid search
        Bernoulli_NBSVM.grid_search(hyperparams)
        # fully train the best model
        Bernoulli_NBSVM.fit()
        # tests the accuracy of the model
        Bernoulli_NBSVM.score(X_test, y_test)

    if features == "sentence_embed":
        print("\n > Feedforward NN:")
        # performs feedforward NN
        feedforward_NN = Classifier(X_train, y_train, "feedforward_NN", D_in)
        # determines the parameters used in the grid search
        # hyperparams = {'batch_size': [128, 256, 512], 'epochs': [10, 20, 50]}
        # picks the best possible model using grid search
        # feedforward_NN.grid_search(hyperparams)
        # fully train the best model
        feedforward_NN.fit()
        # tests the accuracy of the model
        feedforward_NN.score(X_test, y_test)

        print("\n > Gaussian Naive Bayes: ")
        # performs Gaussian Naive Bayes
        Gaussian_NB = Classifier(X_train, y_train, model="Gaussian_NB")
        # determines the parameters used in the grid search
        hyperparams = {
            'priors': [None, (0.25, 0.75), (0.5, 0.5), (0.75, 0.25)]
        }
        # picks the best possible model using grid search
        Gaussian_NB.grid_search(hyperparams)
        # fully train the best model
        Gaussian_NB.fit()
        # tests the accuracy of the model
        Gaussian_NB.score(X_test, y_test)
        return (log_reg, Linear_SVM, Gaussian_NB)
    else:
        print("\n > Multinomial Naive Bayes: ")
        # performs Multinomial Naive Bayes
        Multinomial_NB = Classifier(X_train, y_train, model="Multinomial_NB")
        # determines the parameters used in the grid search
        hyperparams = {
            'class_prior': [None, (0.25, 0.75), (0.5, 0.5), (0.75, 0.25)]
        }
        # picks the best possible model using grid search
        Multinomial_NB.grid_search(hyperparams)
        # fully train the best model
        Multinomial_NB.fit()
        # tests the accuracy of the model
        Multinomial_NB.score(X_test, y_test)
        return (log_reg, Linear_SVM, Multinomial_NB)
from classifiers import Classifier

classifier = Classifier([1], ['+', '-'], 'english')
# data = numpy.asarray([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]])
directory_of_file = 'polarityData/rt-polaritydata/rt-polarity-neg.txt'
classifier.read_text_file(directory_of_file, '-')
directory_of_file = 'polarityData/rt-polaritydata/rt-polarity-pos.txt'
classifier.read_text_file(directory_of_file, '+')
classifier.apply_feature_set()  # [1,2], [3]
sets = classifier.train_test_split(0.01)
# sets = [(classifier.all_feature_sets, sets[0][1])]  # for extract model
classifier.set_patience(300)

# perceptron
k = 1
for train, test in sets:
    if '$' in classifier.selected_split_name:
        classifier.selected_split_name = str(
            k) + classifier.selected_split_name[1:]
    k += 1
    classifier.create_dictionary(train)
    classifier.save_word_dictionary()
    # print(classifier.hash_dictionary)
    for lr in [0.6]:  # [0.1, 0.2, 0.25, 0.4, 0.6, 0.75, 0.8, 1]
        classifier.create_weights_array(1)
        print('{} \tLearning Rate= {}\tSplit Set: {}'.format(
            'perceptron', lr, classifier.selected_split_name))
        classifier.train(
            train,
            test,
class HyperparameterTuner(object):
    def __init__(self, sess, hidden_layers, hidden_units, num_perms, trials,
                 epochs):
        self.hidden_layers = hidden_layers
        self.hidden_units = hidden_units
        self.num_perms = num_perms
        self.epochs = epochs
        self.task_list = self.create_permuted_mnist_task(num_perms)
        self.trial_learning_rates = [
            PRNG.uniform(1e-4, 1e-3) for _ in range(0, trials)
        ]
        self.best_parameters = []
        self.sess = sess
        self.classifier = Classifier(
            num_class=10,
            num_features=784,
            fc_hidden_units=[hidden_units for _ in range(hidden_layers)],
            apply_dropout=True)

    def search(self):
        for t in range(0, self.num_perms):
            queue = PriorityQueue()
            for learning_rate in self.trial_learning_rates:
                self.train_on_task(t, learning_rate, queue)
            self.best_parameters.append(queue.get())
        self.evaluate()

    def evaluate(self):
        accuracies = []
        for parameters in self.best_parameters:
            accuracy = self.classifier.test(
                sess=self.sess,
                model_name=parameters[1],
                batch_xs=self.task_list[0].test.images,
                batch_ys=self.task_list[0].test.labels)
            accuracies.append(accuracy)
        print(accuracies)

    def train_on_task(self, t, lr, queue):
        model_name = self.file_name(lr, t)
        dataset_train = self.task_list[t].train
        dataset_lagged = self.task_list[t - 1] if t > 0 else None
        model_init_name = self.best_parameters[t - 1][1] if t > 0 else None
        self.classifier.train(sess=self.sess,
                              model_name=model_name,
                              model_init_name=model_init_name,
                              dataset=dataset_train,
                              dataset_lagged=dataset_lagged,
                              num_updates=(55000 // MINI_BATCH_SIZE) * self.epochs,
                              mini_batch_size=MINI_BATCH_SIZE,
                              log_frequency=LOG_FREQUENCY,
                              fisher_multiplier=1.0 / lr,
                              learning_rate=lr)
        accuracy = self.classifier.test(
            sess=self.sess,
            model_name=model_name,
            batch_xs=self.task_list[0].validation.images,
            batch_ys=self.task_list[0].validation.labels)
        queue.put((-accuracy, model_name))

    def create_permuted_mnist_task(self, num_datasets):
        mnist = read_data_sets("MNIST_data/", one_hot=True)
        task_list = [mnist]
        for seed in range(1, num_datasets):
            task_list.append(self.permute(mnist, seed))
        return task_list

    @staticmethod
    def permute(task, seed):
        np.random.seed(seed)
        perm = np.random.permutation(task.train._images.shape[1])
        permuted = deepcopy(task)
        permuted.train._images = permuted.train._images[:, perm]
        permuted.test._images = permuted.test._images[:, perm]
        permuted.validation._images = permuted.validation._images[:, perm]
        return permuted

    def file_name(self, lr, t):
        return 'layers=%d,hidden=%d,lr=%.5f,multiplier=%.2f,mbsize=%d,epochs=%d,perm=%d' \
               % (self.hidden_layers, self.hidden_units, lr, 1 / lr,
                  MINI_BATCH_SIZE, self.epochs, t)
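# Hypothetical usage sketch for HyperparameterTuner under TensorFlow 1.x. It assumes the
# module-level names used above (PRNG, MINI_BATCH_SIZE, LOG_FREQUENCY, read_data_sets)
# are defined, and every hyperparameter value here is illustrative only.
import tensorflow as tf

with tf.Session() as sess:
    tuner = HyperparameterTuner(sess,
                                hidden_layers=2,
                                hidden_units=50,
                                num_perms=3,
                                trials=5,
                                epochs=2)
    # search() trains on each permuted-MNIST task, keeps the best model per task,
    # and finally reports accuracy of those models on the first task's test set.
    tuner.search()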
""" This file maps the evaluation metrics for various splits in the K-fold space""" import numpy as np import pandas as pd import matplotlib.pyplot as plt from sklearn.model_selection import KFold from classifiers import Classifier from evaluation import evaluate from dataloader import get_X_y from IPython.display import HTML """ for now KNeighbors will be used as it gave the highest accuracy """ model = Classifier() columns = ['Accuracy %','Precision %','Recall','F1_Score'] df = pd.DataFrame(columns = ['K_Fold']+columns) """ Cross Validation using K-Fold """ def KFold_validation(kf, split_no): X, y = get_X_y() # kf.get_n_splits(X) for train, test in kf.split(X,y): X_train, X_test = X.iloc[train], X.iloc[test] y_train, y_test = y[train], y[test] classifier = model.KNeighbors(X_train, y_train) accuracy, precision, recall,f_score, y_score = evaluate(classifier, X_test, y_test) return accuracy*100, precision*100, recall,f_score """ Evaluation metric for different K-folds are shown in tabular format """ def tabulate_kfold():
from classifiers import Classifier
from langdetect import detect

languages = {'en': 'english', 'fa': 'persian', 'ar': 'persian'}

textfa = '''عالیه!!'''  # Persian for "Great!!"
texten = '''though excessively tiresome , the uncertainty principle , as verbally pretentious as the title may be , has its handful of redeeming features , as long as you discount its ability to bore .'''

lang = detect(textfa)
print(lang)
classifier = Classifier([1], ['+', '-', '='], languages[lang])
# first read dictionary and weights from cache if not read from file
# classifier.hash_dictionary[languages[lang]] = cache[lang]
classifier.load_word_dictionary()
classifier.load_model_npy('perceptron-100-UNIGRAMS-0.4-persian')
# print(classifier.hash_dictionary)
print(classifier.predict_one(textfa))

lang = detect(texten)
print(lang)
classifier = Classifier([1], ['+', '-'], languages[lang])
# first read dictionary and weights from cache if not read from file
# classifier.hash_dictionary[languages[lang]] = cache[lang]
classifier.load_word_dictionary()
classifier.load_model_npy(
    'perceptron-128-UNIGRAMS-0.6_test_size-0.01_random_state-0_shuffle-english')
# Creating bag of words
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, -1].values

# Splitting of dataset
X_train, X_test, y_train, y_test = Preprocessing.datasplit(X, y,
                                                           test_size=0.1,
                                                           random_state=0)

# Classifiers
classifierRFC = Classifier.RFC(X_train, y_train, n_estimators=13,
                               criterion='entropy')
classifierkNN = Classifier.kNN(X_train, y_train, n_neighbors=8,
                               metric='minkowski')
classifierLR = Classifier.LR(X_train, y_train)
classifierGaussNB = Classifier.GaussNB(X_train, y_train)
classifierDTC = Classifier.DTC(X_train, y_train, criterion='entropy')
classifierSVM = Classifier.SuppVM(X_train, y_train, kernel='rbf')

# Prediction using the test set
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

""" Random Forest Classifier """
y_predRFC = classifierRFC.predict(X_test)
cm_RFC = confusion_matrix(y_test, y_predRFC)
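# Hypothetical follow-up to the block above: the remaining metrics imported from
# sklearn.metrics can be applied to the same predictions. The 'weighted' averaging is
# an assumption for the multi-class case; drop it for binary labels.
acc_RFC = accuracy_score(y_test, y_predRFC)
prec_RFC = precision_score(y_test, y_predRFC, average='weighted')
rec_RFC = recall_score(y_test, y_predRFC, average='weighted')
f1_RFC = f1_score(y_test, y_predRFC, average='weighted')
print('RFC:', acc_RFC, prec_RFC, rec_RFC, f1_RFC)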
def train_keras_classifier():
    from cifar10_classifier import Classifier
    from keras.optimizers import SGD
    from keras import backend as K

    batch_size = 128
    epochs = 50
    data_augmentation = True

    opt = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    with tf.variable_scope('conv') as scope:
        model = Classifier().model
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])

    # load data and start training
    if data_augmentation:
        print('Using real-time data augmentation.')
        datagen, (x_train, y_train), (x_test, y_test) = \
            data_loader.load_augmented_data()
        model.fit_generator(datagen.flow(x_train, y_train, batch_size=batch_size),
                            epochs=epochs,
                            validation_data=(x_test, y_test),
                            workers=4,
                            callbacks=[SGDLearningRateTracker()])
    else:
        print('Not using data augmentation.')
        (x_train, y_train), (x_test, y_test) = data_loader.load_original_data()
        model.fit(x_train, y_train,
                  batch_size=batch_size,
                  epochs=epochs,
                  validation_data=(x_test, y_test),
                  shuffle=True)

    # save as tensorflow model
    if not os.path.isdir(SAVE_DIR):
        os.makedirs(SAVE_DIR)
    model.save(CLASSIFIER_PATH)

    from keras.backend import get_session
    sess = get_session()
    saver = tf.train.Saver()
    saver.save(sess, PRETRAINED_PATH)
    print('Saved trained model at %s ' % (PRETRAINED_PATH))

    # evaluate on test set
    scores = model.evaluate(x_test, y_test, verbose=1)
    print('Test loss:', scores[0])
def run_classifier(self):
    classifier_out_dir = self.clinvar_ml_out_dir + '/' + '_'.join(
        self.genomic_classes)
    if not os.path.exists(classifier_out_dir):
        os.makedirs(classifier_out_dir)

    # @anchor-1
    #out_models_dir = self.base_out_models_dir + '/' + 'intergenic_utr_lincrna_ucne_vista'  #+ '_'.join(self.genomic_classes)
    out_models_dir = self.base_out_models_dir + '/' + '_'.join(
        self.genomic_classes)
    """
    -- JARVIS:
    > "intergenic_utr_lincrna_ucne_vista" is the best model for 'utr'--D3000-struct (0.675)
    > "intergenic_utr_lincrna_ucne_vista" is the best model for 'intergenic,utr'--D3000-struct (0.649)
    > "ccds" is (probably) the best model for 'ccds'--D3000-struct (0.565)
    """
    if not os.path.exists(out_models_dir):
        os.makedirs(out_models_dir)

    classifier = Classifier(
        self.Y_label,
        classifier_out_dir,
        out_models_dir,
        base_score=self.base_score,
        model_type=self.model_type,
        use_only_base_score=self.use_only_base_score,
        include_vcf_extracted_features=self.include_vcf_extracted_features,
        exclude_base_score=self.exclude_base_score,
        use_pathogenicity_trained_model=use_pathogenicity_trained_model,
        use_conservation_trained_model=use_conservation_trained_model,
        predict_on_test_set=predict_on_test_set)

    classifier.preprocess_data(self.df)
    #print(self.df.info())

    # --- Get correlations between features ---
    features_df = self.df.drop([
        'chr', 'start', 'end', 'clinvar_annot', 'common_variants',
        'common_vs_all_variants_ratio', 'all_variants', 'mean_ac', 'mean_af',
        'bin_1', 'bin_2', 'bin_3', 'bin_4', 'bin_5', 'bin_6'
    ], axis=1)
    #print(features_df.corr())

    fig, ax = plt.subplots(figsize=(19, 15))
    corr = features_df.corr()
    sns_plot = sns.heatmap(corr,
                           cmap=plt.cm.RdBu,
                           linecolor='white',
                           linewidths=0.1,
                           square=True,
                           ax=ax,
                           vmax=1.0,
                           vmin=-1.0)
    sns_plot.get_figure().savefig(self.clinvar_ml_out_dir +
                                  '/Feature-correlation_matrix.' +
                                  '_'.join(self.genomic_classes) + '.pdf',
                                  format='pdf',
                                  bbox_inches='tight')
    # ---------------------------------------

    classifier.init_model()
    classifier.run_classification_with_cv()

    self.score_print_name = classifier.score_print_name
    self.mean_tpr = classifier.mean_tpr
    self.mean_fpr = classifier.mean_fpr
    self.mean_auc = classifier.mean_auc
    self.metrics_list = classifier.metrics_list
    self.y_label_lists = classifier.y_label_lists
    self.y_proba_lists = classifier.y_proba_lists

    plt.close()
def cal_point_type(x_train, x_val, y_train, y_val):
    """
    Calculates the misclassified values of each classifier

    Parameters:
        x_train: array-like, shape (n_train_samples, n_features)
        x_val: array-like, shape (n_val_samples, n_features)
        y_train: of length n_train_samples
        y_val: of length n_val_samples

    Returns:
        outliers: array-like
        point_score: dictionary
        easy_points: array-like
    """
    # list of all classifiers
    classifiers = [
        "KNeighbors",
        "Random_Forest",
        "svm_classifier",
        "Gaussian",
        "Decision_Tree",
        "Logistic_Reg",
    ]
    model = Classifier()

    # create dictionaries to store misclassified values for each classifier
    err_indexes = {}
    correct_indexes = {}
    wt = {}
    for clf in classifiers:
        train_clf = getattr(model, clf)(x_train, y_train)
        y_score = train_clf.predict(x_val)
        # get the indexes of misclassified values
        err_indexes[clf] = np.where(y_score != y_val)
        correct_indexes[clf] = np.where(y_score == y_val)
        # associate a weight to each model, based on its accuracy
        acc = accuracy_score(y_val, y_score)
        wt[clf] = 1 / (1 - np.power(acc, 2))

    # calculating outliers
    outliers = err_indexes["KNeighbors"]
    for clf in classifiers:
        outliers = np.intersect1d(outliers, err_indexes[clf])

    # calculating points with trivial info: the points misclassified by a model with
    # high accuracy are a subset of the points misclassified by models with lower accuracy
    # print('Points associated with each model :', wt)

    # classified correctly by k-nn but not by random forest
    s1 = wt["KNeighbors"] - wt["Random_Forest"]
    pt1 = np.intersect1d(correct_indexes["KNeighbors"],
                         err_indexes["Random_Forest"])
    # classified correctly by random forest but not by decision tree
    s2 = wt["Random_Forest"] - wt["Decision_Tree"]
    pt2 = np.intersect1d(correct_indexes["Random_Forest"],
                         err_indexes["Decision_Tree"])
    # classified correctly by decision tree but not by logistic regression
    s3 = wt["Decision_Tree"] - wt["Logistic_Reg"]
    pt3 = np.intersect1d(correct_indexes["Decision_Tree"],
                         err_indexes["Logistic_Reg"])
    # classified correctly by logistic regression but not by Gaussian
    s4 = wt["Logistic_Reg"] - wt["Gaussian"]
    pt4 = np.intersect1d(correct_indexes["Logistic_Reg"],
                         err_indexes["Gaussian"])
    # classified correctly by Gaussian but not by svm
    s5 = wt["Gaussian"] - wt["svm_classifier"]
    pt5 = np.intersect1d(correct_indexes["Gaussian"],
                         err_indexes["svm_classifier"])

    point_score = {
        "p1": (s1, pt1),
        "p2": (s2, pt2),
        "p3": (s3, pt3),
        "p4": (s4, pt4),
        "p5": (s5, pt5),
    }

    # calculating easy points
    easy_points = correct_indexes["KNeighbors"]
    for clf in classifiers:
        easy_points = np.intersect1d(easy_points, correct_indexes[clf])

    return outliers, point_score, easy_points
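# Small numeric illustration of the weighting used above, wt = 1 / (1 - accuracy**2):
# more accurate classifiers receive sharply larger weights, which is what makes the
# pairwise differences s1..s5 meaningful. The accuracy values are illustrative only.
import numpy as np

for acc in (0.70, 0.90, 0.99):
    print(acc, round(1 / (1 - np.power(acc, 2)), 2))  # ~1.96, ~5.26, ~50.25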
def run(config, num_batches, batch_size, model_name, class_model_name,
        ofile, threshold, num_workers, epochs, multi_gans, gan_weights,
        trunc_norm, fixed_dset, transform, filter_samples):

    # Instantiating generator
    config['G_batch_size'] = batch_size
    generator = GeneratorWrapper(config, model_name, trunc_norm,
                                 multi_gans, gan_weights)
    generator_fn = generator.gen_batch

    if gan_weights:
        print('Using GAN weights (multi-GAN setting): ', str(gan_weights))

    # Instantiating filtering classifier
    if filter_samples:
        print('Using ResNet20 weights: %s.pth' % class_model_name)
        filter_net = Classifier('resnet20', config['n_classes'])
        filter_net.load(class_model_name)
        filter_fn = filter_net.filter
    else:
        filter_fn = None

    # Creating a filtered loader using the classifier
    num_classes = config['n_classes']
    loader = FilteredLoader(generator_fn,
                            filter_fn,
                            num_classes,
                            num_batches,
                            batch_size,
                            threshold,
                            num_workers,
                            fixed_dset,
                            transform)

    print('Training using %d generated images per epoch'
          % loader.train_length())

    # Creating a blank ResNet
    net = resnet20(config['n_classes'], width=64).to('cuda')

    # Initializing loss function, optimizer and learning rate scheduler
    cross_entropy = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=0.1,
                          momentum=0.9, weight_decay=0.0001)
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                               milestones=[100, 150])

    # Evaluating the model on the test set
    test_loader = utils.make_test_loader(config['dataset'],
                                         batch_size,
                                         transforms.Normalize(*utils.norm_vals))

    # Training the model
    t1 = utils.ctime()
    best_acc = 0.0
    for epoch in range(epochs):
        print('Epoch: %3d' % (epoch + 1), end=" ")
        train(net, loader, batch_size, optimizer, cross_entropy)
        scheduler.step()

        acc = evaluate(net, test_loader)
        best_acc = max(acc, best_acc)
        loader.reset()

        print('Val acc: %4.2f %% ' % acc,
              ' | Best acc: %4.2f %%\n' % best_acc)

    tt = utils.ctime() - t1
    print('Finished training, total time: %4.2fs' % tt)
    print('Best accuracy achieved: %4.5f %%' % best_acc)

    # Saving output model
    output = './output/%s.pth' % ofile
    print('Saving trained classifier in %s' % output)
    torch.save(net.state_dict(), output)
def main(argv):
    # fake_scores = fake_score_generator(death_dataset)
    # death_dataset = add_feature(death_dataset, fake_scores)
    # names = {}
    # names_dict = create_name_dict(names_list)
    # filename = 'death'
    # names_list = get_file_names(filename)

    count_flag = False
    gender_flag = False
    proximity_flag = False
    tf_idf_flag = False
    graph_flag = False
    run_file_flag = False
    ablation_flag = False
    best_subset_flag = False

    books_inverted_index = Index()
    # do a fresh indexing and save
    # books_inverted_index.add_all_books()
    # save_index(books_inverted_index)
    # or load from previous indexing
    if len(sys.argv) > 1:
        for arg in sys.argv[1:]:
            if arg == 'index_books':
                # do a fresh indexing and save
                books_inverted_index.add_all_books()
                save_index(books_inverted_index)
            elif arg == 'load_books':
                print("loading books directly as inverted_index object into the program")
                books_inverted_index = load_index()
            elif arg == 'count_features':
                count_flag = True
            elif arg == 'gender_feature':
                gender_flag = True
            elif arg == 'proximity_feature':
                proximity_flag = True
            elif arg == 'tf_idf':
                tf_idf_flag = True
            elif arg == 'ablation':
                ablation_flag = True
            elif arg == 'best_subset':
                best_subset_flag = True
            elif arg == 'graph_feature':
                graph_flag = True
                graph = Graph()
            elif arg == 'all_features':
                count_flag = True
                gender_flag = True
                proximity_flag = True
                tf_idf_flag = True
                graph_flag = True
                graph = Graph()
            elif arg == 'run_file':
                run_file_flag = True
            elif arg == 'quick':
                books_inverted_index = load_index()
                count_flag = True
                gender_flag = True
                proximity_flag = True
                tf_idf_flag = False
                graph_flag = False
                graph = Graph()
            else:
                sys.exit("Wrong usage!")
    else:
        books_inverted_index = load_index()
        count_flag = True
        gender_flag = True
        proximity_flag = True
        tf_idf_flag = True
        graph_flag = True
        graph = Graph()

    classifier = Classifier()
    classifier.read_separate_train_test_files(evaluate=True)
    # classifier.split_data()

    # reading names for training and test sets
    training_names = classifier.get_names(training=True)
    test_names = classifier.get_names(test=True)

    # creating features for the training set
    features_index, training_features = create_features(
        training_names, books_inverted_index, graph, count_flag, gender_flag,
        proximity_flag, tf_idf_flag, graph_flag)
    # creating features for the test set
    features_index, test_features = create_features(
        test_names, books_inverted_index, graph, count_flag, gender_flag,
        proximity_flag, tf_idf_flag, graph_flag)

    classifier.set_features(training_features, test_features)
    classifier.save_features()

    y_pred_log = classifier.logistic_regression()
    # classifier.svc_polynomial()
    # classifier.svc_guassian_kernel()
    y_pred_svc = classifier.svc_sigmoid()
    y_pred_dt = classifier.decision_tree()
    y_pred_knn = classifier.k_nearest_neighbors()
    y_pred_nb = classifier.naive_base()

    # create the run file out of the knn's results
    if run_file_flag:
        classifier.make_new_run_file(y_pred_dt, 'dt')
        classifier.make_new_run_file(y_pred_log, 'logit')
        classifier.make_new_run_file(y_pred_svc, 'svc')
        classifier.make_new_run_file(y_pred_knn, 'knn')
        classifier.make_new_run_file(y_pred_nb, 'naive')

    # classifier.feature_selection()
    classifier.plot_f1_scores(classifier.method_name,
                              classifier.f_scores,
                              plot_title='Death Prediction',
                              file_name='f1_scores')

    y_pred_list = [y_pred_log, y_pred_svc, y_pred_dt, y_pred_knn, y_pred_nb]
    classifier.plot_with_error_bars('death', y_pred_list,
                                    classifier.method_name,
                                    'Death Prediction', 'death_fscore_error')

    if gender_flag:
        gender_training_features = training_features[2]
        gender_test_features = test_features[2]
        classifier.evaluate_gender_prediction(gender_training_features,
                                              gender_test_features,
                                              print_flag=True)

    if ablation_flag:
        ablation_test(classifier, features_index, training_features,
                      test_features)

    if best_subset_flag:
        best_subset_selection(classifier, training_features, test_features)
from classifiers import Classifier

classifier = Classifier([1], ['+', '-', '='], 'persian')
directory = 'Persian_Comments/'
classifier.read_excel_file(directory, ['comment', 'orientation'])
classifier.apply_feature_set()  # [1,2], [3]
sets = classifier.train_test_split(0.005)
# sets = [(classifier.all_feature_sets, classifier.all_feature_sets[0])]  # for extract model
classifier.set_patience(300)

# perceptron
k = 1
for train, test in sets:
    if '$' in classifier.selected_split_name:
        classifier.selected_split_name = str(
            k) + classifier.selected_split_name[1:]
    k += 1
    classifier.create_dictionary(train)
    classifier.save_word_dictionary()
    for lr in [0.1, 0.2, 0.25, 0.4, 0.6, 0.75, 0.8, 1]:
        classifier.create_weights_array(1)
        print('{} \tLearning Rate= {}\tSplit Set: {}'.format(
            'perceptron', lr, classifier.selected_split_name))
        classifier.train(
            train,
            test,
            classifier='perceptron',
def train_classifier():
    from classifiers import Classifier

    max_epoch = 50
    batch_size = 128
    imgsize = 32
    weight_decay = 0  # disabled
    num_classes = 10
    data_augmentation = False

    print('WARNING: data augmentation not implemented. ' + \
          'For better model performance, please use train_keras_classifier instead')
    response = raw_input('Do you wish to continue? (y/N)')
    if response.lower() not in ['y', 'yes']:
        return

    if data_augmentation:
        pass
    else:
        (x_train, y_train), (x_test, y_test) = data_loader.load_original_data()
        data_train = tf.data.Dataset.from_tensor_slices((x_train, y_train))
        data_train = data_train.shuffle(50000).repeat().batch(batch_size)
        iter_train = data_train.make_initializable_iterator()

    x = tf.placeholder(tf.float32, [batch_size, imgsize, imgsize, 3])
    y_ = tf.placeholder(tf.float32, [batch_size, num_classes])

    regularizer = tf.contrib.layers.l2_regularizer(scale=weight_decay)
    with tf.variable_scope('conv') as scope:
        model = Classifier(x, regularizer, expand_dim=False)

    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(logits=model.logits,
                                                   labels=y_))
    reg_loss = tf.losses.get_regularization_loss()
    loss += reg_loss
    eval_acc = accuracy(model.logits, y_)

    optimizer = tf.train.MomentumOptimizer(learning_rate=0.01,
                                           momentum=0.9,
                                           use_nesterov=True)
    optim_step = optimizer.minimize(loss=loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(iter_train.initializer)
    next_batch = iter_train.get_next()

    for n_epoch in range(max_epoch):
        for i in range(50000 / batch_size):
            batch = sess.run(next_batch)
            _, acc_val, loss_val = sess.run([optim_step, eval_acc, loss],
                                            feed_dict={
                                                x: batch[0],
                                                y_: batch[1]
                                            })
            if i % 100 == 0:
                print("Epoch: %d, Step: %d, Acc: %f, Loss: %f" %
                      (n_epoch, i, acc_val, loss_val))

        acc_avg = loss_avg = 0
        test_batch_num = len(y_test) / batch_size
        # validate on test set
        for i in range(test_batch_num):
            acc_val, loss_val = sess.run(
                [eval_acc, loss],
                feed_dict={
                    x: x_test[i * batch_size:(i + 1) * batch_size],
                    y_: y_test[i * batch_size:(i + 1) * batch_size]
                })
            acc_avg += acc_val
            loss_avg += loss_val
        print('Test accuracy: %f, loss: %f' %
              (acc_avg / test_batch_num, loss_avg / test_batch_num))

    saver = tf.train.Saver()
    saver.save(sess, CLASSIFIER_PATH)
    print('Saved trained model at %s ' % CLASSIFIER_PATH)