Exemple #1
0
class StateWikiClassifier():
  """Text classifier fitted on the Wikipedia training rows of the database.

  Constructing an instance reads the training set through ``DataManager``,
  fits the "tfidf" vectorizer on it and trains a ``Classifier`` created with
  ``classifier="nb"``, so the whole training run happens in ``__init__``.
  """
  DATABASE = "us_twitter.db"

  def __init__(self):
    """Load the training data, vectorize it and fit the classifier."""
    manager = DataManager(self.DATABASE)
    tweets, labels = manager.select_wikipedia_train()
    self.train_tweets = tweets
    self.train_labels = labels
    self.vectorizer = get_vectorizer("tfidf", min_df=1)
    self.nb = Classifier(classifier="nb")
    self.train_data = self.vectorizer.fit_transform(self.train_tweets)
    self.nb.fit(self.train_data, self.train_labels)

  def predict(self, text):
    """Classify one text.

    The text is lower-cased before being vectorized. Returns a tuple of the
    predicted label and the ``FIPS_DEFINITIONS`` entry for that label.
    """
    features = self.vectorizer.transform([text.lower()])
    label = self.nb.predict(features)[0]
    return label, FIPS_DEFINITIONS[label]
Exemple #2
0
def main(argv):
    """Parse the command line, build the classifier and print the digit sum.

    :param argv: command-line arguments without the program name.

    Recognised options:
        -h / --help        print the help text and exit
        -i / --image       path to the image
        -c / --classifier  classifier name, lower-cased before use
                           (defaults to 'knn')

    An unknown option prints the usage line and exits with status 2.
    """
    image_path = ''
    clsf = 'knn'

    try:
        # '--help' takes no value, so its long name must not end with '=';
        # with 'help=' getopt rejects a bare '--help' as a usage error.
        opts, _ = getopt.getopt(argv, 'hi:c:',
                                ['help', 'image=', 'classifier='])
    except getopt.GetoptError:
        print('\nusage: python sum_digits.py -i <image>\n')
        sys.exit(2)

    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print('\nusage: python sum_digits.py -i <image>')
            print('\nOptions:')
            print('\t-h [--help],       : Show help')
            print('\t-i [--image],      : Path to the image')
            print('\t-c [--classifier], : Specify classifier. KNN (default) or CNN')
            sys.exit()
        elif opt in ('-i', '--image'):
            image_path = arg
        else:
            # Only -c / --classifier can reach this branch.
            clsf = arg.lower()

    print('\n=> Initializing classifier ...')

    # Initialize classifier, default classifier is kNN
    classifier = Classifier(clsf)

    print('=> Calculating sum ...')
    print('\nSum: %.1f' % sum_digits(image_path, classifier))
Exemple #3
0
class SurrogateModel(object):
    """Base surrogate model that pairs a classifier with a regressor.

    ``train`` must be overridden by subclasses; the remaining methods give
    default behaviour built on ``self.classifier`` and ``self.regressor``.
    """

    def __init__(self, fitness, configuration, controller):
        """Store the fitness/configuration and create the two sub-models."""
        self.configuration = configuration
        self.fitness = fitness
        self.classifier = Classifier()
        self.regressor = Regressor(controller)

    def train(self, pop):
        """Abstract hook: always raises ``NotImplementedError`` here."""
        message = ('SurrogateModel is an abstract class, this '
                   'should not be called.')
        raise NotImplementedError(message)

    def model_particles(self, particles):
        """Return ``(classifier prediction, MU, S2)`` for ``particles``.

        ``MU`` and ``S2`` are the two values returned by the regressor, which
        is queried before the classifier.
        """
        mean, variance = self.regressor.predict(particles)
        labels = self.classifier.predict(particles)
        return labels, mean, variance

    def add_training_instance(self, part, code, fitness):
        """Do nothing in the base class."""
        return None

    def __getstate__(self):
        """Return the instance state used for pickling.

        ``fitness`` and ``configuration`` are left out of the pickled state.
        """
        state = self.__dict__.copy()
        for excluded in ('fitness', 'configuration'):
            # pop() without a default keeps the KeyError on a missing entry
            state.pop(excluded)
        return state
Exemple #4
0
 def __init__(self):
   """Fetch the training set, vectorize it and fit the classifier.

   Reads the rows through ``DataManager(self.DATABASE)``, fits the "tfidf"
   vectorizer on the training texts and trains the ``Classifier`` created
   with ``classifier="nb"`` on the resulting features.
   """
   manager = DataManager(self.DATABASE)
   tweets, labels = manager.select_wikipedia_train()
   self.train_tweets = tweets
   self.train_labels = labels
   self.vectorizer = get_vectorizer("tfidf", min_df=1)
   self.nb = Classifier(classifier="nb")
   self.train_data = self.vectorizer.fit_transform(self.train_tweets)
   self.nb.fit(self.train_data, self.train_labels)
Exemple #5
0
 def __init__(self, sess, hidden_layers, hidden_units, num_perms, trials,
              epochs):
     """Store the search settings, build the task list and the classifier.

     ``trials`` learning rates are drawn with ``PRNG.uniform(1e-4, 1e-3)``;
     the classifier gets ``hidden_layers`` hidden layers of ``hidden_units``
     units each.
     """
     self.hidden_layers = hidden_layers
     self.hidden_units = hidden_units
     self.num_perms = num_perms
     self.epochs = epochs
     self.task_list = self.create_permuted_mnist_task(num_perms)

     # One candidate learning rate per trial.
     sampled_rates = []
     for _ in range(trials):
         sampled_rates.append(PRNG.uniform(1e-4, 1e-3))
     self.trial_learning_rates = sampled_rates

     self.best_parameters = []
     self.sess = sess

     layer_sizes = [hidden_units] * hidden_layers
     self.classifier = Classifier(num_class=10,
                                  num_features=784,
                                  fc_hidden_units=layer_sizes,
                                  apply_dropout=True)
Exemple #6
0
def train(dataSetTrain, dataSetTest, dataSetTest2, dataSetTest3):
    """Build a classifier from ``FLAGS`` and train it on ``dataSetTrain``.

    :param dataSetTrain: training data set passed to ``train_mod``.
    :param dataSetTest: first test set; always evaluated.
    :param dataSetTest2: second test set; only used when
        ``FLAGS.test2_classes`` or ``FLAGS.test3_classes`` is set.
    :param dataSetTest3: third test set; same condition as ``dataSetTest2``.
    """
    # Start an Interactive session
    sess = tf.InteractiveSession()

    # FLAGS.hidden3 == -1 selects the two-hidden-layer network.
    hidden_units = [FLAGS.hidden1, FLAGS.hidden2]
    if FLAGS.hidden3 != -1:
        hidden_units.append(FLAGS.hidden3)
    classifier = Classifier(num_class=10,
                            num_features=784,
                            fc_hidden_units=hidden_units,
                            apply_dropout=True,
                            checkpoint_path=FLAGS.checkpoints_dir)

    print('\nTraining on DataSet started...')
    print('____________________________________________________________')
    print(time.strftime('%X %x %Z'))

    print("Total updates: %s " % ((55000 // FLAGS.batch_size) * FLAGS.epochs))

    # Without extra test classes only the first test set is evaluated.
    if FLAGS.test2_classes is None and FLAGS.test3_classes is None:
        testdatalist = [dataSetTest]
    else:
        testdatalist = [dataSetTest, dataSetTest2, dataSetTest3]

    Classifier.train_mod(
        classifier,
        sess=sess,
        model_name=FLAGS.save_model if FLAGS.save_model is not None else "",
        model_init_name=FLAGS.load_model,
        dataset=dataSetTrain,
        # NOTE(review): this multiplies by FLAGS.epochs twice and differs from
        # the "Total updates" figure printed above - confirm it is intended.
        num_updates=(FLAGS.max_steps * FLAGS.batch_size * FLAGS.epochs //
                     FLAGS.batch_size) * FLAGS.epochs,
        dataset_lagged=[0],
        mini_batch_size=FLAGS.batch_size,
        log_frequency=LOG_FREQUENCY,
        fisher_multiplier=1.0 / FLAGS.learning_rate,
        learning_rate=FLAGS.learning_rate,
        testing_data_sets=testdatalist,
        plot_files=[FLAGS.plot_file, FLAGS.plot_file2, FLAGS.plot_file3],
        start_at_step=FLAGS.start_at_step)
def kfold(orig_data, orig_labels, test_data, clf: classifiers.Classifier):
    """Run leave-one-fold-out validation and keep the most accurate fold.

    ``orig_data`` and ``orig_labels`` are parallel lists with one entry per
    fold. Each fold in turn is held out for evaluation while ``clf`` is fitted
    on the concatenation of the others, and ``test_data`` is predicted after
    every fit.

    Returns ``(evaluation, predictions)`` of the fold with the highest
    ``"Accuracy"``; on a tie the earliest fold wins.
    """
    n_folds = len(orig_data)
    fold_scores = []
    fold_predictions = []
    winner = 0
    for fold in range(n_folds):
        print("Iteration", fold + 1, "over", n_folds)

        # Work on shallow copies so the caller's lists are left untouched.
        remaining_data = orig_data[:]
        remaining_labels = orig_labels[:]
        held_out_data = remaining_data.pop(fold)
        held_out_labels = remaining_labels.pop(fold)

        print("Fitting...")
        clf.fit(np.concatenate(remaining_data),
                np.concatenate(remaining_labels))

        print("Evaluating...")
        score = clf.evaluate(held_out_data, held_out_labels)
        fold_scores.append(score)
        print(score)

        print("Predicting...")
        predicted = clf.predict(test_data)
        fold_predictions.append(predicted)
        print(predicted)

        if score["Accuracy"] > fold_scores[winner]["Accuracy"]:
            winner = fold
    return fold_scores[winner], fold_predictions[winner]
Exemple #8
0
def train_and_test(df, preds, seed):
    '''
    Run a single trial:
        Split df into training and testing subsets
        Train a new model based on the training set
        Test the model with the testing set
        Pass preds to the evaluation so prediction data can be added to it

    :param df: dataframe with full set of all available samples
        columns: id, cat1 (primary class), cat2 (secondary),
        title, titlen (cleaned title)
    :param preds: an array of predictions, each prediction is a dictionary
        cat: true category, pred: predicted category,
        conf: model confidence in its prediction (< 1.0),
        title: actual title of the chapter/sample
    :param seed: seed handed to Classifier.from_name
    :return: tuple (classifier key, value returned by evaluate_model for the
        test set, training dataframe of the classifier)
    '''
    # PREPS
    # randomly split the dataset
    df = utils.split_dataset(
        df,
        settings.CAT_DEPTH,
        settings.TRAIN_PER_CLASS_MIN,
        settings.TEST_PER_CLASS,
        settings.VALID_PER_CLASS,
    )

    # TRAIN
    classifier = Classifier.from_name(settings.CLASSIFIER, seed)
    classifier.set_datasets(df, titles_out_path)
    classifier.train()

    df_test = classifier.df_test

    # Optionally report how well the model fits its own training data.
    if settings.EVALUATE_TRAINING_SET:
        evaluate_model(classifier,
                       classifier.df_train,
                       display_prefix='TRAIN = ')
    accuracy = evaluate_model(classifier,
                              df_test,
                              preds,
                              display_prefix='TEST  = ')
    classifier_key = utils.get_exp_key(classifier)

    classifier.release_resources()

    return classifier_key, accuracy, classifier.df_train
def train_classification_models(X, y, ground_truth) -> None:
    """
    Build a ``Classifier`` for ``config.model`` and report the elapsed time.

    :param X: passed to ``Classifier`` as its second argument.
    :param y: passed to ``Classifier`` as its third argument.
    :param ground_truth: passed to ``Classifier`` as its fourth argument.
    :return: None.
    """
    started_at = time.time()

    # The instance itself is not kept; only constructing it matters here.
    Classifier(config.model, X, y, ground_truth)

    elapsed = time.time() - started_at
    print_runtime(round(elapsed, 2))
class DummySurrogateModel(SurrogateModel):
    """Stand-in surrogate model whose training and query hooks are stubs.

    ``train`` only records that it was called, ``model_particle`` always
    returns zeros and the state-dictionary methods are no-ops, while
    ``predict`` still delegates to the regressor and classifier.
    """

    ## TODO - add dummy regressor/classifier
    def __init__(self, configuration, controller, fitness):
        # Keyword arguments keep each value bound to the right base-class
        # parameter regardless of the positional order the base declares.
        super(DummySurrogateModel, self).__init__(
            configuration=configuration,
            controller=controller,
            fitness=fitness)
        # Kept on the instance so get_copy() can rebuild an equivalent model.
        self.controller = controller
        self.regressor = Regressor(controller, configuration)
        self.classifier = Classifier()

    def get_regressor(self):
        """Return the regressor created in ``__init__``."""
        return self.regressor

    def get_classifier(self):
        """Return the classifier created in ``__init__``."""
        return self.classifier

    def predict(self, particles):
        """Return ``(classifier prediction, MU, S2)`` for ``particles``."""
        MU, S2 = self.regressor.predict(particles)
        return self.classifier.predict(particles), MU, S2

    def train(self, hypercube):
        """Mark the model as trained without doing any work."""
        self.was_trained = True
        return True

    def model_particle(self, particle):
        """Return a fixed ``(0, 0, 0)`` result."""
        return 0, 0, 0

    def contains_training_instance(self, part):
        """Always report that ``part`` is not a training instance."""
        return False

    def model_failed(self, part):
        """Always report that modelling did not fail."""
        return False

    def get_state_dictionary(self):
        """Return an empty state dictionary."""
        return {}

    def set_state_dictionary(self, dict):
        """Ignore the supplied state."""
        pass

    def get_copy(self):
        """Return a new model built from the same constructor arguments."""
        # The constructor needs all three arguments; fitness was missing.
        model_copy = DummySurrogateModel(self.configuration,
                                         self.controller,
                                         self.fitness)
        return model_copy
Exemple #11
0
def prepare_dataset():
    '''Convert input .txt or .csv into a .csv file with all the necessary
    columns for training and testing classification models.

    Every entry of settings.DATASET_FILES flagged can_train or can_test is
    read, tagged with both flags and combined into one dataframe. That
    dataframe is written to titles_out_path, then a normalised 'titlen'
    column is added and the dataframe is returned.
    '''

    # # experimental work done on first, small dataset.
    # utils.extract_transcripts_from_pdfs()
    # utils.learn_embeddings_from_transcipts()

    # load titles files into dataframes
    frames = []
    for fileinfo in settings.DATASET_FILES:
        if not (fileinfo['can_train'] or fileinfo['can_test']):
            continue

        titles_path = utils.get_data_path('in', fileinfo['filename'])

        if not os.path.exists(titles_path):
            utils.log_error(
                'The training file ({0}) is missing. See README.md for more info.'
                .format(titles_path))

        df = utils.read_df_from_titles(titles_path,
                                       use_full_text=settings.FULL_TEXT)
        for flag in ['can_train', 'can_test']:
            df[flag] = fileinfo[flag]
        frames.append(df)

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    # pd.concat rejects an empty list, so fall back to an empty frame.
    df_all = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # save that as a csv
    df_all.to_csv(
        titles_out_path,
        columns=['id', 'cat1', 'cat2', 'title', 'can_train', 'can_test'],
        index=False)

    # normalise the title
    classifier = Classifier.from_name(settings.CLASSIFIER, None)
    df_all['titlen'] = df_all['title'].apply(classifier.tokenise)
    classifier.release_resources()

    return df_all
def run_fold(data, i, results):
    """
    Run every configured classifier on one fold and collect the results.

    :param data: the dictionary with all the configurations
    :param i: the fold number
    :param results: the dictionary containing the results for each classifier
    :return: the results dictionary
    """
    print("Running fold: ", i)
    started_at = utils.get_datetime()
    fold_name = "fold" + str(i)

    # Convert the arff files of this fold into pandas DataFrames.
    print("Reading and converting arff files")
    frames = dfc.convert_arff_to_dataframe(data, fold_name)
    train_df, test_df = frames
    print("Got train and test dataframes")

    # Split labels from features; labels are replaced with numbers.
    print("Creating numpy arrays")
    train_parts = dfc.get_features_labels_arrays(train_df)
    train_labels, train_features, train_labels_dict = train_parts
    test_parts = dfc.get_features_labels_arrays(test_df)
    test_labels, test_features, test_labels_dict = test_parts
    num_classes = len(train_labels_dict)
    print("Got labels and features for train and test datasets")

    print("Classifying")
    for clf_name in data["classifiers"]:
        # Each classifier from the configuration yields a confusion matrix,
        # which is then turned into the per-fold precision/recall/F-measure.
        matrix = Classifier.classify(data, clf_name, num_classes,
                                     train_labels, train_features,
                                     test_labels, test_features)
        results = write_results_to_file(data, fold_name, clf_name, matrix,
                                        test_labels_dict, results)

    time_needed = utils.elapsed_str(started_at, up_to=None)
    print("Time needed to run fold ", str(i), " is ", time_needed)
    return results
def data_split_examine(clf):
    '''
    Compute accuracy, precision, recall and f1_score for each entry of
    test_sizes and store one row per test size in the module-level df,
    which is displayed at the end.

    Parameters:
        clf : name of the Classifier method used to train the model

    Return:
        None
    '''
    model = Classifier()
    for row, test_fraction in enumerate(test_sizes, start=1):
        x, y = get_x_y()
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=test_fraction)
        trained = getattr(model, clf)(x_train, y_train)
        accuracy, precision, recall, f_score, _ = evaluate(
            trained, x_test, y_test)
        # Train/test share of the data expressed as whole percentages.
        train_pct = round((1 - test_fraction) * 100)
        test_pct = round(test_fraction * 100)
        df.loc[row] = [
            train_pct, test_pct, accuracy * 100, precision, recall, f_score
        ]

    display(df)
import secrets
from extract_graph_features import GraphFeatures
from preprocessing import Preprocess
from classifiers import Classifier
from sample import Sample
from modelEvaluation import ModelEvaluation
import pandas as pd 
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import json
import threading
import concurrent.futures

# Module-level helpers shared by the request handlers below.
# NOTE(review): GraphFeatures is bound as the class itself rather than an
# instance, unlike the other helpers - confirm this is intentional.
gf = GraphFeatures
pre = Preprocess()
classifier = Classifier()
sample = Sample()
me = ModelEvaluation()

# NOTE(review): Flask and DEBUG are not imported in the visible import block -
# confirm they are brought into scope elsewhere in the file.
app = Flask(__name__)
app.logger.setLevel(DEBUG)
# A new random secret key is generated each time this module is loaded.
app.config['SECRET_KEY'] = secrets.token_urlsafe(16)


def classification(cid, mid, ml, x_train, x_test, y_train, y_test, features):
	# Train the model selected by the short code `ml` on the training data.
	# NOTE(review): the visible body stops after the model is built and does
	# not use cid, mid, x_test, y_test or features - the rest of the function
	# appears to be outside this excerpt.
	# Classify here
	# Maps each short model code to the matching training method of the
	# module-level `classifier` instance.
	clf = {'lr': classifier.logistic_regression, 'dt': classifier.decision_tree_classifier, 'rf': classifier.random_forest, 'svm': classifier.svm, 'xgb': classifier.xg_boost, 'nn':classifier.neural_net}
	# removing customer id before classification; unwanted
	# x_train_ip = x_train.drop(['customer'], axis = 1)
	# An unknown `ml` code raises KeyError on the lookup below.
	model = clf[ml](x_train, y_train)
Exemple #15
0
def results(X_train, y_train, X_test, y_test, features="binary", D_in=200):
    """Train and score the set of models that matches ``features``.

    Logistic regression and a linear SVM always run. ``"binary"`` features
    additionally run the Bernoulli NB-SVM; ``"sentence_embed"`` features run
    the feed-forward network and Gaussian naive Bayes; every other value
    (including ``"binary"``) finishes with multinomial naive Bayes.

    Returns ``(log_reg, Linear_SVM, Gaussian_NB)`` for ``"sentence_embed"``
    and ``(log_reg, Linear_SVM, Multinomial_NB)`` otherwise.
    """

    def run_model(banner, grid, *model_args, **model_kwargs):
        # Announce the model, build it, optionally grid-search, then fit on
        # the training data and score on the test data.
        print(banner)
        candidate = Classifier(X_train, y_train, *model_args, **model_kwargs)
        if grid is not None:
            candidate.grid_search(grid)
        candidate.fit()
        candidate.score(X_test, y_test)
        return candidate

    log_reg = run_model("\n  > Logistic Regression: ",
                        {'C': [0.01, 1, 100], 'penalty': ['l1', 'l2']},
                        model="log_reg")

    Linear_SVM = run_model("\n  > Linear SVM: ",
                           {'C': [0.01, 1, 100]},
                           model="Linear_SVM")

    if features == "binary":
        run_model("\n  > Bernoulli Naive Bayes SVM: ",
                  {'C': [0.01, 1, 100], 'beta': [0.25, 0.5, 0.75]},
                  model="Bernoulli_NBSVM")

    if features == "sentence_embed":
        # The network is fitted directly; its grid search is disabled.
        run_model("\n  > Feedforward NN:", None, "feedforward_NN", D_in)

        Gaussian_NB = run_model(
            "\n  > Gaussian Naive Bayes: ",
            {'priors': [None, (0.25, 0.75), (0.5, 0.5), (0.75, 0.25)]},
            model="Gaussian_NB")
        return (log_reg, Linear_SVM, Gaussian_NB)

    Multinomial_NB = run_model(
        "\n  > Multinomial Naive Bayes: ",
        {'class_prior': [None, (0.25, 0.75), (0.5, 0.5), (0.75, 0.25)]},
        model="Multinomial_NB")
    return (log_reg, Linear_SVM, Multinomial_NB)
Exemple #16
0
 def __init__(self, fitness, configuration, controller):
     """Keep the fitness and configuration and create the two sub-models.

     The classifier is built without arguments; the regressor receives
     ``controller``.
     """
     self.configuration = configuration
     self.fitness = fitness
     self.classifier = Classifier()
     self.regressor = Regressor(controller)
from classifiers import Classifier
# Train a perceptron on the rt-polarity data with '+' / '-' labels.
# NOTE(review): the meaning of the [1] argument is not visible here; the
# trailing comment on apply_feature_set() suggests it selects feature sets.
classifier = Classifier([1], ['+', '-'], 'english')
# data = numpy.asarray([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]])

# Negative reviews are read with label '-', positive reviews with label '+'.
directory_of_file = 'polarityData/rt-polaritydata/rt-polarity-neg.txt'
classifier.read_text_file(directory_of_file, '-')
directory_of_file = 'polarityData/rt-polaritydata/rt-polarity-pos.txt'
classifier.read_text_file(directory_of_file, '+')
classifier.apply_feature_set()  # [1,2], [3]
# presumably 0.01 is the test-set fraction - TODO confirm
sets = classifier.train_test_split(0.01)
# sets = [(classifier.all_feature_sets, sets[0][1])] # for extract model

classifier.set_patience(300)

# perceptron
k = 1
for train, test in sets:
    # When the split name contains '$', its first character is replaced by
    # the running split counter k.
    if '$' in classifier.selected_split_name:
        classifier.selected_split_name = str(
            k) + classifier.selected_split_name[1:]
        k += 1
    # The word dictionary is rebuilt from this split's training part and saved.
    classifier.create_dictionary(train)
    classifier.save_word_dictionary()
    # print(classifier.hash_dictionary)
    # Only one learning rate is active; the others tried are listed alongside.
    for lr in [0.6]:  # [0.1, 0.2, 0.25, 0.4, 0.6, 0.75, 0.8, 1]
        classifier.create_weights_array(1)
        print('{} \tLearning Rate= {}\tSplit Set: {}'.format(
            'perceptron', lr, classifier.selected_split_name))
        classifier.train(
            train,
            test,
Exemple #18
0
class HyperparameterTuner(object):
    """Learning-rate search over a sequence of permuted MNIST tasks.

    For each task every sampled learning rate is trained and validated, and
    the model with the best validation accuracy is kept as the starting
    point for the next task.
    """

    def __init__(self, sess, hidden_layers, hidden_units, num_perms, trials,
                 epochs):
        """Store the search settings, build the task list and the classifier.

        ``trials`` learning rates are drawn with ``PRNG.uniform(1e-4, 1e-3)``.
        """
        self.hidden_layers = hidden_layers
        self.hidden_units = hidden_units
        self.num_perms = num_perms
        self.epochs = epochs
        self.task_list = self.create_permuted_mnist_task(num_perms)

        sampled_rates = []
        for _ in range(trials):
            sampled_rates.append(PRNG.uniform(1e-4, 1e-3))
        self.trial_learning_rates = sampled_rates

        self.best_parameters = []
        self.sess = sess

        layer_sizes = [hidden_units] * hidden_layers
        self.classifier = Classifier(num_class=10,
                                     num_features=784,
                                     fc_hidden_units=layer_sizes,
                                     apply_dropout=True)

    def search(self):
        """Train all learning rates on each task and keep the best per task."""
        for task_index in range(self.num_perms):
            candidates = PriorityQueue()
            for rate in self.trial_learning_rates:
                self.train_on_task(task_index, rate, candidates)
            # Entries are (-accuracy, name), so get() yields the best model.
            self.best_parameters.append(candidates.get())
            self.evaluate()

    def evaluate(self):
        """Print the first task's test accuracy of every best model so far."""
        first_task_test = self.task_list[0].test
        accuracies = [
            self.classifier.test(sess=self.sess,
                                 model_name=entry[1],
                                 batch_xs=first_task_test.images,
                                 batch_ys=first_task_test.labels)
            for entry in self.best_parameters
        ]
        print(accuracies)

    def train_on_task(self, t, lr, queue):
        """Train on task ``t`` with rate ``lr`` and queue the validation result.

        From the second task on, training starts from the best model of the
        previous task and receives the previous task as the lagged data set.
        The queue entry is ``(-accuracy, model_name)``, where the accuracy is
        measured on the validation split of the first task.
        """
        model_name = self.file_name(lr, t)
        dataset_train = self.task_list[t].train
        dataset_lagged = None
        model_init_name = None
        if t > 0:
            dataset_lagged = self.task_list[t - 1]
            model_init_name = self.best_parameters[t - 1][1]

        total_updates = (55000 // MINI_BATCH_SIZE) * self.epochs
        self.classifier.train(sess=self.sess,
                              model_name=model_name,
                              model_init_name=model_init_name,
                              dataset=dataset_train,
                              dataset_lagged=dataset_lagged,
                              num_updates=total_updates,
                              mini_batch_size=MINI_BATCH_SIZE,
                              log_frequency=LOG_FREQUENCY,
                              fisher_multiplier=1.0 / lr,
                              learning_rate=lr)

        validation = self.task_list[0].validation
        accuracy = self.classifier.test(sess=self.sess,
                                        model_name=model_name,
                                        batch_xs=validation.images,
                                        batch_ys=validation.labels)
        queue.put((-accuracy, model_name))

    def create_permuted_mnist_task(self, num_datasets):
        """Return MNIST followed by permuted copies for seeds 1..num_datasets-1."""
        mnist = read_data_sets("MNIST_data/", one_hot=True)
        tasks = [mnist]
        tasks.extend(
            self.permute(mnist, seed) for seed in range(1, num_datasets))
        return tasks

    @staticmethod
    def permute(task, seed):
        """Return a deep copy of ``task`` with its image columns shuffled.

        The same permutation, drawn after seeding numpy's global generator
        with ``seed``, is applied to the train, test and validation images.
        """
        np.random.seed(seed)
        column_order = np.random.permutation(task.train._images.shape[1])
        shuffled = deepcopy(task)
        for split_name in ('train', 'test', 'validation'):
            split = getattr(shuffled, split_name)
            split._images = split._images[:, column_order]
        return shuffled

    def file_name(self, lr, t):
        """Return the model name that encodes the run's hyperparameters."""
        template = 'layers=%d,hidden=%d,lr=%.5f,multiplier=%.2f,mbsize=%d,epochs=%d,perm=%d'
        values = (self.hidden_layers, self.hidden_units, lr, 1 / lr,
                  MINI_BATCH_SIZE, self.epochs, t)
        return template % values
""" This file maps the evaluation metrics for various splits in the K-fold space"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from classifiers import Classifier
from evaluation import evaluate
from dataloader import get_X_y
from IPython.display import HTML

""" for now KNeighbors will be used as it gave the highest accuracy """
# Shared Classifier instance used by KFold_validation below.
model = Classifier()

# Result table: one 'K_Fold' column followed by the four metric columns.
columns = ['Accuracy %','Precision %','Recall','F1_Score']
df = pd.DataFrame(columns = ['K_Fold']+columns)

""" Cross Validation using K-Fold """
def KFold_validation(kf, split_no):
    """Cross-validate the KNeighbors model and average the fold metrics.

    Parameters:
        kf : splitter whose ``split(X, y)`` yields (train, test) index pairs
        split_no : number of splits; not used by the computation itself

    Returns:
        (accuracy %, precision %, recall, f1 score), each averaged over all
        folds

    Raises:
        ValueError: if ``kf`` yields no splits
    """
    X, y = get_X_y()
    fold_metrics = []
    for train, test in kf.split(X, y):
        X_train, X_test = X.iloc[train], X.iloc[test]
        y_train, y_test = y[train], y[test]

        classifier = model.KNeighbors(X_train, y_train)
        accuracy, precision, recall, f_score, _ = evaluate(
            classifier, X_test, y_test)
        # Keep every fold; previously each fold overwrote the one before,
        # so only the last fold's metrics were reported.
        fold_metrics.append((accuracy, precision, recall, f_score))

    if not fold_metrics:
        raise ValueError("kf produced no splits")

    accuracy, precision, recall, f_score = np.mean(fold_metrics, axis=0)
    return accuracy * 100, precision * 100, recall, f_score

""" Evaluation metric for different K-folds are shown in tabular format """
def tabulate_kfold():
Exemple #20
0
from classifiers import Classifier
from langdetect import detect

# Maps the language code returned by detect() to the language name passed to
# Classifier. Note that 'ar' is mapped to 'persian' as well.
languages = {'en': 'english', 'fa': 'persian', 'ar': 'persian'}

textfa = '''عالیه!!'''

texten = '''though excessively tiresome , the uncertainty principle , as verbally pretentious as the title may be , has its handful of redeeming features , as long as you discount its ability to bore .'''
lang = detect(textfa)

print(lang)

# First sample: three labels ('+', '-', '=') and the stored persian model.
classifier = Classifier([1], ['+', '-', '='], languages[lang])
# first read dictionary and weights from cache if not read from file
# classifier.hash_dictionary[languages[lang]] = cache[lang]

classifier.load_word_dictionary()
# NOTE(review): the persian model name is hard-coded even though the language
# comes from detect() - confirm the detected language always matches it.
classifier.load_model_npy('perceptron-100-UNIGRAMS-0.4-persian')
# print(classifier.hash_dictionary)
print(classifier.predict_one(textfa))

lang = detect(texten)

print(lang)

# Second sample: two labels ('+', '-') and the stored english model.
classifier = Classifier([1], ['+', '-'], languages[lang])
# first read dictionary and weights from cache if not read from file
# classifier.hash_dictionary[languages[lang]] = cache[lang]
classifier.load_word_dictionary()
classifier.load_model_npy(
    'perceptron-128-UNIGRAMS-0.6_test_size-0.01_random_state-0_shuffle-english'
Exemple #21
0
# Creating bag of words
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(corpus).toarray()
# The target is the last column of the dataset.
y = dataset.iloc[:, -1].values

# Splitting of dataset
X_train, X_test, y_train, y_test = Preprocessing.datasplit(X,
                                                           y,
                                                           test_size=0.1,
                                                           random_state=0)

# Classifiers
classifierRFC = Classifier.RFC(X_train,
                               y_train,
                               n_estimators=13,
                               criterion='entropy')
classifierkNN = Classifier.kNN(X_train,
                               y_train,
                               n_neighbors=8,
                               metric='minkowski')
classifierLR = Classifier.LR(X_train, y_train)
classifierGaussNB = Classifier.GaussNB(X_train, y_train)
classifierDTC = Classifier.DTC(X_train, y_train, criterion='entropy')
classifierSVM = Classifier.SuppVM(X_train, y_train, kernel='rbf')

# Prediction using the test set
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
""" Random Forest Classifier """
# The random-forest results must come from the random-forest model; the
# Gaussian NB model was used here by mistake.
y_predRFC = classifierRFC.predict(X_test)
cm_RFC = confusion_matrix(y_test, y_predRFC)
Exemple #22
0
def train_keras_classifier():
    """Train the Keras CIFAR-10 classifier, save it and evaluate it.

    The model is compiled with SGD and trained for 50 epochs with a batch
    size of 128, using real-time data augmentation. It is then saved both as
    a Keras model (CLASSIFIER_PATH) and as a TensorFlow checkpoint
    (PRETRAINED_PATH) before being evaluated on the test set.
    """
    from cifar10_classifier import Classifier
    from keras.optimizers import SGD
    # NOTE(review): K is not used in the visible body.
    from keras import backend as K

    batch_size = 128
    epochs = 50
    data_augmentation = True

    opt = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)

    # The model is built and compiled inside the 'conv' variable scope.
    with tf.variable_scope('conv') as scope:
        model = Classifier().model
        model.compile(loss='categorical_crossentropy',
                      optimizer=opt,
                      metrics=['accuracy'])

    # load data and start training
    if data_augmentation:
        print('Using real-time data augmentation.')
        datagen, (x_train,
                  y_train), (x_test,
                             y_test) = data_loader.load_augmented_data()
        model.fit_generator(datagen.flow(x_train,
                                         y_train,
                                         batch_size=batch_size),
                            epochs=epochs,
                            validation_data=(x_test, y_test),
                            workers=4,
                            callbacks=[SGDLearningRateTracker()])
    else:
        print('Not using data augmentation.')
        (x_train, y_train), (x_test, y_test) = data_loader.load_original_data()
        model.fit(x_train,
                  y_train,
                  batch_size=batch_size,
                  epochs=epochs,
                  validation_data=(x_test, y_test),
                  shuffle=True)

    # save as tensorflow model
    if not os.path.isdir(SAVE_DIR):
        os.makedirs(SAVE_DIR)
    model.save(CLASSIFIER_PATH)

    # Also write a TensorFlow checkpoint from the session Keras is using.
    from keras.backend import get_session
    sess = get_session()
    saver = tf.train.Saver()
    saver.save(sess, PRETRAINED_PATH)
    print('Saved trained model at %s ' % (PRETRAINED_PATH))

    # evaluate on test set
    scores = model.evaluate(x_test, y_test, verbose=1)
    print('Test loss:', scores[0])
 def __init__(self, configuration, controller, fitness):
     """Initialise the base model and create the regressor and classifier.

     The regressor receives ``controller`` and ``configuration``; the
     classifier is built without arguments.
     """
     # Keyword arguments keep each value bound to the right base-class
     # parameter regardless of the positional order the base declares.
     super(DummySurrogateModel, self).__init__(
         configuration=configuration,
         controller=controller,
         fitness=fitness)
     self.regressor = Regressor(controller, configuration)
     self.classifier = Classifier()
Exemple #24
0
    def run_classifier(self):
        """Run the cross-validated classification and keep its results.

        Creates the output and model directories named after the joined
        genomic classes, builds the ``Classifier``, saves a feature
        correlation heatmap as a PDF, runs the classification with
        cross-validation and copies the resulting scores, ROC data, metrics
        and label/probability lists onto ``self``.

        ``use_pathogenicity_trained_model``, ``use_conservation_trained_model``
        and ``predict_on_test_set`` are read from the enclosing scope, not
        from ``self``.
        """
        class_tag = '_'.join(self.genomic_classes)

        output_dir = self.clinvar_ml_out_dir + '/' + class_tag
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        # @anchor-1
        #out_models_dir = self.base_out_models_dir + '/' + 'intergenic_utr_lincrna_ucne_vista'  #+ '_'.join(self.genomic_classes)
        models_dir = self.base_out_models_dir + '/' + class_tag
        # -- JARVIS:
        #    > "intergenic_utr_lincrna_ucne_vista" is the best model for 'utr'--D3000-struct (0.675)
        #    > "intergenic_utr_lincrna_ucne_vista" is the best model for 'intergenic,utr'--D3000-struct (0.649)
        #    > "ccds" is (probably) the best model for 'ccds'--D3000-struct (0.565)
        if not os.path.exists(models_dir):
            os.makedirs(models_dir)

        classifier = Classifier(
            self.Y_label,
            output_dir,
            models_dir,
            base_score=self.base_score,
            model_type=self.model_type,
            use_only_base_score=self.use_only_base_score,
            include_vcf_extracted_features=self.include_vcf_extracted_features,
            exclude_base_score=self.exclude_base_score,
            use_pathogenicity_trained_model=use_pathogenicity_trained_model,
            use_conservation_trained_model=use_conservation_trained_model,
            predict_on_test_set=predict_on_test_set)

        classifier.preprocess_data(self.df)
        #print(self.df.info())

        # --- Get correlations between features ---
        # Position, annotation, variant-count and bin columns are excluded.
        excluded_columns = [
            'chr', 'start', 'end', 'clinvar_annot', 'common_variants',
            'common_vs_all_variants_ratio', 'all_variants', 'mean_ac',
            'mean_af', 'bin_1', 'bin_2', 'bin_3', 'bin_4', 'bin_5', 'bin_6'
        ]
        features_df = self.df.drop(excluded_columns, axis=1)
        #print(features_df.corr())

        _, ax = plt.subplots(figsize=(19, 15))

        heatmap = sns.heatmap(features_df.corr(),
                              cmap=plt.cm.RdBu,
                              linecolor='white',
                              linewidths=0.1,
                              square=True,
                              ax=ax,
                              vmax=1.0,
                              vmin=-1.0)

        pdf_path = (self.clinvar_ml_out_dir +
                    '/Feature-correlation_matrix.' + class_tag + '.pdf')
        heatmap.get_figure().savefig(pdf_path,
                                     format='pdf',
                                     bbox_inches='tight')
        # ---------------------------------------

        classifier.init_model()

        classifier.run_classification_with_cv()

        # Copy the classifier's results onto this object.
        self.score_print_name = classifier.score_print_name
        self.mean_tpr = classifier.mean_tpr
        self.mean_fpr = classifier.mean_fpr
        self.mean_auc = classifier.mean_auc

        self.metrics_list = classifier.metrics_list

        self.y_label_lists = classifier.y_label_lists
        self.y_proba_lists = classifier.y_proba_lists

        plt.close()
def cal_point_type(x_train, x_val, y_train, y_val):
    """
    Fit every classifier on the training split and group the validation
    points by which classifiers get them right or wrong.

    Parameters:
        x_train: array:like, shape(n_train_samples, n_features)
        x_val: array:like, shape(n_val_samples, n_features)
        y_train: of length n_train_samples
        y_val: of length n_val_samples

    Returns:
        outliers: indexes of validation points misclassified by every
            classifier
        point_score: dictionary mapping "p1".."p5" to a tuple
            (weight difference, indexes); the indexes are the points the
            first classifier of a pair predicts correctly and the second
            one misses
        easy_points: indexes of validation points classified correctly by
            every classifier
    """
    # Names of the factory methods looked up on the Classifier instance.
    names = (
        "KNeighbors",
        "Random_Forest",
        "svm_classifier",
        "Gaussian",
        "Decision_Tree",
        "Logistic_Reg",
    )
    model = Classifier()

    missed = {}  # classifier name -> np.where() result for wrong predictions
    hit = {}     # classifier name -> np.where() result for right predictions
    weight = {}  # classifier name -> weight derived from its accuracy

    for name in names:
        fitted = getattr(model, name)(x_train, y_train)
        predicted = fitted.predict(x_val)
        missed[name] = np.where(predicted != y_val)
        hit[name] = np.where(predicted == y_val)
        # The weight grows with accuracy: 1 / (1 - acc^2).
        accuracy = accuracy_score(y_val, predicted)
        weight[name] = 1 / (1 - np.power(accuracy, 2))

    # Points missed by all classifiers (outliers) and points every
    # classifier gets right (easy points), both seeded from the first
    # classifier in the list.
    outliers = missed[names[0]]
    easy_points = hit[names[0]]
    for name in names:
        outliers = np.intersect1d(outliers, missed[name])
        easy_points = np.intersect1d(easy_points, hit[name])

    # Points carrying trivial information: each consecutive pair of this
    # chain is scored by the weight gap between the two classifiers and the
    # points the first one classifies correctly while the second does not.
    chain = (
        "KNeighbors",
        "Random_Forest",
        "Decision_Tree",
        "Logistic_Reg",
        "Gaussian",
        "svm_classifier",
    )
    point_score = {}
    for rank, (better, worse) in enumerate(zip(chain, chain[1:]), start=1):
        gap = weight[better] - weight[worse]
        points = np.intersect1d(hit[better], missed[worse])
        point_score["p%d" % rank] = (gap, points)

    return outliers, point_score, easy_points
Exemple #26
0
def run(config,     num_batches,      batch_size,
        model_name, class_model_name, ofile,
        threshold,  num_workers,      epochs,
        multi_gans, gan_weights,      trunc_norm,
        fixed_dset, transform,        filter_samples):
    """
    Train a blank ResNet20 on generated images and save its weights.

    Parameters:
        config: dict; 'n_classes' and 'dataset' are read and 'G_batch_size'
            is overwritten with ``batch_size``
        num_batches, batch_size, threshold, num_workers, fixed_dset,
        transform: forwarded unchanged to ``FilteredLoader``
        model_name, trunc_norm, multi_gans, gan_weights: forwarded unchanged
            to ``GeneratorWrapper``
        class_model_name: name of the weights loaded into the filtering
            classifier (only used when ``filter_samples`` is truthy)
        ofile: base name of the saved model, written to
            ``./output/<ofile>.pth``
        epochs: number of training epochs
        filter_samples: when truthy, generated samples are filtered with a
            pre-trained ResNet20 classifier; otherwise no filter is used

    Returns:
        None. The trained ``state_dict`` is written to disk.
    """
    import os

    # Make sure the destination exists before spending time on training,
    # otherwise torch.save would fail only after the last epoch.
    output = './output/%s.pth' % ofile
    os.makedirs(os.path.dirname(output), exist_ok=True)

    # Instanciating generator
    config['G_batch_size'] = batch_size

    generator = GeneratorWrapper(config, model_name, trunc_norm, multi_gans, gan_weights)
    generator_fn = generator.gen_batch
    if gan_weights:
        print('Using GAN weights (multi-GAN setting): ', str(gan_weights))

    # Instanciating filtering classifier
    if filter_samples:
        print('Using ResNet20 weights: %s.pth' % class_model_name)
        filter_net = Classifier('resnet20', config['n_classes'])
        filter_net.load(class_model_name)
        filter_fn = filter_net.filter
    else:
        filter_fn = None

    # Creating a filtered loader using the classifier
    num_classes = config['n_classes']
    loader = FilteredLoader(generator_fn,
                            filter_fn,
                            num_classes,
                            num_batches,
                            batch_size,
                            threshold,
                            num_workers,
                            fixed_dset,
                            transform)

    print('Training using %d generated images per epoch'
          % loader.train_length())

    # Creating a blank ResNet (placed on the GPU, so CUDA is required)
    net = resnet20(num_classes, width=64).to('cuda')

    # Initializing loss functions, optimizer, learning rate scheduler
    cross_entropy = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=0.1, momentum=0.9, weight_decay=0.0001)
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[100, 150])

    # Building the test-set loader used for the per-epoch evaluation
    test_loader = utils.make_test_loader(config['dataset'],
                                         batch_size,
                                         transforms.Normalize(*utils.norm_vals))

    # Training the model
    t1 = utils.ctime()
    best_acc = 0.0
    for epoch in range(epochs):
        print('Epoch: %3d' % (epoch+1), end="  ")

        train(net, loader, batch_size, optimizer, cross_entropy)
        scheduler.step()

        # Evaluate once per epoch and reuse the value for both the
        # best-accuracy tracking and the progress line.
        acc = evaluate(net, test_loader)
        best_acc = max(acc, best_acc)
        loader.reset()
        print('Val acc: %4.2f %% ' % acc,
              ' | Best acc: %4.2f %%\n' % best_acc)

    tt = utils.ctime() - t1
    print('Finished training, total time: %4.2fs' % tt)
    print('Best accuracy achieved: %4.5f %%' % best_acc)

    # Saving output model
    print('Saving trained classifier in %s' % output)
    torch.save(net.state_dict(), output)
Exemple #27
0
def main(argv):
    """
    Build features for the death-prediction task, run several classifiers
    on them and plot / evaluate the results.

    Options are read from ``sys.argv[1:]``; the ``argv`` parameter is kept
    for interface compatibility but is not consulted. Recognised words:

        index_books        build a fresh inverted index and save it
        load_books         load a previously saved inverted index
        count_features, gender_feature, proximity_feature, tf_idf,
        graph_feature      enable the corresponding feature group
        all_features       enable every feature group
        quick              load the index; enable count, gender and
                           proximity features only
        run_file           write a run file per classifier
        ablation           run the ablation test
        best_subset        run best-subset selection

    With no words at all the saved index is loaded and every feature group
    is enabled. Any other word terminates the program with "Wrong usage!".
    """
    count_flag = False
    gender_flag = False
    proximity_flag = False
    tf_idf_flag = False
    graph_flag = False
    run_file_flag = False
    ablation_flag = False
    best_subset_flag = False

    # Bound up front so that create_features() always receives a value, even
    # when none of the chosen options builds a graph.
    graph = None

    books_inverted_index = Index()

    if len(sys.argv) > 1:

        for arg in sys.argv[1:]:
            if arg == 'index_books':
                # do a fresh indexing and save
                books_inverted_index.add_all_books()

                save_index(books_inverted_index)

            elif arg == 'load_books':
                print(
                    "loading books directly as inverted_index object into the program"
                )
                books_inverted_index = load_index()

            elif arg == 'count_features':
                count_flag = True

            elif arg == 'gender_feature':
                gender_flag = True

            elif arg == 'proximity_feature':
                proximity_flag = True

            elif arg == 'tf_idf':
                tf_idf_flag = True

            elif arg == 'ablation':
                ablation_flag = True

            elif arg == 'best_subset':
                best_subset_flag = True

            elif arg == 'graph_feature':
                graph_flag = True
                graph = Graph()

            elif arg == 'all_features':
                count_flag = True
                gender_flag = True
                proximity_flag = True
                tf_idf_flag = True
                graph_flag = True
                graph = Graph()

            elif arg == 'run_file':
                run_file_flag = True

            elif arg == 'quick':
                books_inverted_index = load_index()
                count_flag = True
                gender_flag = True
                proximity_flag = True
                tf_idf_flag = False
                graph_flag = False
                graph = Graph()

            else:
                sys.exit("Wrong usage!")

    else:
        books_inverted_index = load_index()
        count_flag = True
        gender_flag = True
        proximity_flag = True
        tf_idf_flag = True
        graph_flag = True
        graph = Graph()

    # Options such as 'count_features' alone never create a graph, which
    # previously left the name unbound and crashed at create_features().
    # Fall back to a plain Graph(), as the 'quick' preset does when the
    # graph features are disabled.
    if graph is None:
        graph = Graph()

    classifier = Classifier()
    classifier.read_separate_train_test_files(evaluate=True)
    # classifier.split_data()

    # reading names for training and test sets
    training_names = classifier.get_names(training=True)
    test_names = classifier.get_names(test=True)

    # creating features for the training set
    features_index, training_features = create_features(
        training_names, books_inverted_index, graph, count_flag, gender_flag,
        proximity_flag, tf_idf_flag, graph_flag)
    # creating features for the test set
    features_index, test_features = create_features(
        test_names, books_inverted_index, graph, count_flag, gender_flag,
        proximity_flag, tf_idf_flag, graph_flag)

    classifier.set_features(training_features, test_features)
    classifier.save_features()

    y_pred_log = classifier.logistic_regression()
    # classifier.svc_polynomial()
    # classifier.svc_guassian_kernel()
    y_pred_svc = classifier.svc_sigmoid()
    y_pred_dt = classifier.decision_tree()
    y_pred_knn = classifier.k_nearest_neighbors()
    y_pred_nb = classifier.naive_base()

    # create one run file per classifier out of its predictions
    if run_file_flag:
        classifier.make_new_run_file(y_pred_dt, 'dt')
        classifier.make_new_run_file(y_pred_log, 'logit')
        classifier.make_new_run_file(y_pred_svc, 'svc')
        classifier.make_new_run_file(y_pred_knn, 'knn')
        classifier.make_new_run_file(y_pred_nb, 'naive')

    # classifier.feature_selection()

    classifier.plot_f1_scores(classifier.method_name,
                              classifier.f_scores,
                              plot_title='Death Prediction',
                              file_name='f1_scores')

    y_pred_list = [y_pred_log, y_pred_svc, y_pred_dt, y_pred_knn, y_pred_nb]

    classifier.plot_with_error_bars('death', y_pred_list,
                                    classifier.method_name, 'Death Prediction',
                                    'death_fscore_error')

    if gender_flag:
        # NOTE(review): index 2 is presumably the gender feature group
        # produced by create_features() -- confirm against that function.
        gender_training_features = training_features[2]
        gender_test_features = test_features[2]
        classifier.evaluate_gender_prediction(gender_training_features,
                                              gender_test_features,
                                              print_flag=True)

    if ablation_flag:
        ablation_test(classifier, features_index, training_features,
                      test_features)

    if best_subset_flag:
        best_subset_selection(classifier, training_features, test_features)
Exemple #28
0
# Script: trains a perceptron on the spreadsheet data found in
# 'Persian_Comments/' for several learning rates and every train/test split.
from classifiers import Classifier
# NOTE(review): the meaning of the three constructor arguments is defined in
# classifiers.Classifier; presumably a feature-set selection, the class
# labels and the language -- confirm there.
classifier = Classifier([1], ['+', '-', '='], 'persian')

directory = 'Persian_Comments/'

# Read the 'comment' and 'orientation' columns from the files in `directory`.
classifier.read_excel_file(directory, ['comment', 'orientation'])
classifier.apply_feature_set()  # [1,2], [3]
# NOTE(review): 0.005 is presumably the split ratio -- confirm in
# Classifier.train_test_split. The result is iterated as (train, test) pairs.
sets = classifier.train_test_split(0.005)
# sets = [(classifier.all_feature_sets, classifier.all_feature_sets[0])] #for extract model

classifier.set_patience(300)

# perceptron
k = 1  # running number substituted into split names that contain '$'
for train, test in sets:
    if '$' in classifier.selected_split_name:
        # Replace the first character of the split name with the running
        # number k, then advance k for the next split.
        classifier.selected_split_name = str(
            k) + classifier.selected_split_name[1:]
        k += 1
    # The word dictionary is rebuilt from this split's training part and
    # saved before any training starts.
    classifier.create_dictionary(train)
    classifier.save_word_dictionary()

    for lr in [0.1, 0.2, 0.25, 0.4, 0.6, 0.75, 0.8,
               1]:  # [0.1, 0.2, 0.25, 0.4, 0.6, 0.75, 0.8, 1]
        # Weights are re-created for every learning rate, so each run
        # starts from a fresh weights array.
        classifier.create_weights_array(1)
        print('{} \tLearning Rate= {}\tSplit Set: {}'.format(
            'perceptron', lr, classifier.selected_split_name))
        classifier.train(
            train,
            test,
            classifier='perceptron',
Exemple #29
0
def train_classifier():
    """
    Train the TensorFlow ``Classifier`` on the original (non-augmented)
    data and save the resulting session to ``CLASSIFIER_PATH``.

    The user is asked for confirmation first, because data augmentation is
    not implemented here; any answer other than 'y' / 'yes' returns without
    training. After every epoch the model is evaluated on the full batches
    of the test set and the average accuracy and loss are printed.

    Returns:
        None.

    Raises:
        NotImplementedError: if ``data_augmentation`` is switched on.
    """
    from classifiers import Classifier

    max_epoch = 50
    batch_size = 128
    imgsize = 32
    weight_decay = 0  # disabled
    num_classes = 10
    data_augmentation = False
    # Training-set size the original code hard-codes for the shuffle buffer
    # and the number of steps per epoch.
    num_train = 50000

    # raw_input only exists on Python 2; fall back to input on Python 3.
    try:
        ask = raw_input
    except NameError:
        ask = input

    print('WARNING: data augmentation not implemented. ' + \
        'For better model performance, please use train_keras_classifier instead')
    response = ask('Do you wish to continue? (y/N)')
    if response.lower() not in ['y', 'yes']: return

    if data_augmentation:
        # Fail explicitly instead of hitting an undefined x_train below.
        raise NotImplementedError('data augmentation not implemented')
    (x_train, y_train), (x_test, y_test) = data_loader.load_original_data()

    # The shuffled / repeated / batched pipeline must be the one the iterator
    # is built from; a misspelled target name used to discard it, so the
    # iterator yielded single examples that do not fit the placeholders.
    data_train = tf.data.Dataset.from_tensor_slices((x_train, y_train))
    data_train = data_train.shuffle(num_train).repeat().batch(batch_size)
    iter_train = data_train.make_initializable_iterator()

    x = tf.placeholder(tf.float32, [batch_size, imgsize, imgsize, 3])
    y_ = tf.placeholder(tf.float32, [batch_size, num_classes])

    regularizer = tf.contrib.layers.l2_regularizer(scale=weight_decay)
    with tf.variable_scope('conv') as scope:
        model = Classifier(x, regularizer, expand_dim=False)
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(logits=model.logits,
                                                   labels=y_))
    reg_loss = tf.losses.get_regularization_loss()
    loss += reg_loss

    eval_acc = accuracy(model.logits, y_)

    optimizer = tf.train.MomentumOptimizer(learning_rate=0.01,
                                           momentum=0.9,
                                           use_nesterov=True)
    optim_step = optimizer.minimize(loss=loss)

    # Floor division keeps both counts integral on Python 2 and Python 3.
    steps_per_epoch = num_train // batch_size
    test_batch_num = len(y_test) // batch_size

    # The context manager releases the session's resources on exit.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(iter_train.initializer)

        next_batch = iter_train.get_next()
        for n_epoch in range(max_epoch):
            for i in range(steps_per_epoch):
                batch = sess.run(next_batch)
                _, acc_val, loss_val = sess.run([optim_step, eval_acc, loss],
                                                feed_dict={
                                                    x: batch[0],
                                                    y_: batch[1]
                                                })
                if i % 100 == 0:
                    print("Epoch: %d, Step: %d, Acc: %f, Loss: %f" %
                          (n_epoch, i, acc_val, loss_val))
            acc_avg = loss_avg = 0

            # validate on test set (full batches only; a trailing partial
            # batch would not match the fixed-size placeholders)
            for i in range(test_batch_num):
                acc_val, loss_val = sess.run(
                    [eval_acc, loss],
                    feed_dict={
                        x: x_test[i * batch_size:(i + 1) * batch_size],
                        y_: y_test[i * batch_size:(i + 1) * batch_size]
                    })
                acc_avg += acc_val
                loss_avg += loss_val
            print('Test accuracy: %f, loss: %f' %
                  (acc_avg / test_batch_num, loss_avg / test_batch_num))

        saver = tf.train.Saver()
        saver.save(sess, CLASSIFIER_PATH)
        print('Saved trained model at %s ' % CLASSIFIER_PATH)