Example #1
def results(X_train, y_train, X_test, y_test, features="binary", D_in=200):

    print("\n  > Logistic Regression: ")
    # performs logistic regression
    log_reg = Classifier(X_train, y_train, model="log_reg")
    # determines the parameters used in the grid search
    hyperparams = {'C': [0.01, 1, 100], 'penalty': ['l1', 'l2']}
    # picks the best possible model using grid search
    log_reg.grid_search(hyperparams)
    # fully train the best model
    log_reg.fit()
    # tests the accuracy of the model
    log_reg.score(X_test, y_test)

    print("\n  > Linear SVM: ")
    # performs SVM
    Linear_SVM = Classifier(X_train, y_train, model="Linear_SVM")
    # determines the parameters used in the grid search
    hyperparams = {'C': [0.01, 1, 100]}
    # picks the best possible model using grid search
    Linear_SVM.grid_search(hyperparams)
    # fully train the best model
    Linear_SVM.fit()
    # tests the accuracy of the model
    Linear_SVM.score(X_test, y_test)

    if features == "binary":
        print("\n  > Bernoulli Naive Bayes SVM: ")
        # performs Bernoulli Naive Bayes SVM (NB-SVM)
        Bernoulli_NBSVM = Classifier(X_train, y_train, model="Bernoulli_NBSVM")
        # determines the parameters used in the grid search
        hyperparams = {'C': [0.01, 1, 100], 'beta': [0.25, 0.5, 0.75]}
        # picks the best possible model using grid search
        Bernoulli_NBSVM.grid_search(hyperparams)
        # fully train the best model
        Bernoulli_NBSVM.fit()
        # tests the accuracy of the model
        Bernoulli_NBSVM.score(X_test, y_test)

    if features == "sentence_embed":
        print("\n  > Feedforward NN:")
        # performs feedforward NN
        feedforward_NN = Classifier(X_train, y_train, "feedforward_NN", D_in)
        # determines the parameters used in the grid search
        #hyperparams = {'batch_size' : [128, 256, 512], 'epochs' : [10, 20, 50]}
        # picks the best possible model using grid search
        #feedforward_NN.grid_search(hyperparams)
        # fully train the best model
        feedforward_NN.fit()
        # tests the accuracy of the model
        feedforward_NN.score(X_test, y_test)

        print("\n  > Gaussian Naive Bayes: ")
        # performs Gaussian Naive Bayes
        Gaussian_NB = Classifier(X_train, y_train, model="Gaussian_NB")
        # determines the parameters used in the grid search
        hyperparams = {
            'priors': [None, (0.25, 0.75), (0.5, 0.5), (0.75, 0.25)]
        }
        # picks the best possible model using grid search
        Gaussian_NB.grid_search(hyperparams)
        # fully train the best model
        Gaussian_NB.fit()
        # tests the accuracy of the model
        Gaussian_NB.score(X_test, y_test)

        return (log_reg, Linear_SVM, Gaussian_NB)

    else:
        print("\n  > Multinomial Naive Bayes: ")
        # performs Multinomial Naive Bayes
        Multinomial_NB = Classifier(X_train, y_train, model="Multinomial_NB")
        # determines the parameters used in the grid search
        hyperparams = {
            'class_prior': [None, (0.25, 0.75), (0.5, 0.5), (0.75, 0.25)]
        }
        # picks the best possible model using grid search
        Multinomial_NB.grid_search(hyperparams)
        # fully train the best model
        Multinomial_NB.fit()
        # tests the accuracy of the model
        Multinomial_NB.score(X_test, y_test)

        return (log_reg, Linear_SVM, Multinomial_NB)
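For reference, the grid-search / refit / score flow used in each branch above can be reproduced with scikit-learn alone. The sketch below is a standalone illustration on assumed synthetic data; it calls GridSearchCV directly and is not the project's Classifier wrapper.

# Minimal sketch: grid search, refit the best model, then score on held-out data.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=500, n_features=20, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Logistic regression: search over C and penalty (liblinear supports both l1 and l2).
log_reg = GridSearchCV(LogisticRegression(solver='liblinear'),
                       {'C': [0.01, 1, 100], 'penalty': ['l1', 'l2']}, cv=5)
log_reg.fit(X_train, y_train)
print('Logistic Regression accuracy:', log_reg.score(X_test, y_test))

# Linear SVM: search over C only.
linear_svm = GridSearchCV(LinearSVC(), {'C': [0.01, 1, 100]}, cv=5)
linear_svm.fit(X_train, y_train)
print('Linear SVM accuracy:', linear_svm.score(X_test, y_test))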
Example #2
from classifiers import Classifier
from langdetect import detect

languages = {'en': 'english', 'fa': 'persian', 'ar': 'persian'}

textfa = '''عالیه!!'''  # Persian for "Great!!"

texten = '''though excessively tiresome , the uncertainty principle , as verbally pretentious as the title may be , has its handful of redeeming features , as long as you discount its ability to bore .'''
lang = detect(textfa)

print(lang)

classifier = Classifier([1], ['+', '-', '='], languages[lang])
# first read dictionary and weights from cache if not read from file
# classifier.hash_dictionary[languages[lang]] = cache[lang]

classifier.load_word_dictionary()
classifier.load_model_npy('perceptron-100-UNIGRAMS-0.4-persian')
# print(classifier.hash_dictionary)
print(classifier.predict_one(textfa))

lang = detect(texten)

print(lang)

classifier = Classifier([1], ['+', '-'], languages[lang])
# first read dictionary and weights from cache if not read from file
# classifier.hash_dictionary[languages[lang]] = cache[lang]
classifier.load_word_dictionary()
classifier.load_model_npy(
    'perceptron-128-UNIGRAMS-0.6_test_size-0.01_random_state-0_shuffle-english')
print(classifier.predict_one(texten))
Example #3
from classifiers import Classifier
classifier = Classifier([1], ['+', '-'], 'english')
# data = numpy.asarray([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]])

directory_of_file = 'polarityData/rt-polaritydata/rt-polarity-neg.txt'
classifier.read_text_file(directory_of_file, '-')
directory_of_file = 'polarityData/rt-polaritydata/rt-polarity-pos.txt'
classifier.read_text_file(directory_of_file, '+')
classifier.apply_feature_set()  # [1,2], [3]
sets = classifier.train_test_split(0.01)
# sets = [(classifier.all_feature_sets, sets[0][1])] # for extract model

classifier.set_patience(300)

# perceptron
k = 1
for train, test in sets:
    if '$' in classifier.selected_split_name:
        classifier.selected_split_name = str(
            k) + classifier.selected_split_name[1:]
        k += 1
    classifier.create_dictionary(train)
    classifier.save_word_dictionary()
    # print(classifier.hash_dictionary)
    for lr in [0.6]:  # [0.1, 0.2, 0.25, 0.4, 0.6, 0.75, 0.8, 1]
        classifier.create_weights_array(1)
        print('{} \tLearning Rate= {}\tSplit Set: {}'.format(
            'perceptron', lr, classifier.selected_split_name))
        classifier.train(
            train,
            test,
Example #4
def train_classifier():
    from classifiers import Classifier

    max_epoch = 50
    batch_size = 128
    imgsize = 32
    weight_decay = 0  # disabled
    num_classes = 10
    data_augmentation = False

    print('WARNING: data augmentation not implemented. ' + \
        'For better model performance, please use train_keras_classifier instead')
    response = input('Do you wish to continue? (y/N)')
    if response.lower() not in ['y', 'yes']: return

    if data_augmentation:
        pass
    else:
        (x_train, y_train), (x_test, y_test) = data_loader.load_original_data()
    data_train = tf.data.Dataset.from_tensor_slices((x_train, y_train))
    data_train = data_train.shuffle(50000).repeat().batch(batch_size)
    iter_train = data_train.make_initializable_iterator()

    x = tf.placeholder(tf.float32, [batch_size, imgsize, imgsize, 3])
    y_ = tf.placeholder(tf.float32, [batch_size, num_classes])

    regularizer = tf.contrib.layers.l2_regularizer(scale=weight_decay)
    with tf.variable_scope('conv') as scope:
        model = Classifier(x, regularizer, expand_dim=False)
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(logits=model.logits,
                                                   labels=y_))
    reg_loss = tf.losses.get_regularization_loss()
    loss += reg_loss

    eval_acc = accuracy(model.logits, y_)

    optimizer = tf.train.MomentumOptimizer(learning_rate=0.01,
                                           momentum=0.9,
                                           use_nesterov=True)
    optim_step = optimizer.minimize(loss=loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(iter_train.initializer)

    next_batch = iter_train.get_next()
    for n_epoch in range(max_epoch):
        for i in range(50000 // batch_size):
            batch = sess.run(next_batch)
            _, acc_val, loss_val = sess.run([optim_step, eval_acc, loss],
                                            feed_dict={
                                                x: batch[0],
                                                y_: batch[1]
                                            })
            if i % 100 == 0:
                print("Epoch: %d, Step: %d, Acc: %f, Loss: %f" %
                      (n_epoch, i, acc_val, loss_val))
        acc_avg = loss_avg = 0
        test_batch_num = len(y_test) // batch_size

        # validate on test set
        for i in range(test_batch_num):
            acc_val, loss_val = sess.run(
                [eval_acc, loss],
                feed_dict={
                    x: x_test[i * batch_size:(i + 1) * batch_size],
                    y_: y_test[i * batch_size:(i + 1) * batch_size]
                })
            acc_avg += acc_val
            loss_avg += loss_val
        print('Test accuracy: %f, loss: %f' %
              (acc_avg / test_batch_num, loss_avg / test_batch_num))

    saver = tf.train.Saver()
    saver.save(sess, CLASSIFIER_PATH)
    print('Saved trained model at %s ' % CLASSIFIER_PATH)
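The warning above points to a Keras-based alternative with data augmentation. The project's train_keras_classifier is not shown here; the following is only a minimal tf.keras (TF 2.x) sketch of a comparable CIFAR-10 setup (batch size 128, SGD with Nesterov momentum), and every layer choice in it is an illustrative assumption.

import tensorflow as tf

(x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0

model = tf.keras.Sequential([
    tf.keras.Input(shape=(32, 32, 3)),
    # simple augmentation, active only during training
    tf.keras.layers.RandomFlip('horizontal'),
    tf.keras.layers.RandomTranslation(0.1, 0.1),
    tf.keras.layers.Conv2D(32, 3, activation='relu'),
    tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Conv2D(64, 3, activation='relu'),
    tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(10),  # logits, as in the TF1 example above
])

model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9,
                                                nesterov=True),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=128, epochs=50,
          validation_data=(x_test, y_test))
model.save('classifier_keras.h5')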
Example #5
from classifiers import Classifier
classifier = Classifier([1], ['+', '-', '='], 'persian')

directory = 'Persian_Comments/'

classifier.read_excel_file(directory, ['comment', 'orientation'])
classifier.apply_feature_set()  # [1,2], [3]
sets = classifier.train_test_split(0.005)
# sets = [(classifier.all_feature_sets, classifier.all_feature_sets[0])] #for extract model

classifier.set_patience(300)

# perceptron
k = 1
for train, test in sets:
    if '$' in classifier.selected_split_name:
        classifier.selected_split_name = str(
            k) + classifier.selected_split_name[1:]
        k += 1
    classifier.create_dictionary(train)
    classifier.save_word_dictionary()

    for lr in [0.1, 0.2, 0.25, 0.4, 0.6, 0.75, 0.8, 1]:
        classifier.create_weights_array(1)
        print('{} \tLearning Rate= {}\tSplit Set: {}'.format(
            'perceptron', lr, classifier.selected_split_name))
        classifier.train(
            train,
            test,
            classifier='perceptron',
Example #6
def cal_point_type(x_train, x_val, y_train, y_val):
    """
    Calculates the misclassified values of each classifier

    Parameters:
        x_train: array-like, shape (n_train_samples, n_features)
        x_val: array-like, shape (n_val_samples, n_features)
        y_train: array-like of length n_train_samples
        y_val: array-like of length n_val_samples

    Returns:
        outliers: array-like
        point_score: dictionary
        easy_points: array-like
    """
    # list of all classifiers
    classifiers = [
        "KNeighbors",
        "Random_Forest",
        "svm_classifier",
        "Gaussian",
        "Decision_Tree",
        "Logistic_Reg",
    ]
    model = Classifier()

    # create dictionaries to store misclassified values for each classifier
    err_indexes = {}
    correct_indexes = {}
    wt = {}

    for clf in classifiers:
        train_clf = getattr(model, clf)(x_train, y_train)
        y_score = train_clf.predict(x_val)
        # get the indexes of misclassified values
        err_indexes[clf] = np.where(y_score != y_val)
        correct_indexes[clf] = np.where(y_score == y_val)
        # associate wt to each model, based on its accuracy
        acc = accuracy_score(y_val, y_score)
        wt[clf] = 1 / (1 - np.power(acc, 2))

    # calculating outliers
    outliers = err_indexes["KNeighbors"]

    for clf in classifiers:
        outliers = np.intersect1d(outliers, err_indexes[clf])

    # calculating points with trivial info: the points misclassified by a model
    # with high accuracy are a subset of those misclassified by less accurate models
    # print('Points associated with each model :', wt)

    # classified correctly by k-NN but not by random forest
    s1 = wt["KNeighbors"] - wt["Random_Forest"]
    pt1 = np.intersect1d(correct_indexes["KNeighbors"],
                         err_indexes["Random_Forest"])

    # classified correctly by random forest but not by decision tree
    s2 = wt["Random_Forest"] - wt["Decision_Tree"]
    pt2 = np.intersect1d(correct_indexes["Random_Forest"],
                         err_indexes["Decision_Tree"])

    # classified correctly by decision tree but not by logistic regression
    s3 = wt["Decision_Tree"] - wt["Logistic_Reg"]
    pt3 = np.intersect1d(correct_indexes["Decision_Tree"],
                         err_indexes["Logistic_Reg"])

    # classified correctly by logistic regression but not by the Gaussian classifier
    s4 = wt["Logistic_Reg"] - wt["Gaussian"]
    pt4 = np.intersect1d(correct_indexes["Logistic_Reg"],
                         err_indexes["Gaussian"])

    # classified correctly by the Gaussian classifier but not by SVM
    s5 = wt["Gaussian"] - wt["svm_classifier"]
    pt5 = np.intersect1d(correct_indexes["Gaussian"],
                         err_indexes["svm_classifier"])

    point_score = {
        "p1": (s1, pt1),
        "p2": (s2, pt2),
        "p3": (s3, pt3),
        "p4": (s4, pt4),
        "p5": (s5, pt5),
    }

    # calculating easy points
    easy_points = correct_indexes["KNeighbors"]
    for clf in classifiers:
        easy_points = np.intersect1d(easy_points, correct_indexes[clf])

    return outliers, point_score, easy_points
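A hypothetical way to exercise cal_point_type above, assuming it is defined in the same module: the stand-in Classifier below only mirrors the method names expected by the classifiers list using plain scikit-learn estimators (the project's real Classifier is not reproduced here), and the synthetic data is purely for illustration.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


class Classifier:
    """Stand-in wrapper: each method fits an estimator and returns it."""
    def KNeighbors(self, X, y): return KNeighborsClassifier().fit(X, y)
    def Random_Forest(self, X, y): return RandomForestClassifier(random_state=0).fit(X, y)
    def svm_classifier(self, X, y): return SVC().fit(X, y)
    def Gaussian(self, X, y): return GaussianNB().fit(X, y)
    def Decision_Tree(self, X, y): return DecisionTreeClassifier(random_state=0).fit(X, y)
    def Logistic_Reg(self, X, y): return LogisticRegression(max_iter=1000).fit(X, y)


X, y = make_classification(n_samples=400, n_features=10, flip_y=0.1, random_state=0)
x_train, x_val, y_train, y_val = train_test_split(X, y, random_state=0)
outliers, point_score, easy_points = cal_point_type(x_train, x_val, y_train, y_val)
print('outliers:', len(outliers), ' easy points:', len(easy_points))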
Example #7
"""

from extract_graph_features import GraphFeatures
from preprocessing import Preprocess
from sample import Sample
from classifiers import Classifier
from modelEvaluation import ModelEvaluation
from preprocessed_graph import PreprocessGraph

#Create the objects
gf = GraphFeatures()
pp = Preprocess()
ppg = PreprocessGraph()
sample = Sample()
classifier = Classifier()
me = ModelEvaluation()

#Only for one time
#gf.extractGraphFeatures()

choice = int(
    input(
        "Enter 0 for data without graph features and 1 for graph features: "))
#Preprocess the data
if choice == 0:
    x_train, x_test, y_train, y_test = pp.preprocessed_data()
else:
    print("Extracting graph features: ")
    x_train, x_test, y_train, y_test = ppg.scale_data()
    print(x_train.head())
Example #8
def main(argv):
    # fake_scores = fake_score_generator(death_dataset)
    # death_dataset = add_feature(death_dataset, fake_scores)
    # names = {}
    # names_dict = create_name_dict(names_list)
    # filename = 'death'
    # names_list = get_file_names(filename)

    count_flag = False
    gender_flag = False
    proximity_flag = False
    tf_idf_flag = False
    graph_flag = False
    run_file_flag = False
    ablation_flag = False
    best_subset_flag = False
    graph = None  # avoids a NameError below when no graph-related flag is set

    books_inverted_index = Index()
    # do a fresh indexing and save
    # books_inverted_index.add_all_books()
    # save_index(books_inverted_index)
    # or load from previous indexing

    if len(sys.argv) > 1:

        for arg in sys.argv[1:]:
            if arg == 'index_books':
                # do a fresh indexing and save
                books_inverted_index.add_all_books()

                save_index(books_inverted_index)

            elif arg == 'load_books':
                print(
                    "loading books directly as inverted_index object into the program"
                )
                books_inverted_index = load_index()

            elif arg == 'count_features':
                count_flag = True

            elif arg == 'gender_feature':
                gender_flag = True

            elif arg == 'proximity_feature':
                proximity_flag = True

            elif arg == 'tf_idf':
                tf_idf_flag = True

            elif arg == 'ablation':
                ablation_flag = True

            elif arg == 'best_subset':
                best_subset_flag = True

            elif arg == 'graph_feature':
                graph_flag = True
                graph = Graph()

            elif arg == 'all_features':
                count_flag = True
                gender_flag = True
                proximity_flag = True
                tf_idf_flag = True
                graph_flag = True
                graph = Graph()

            elif arg == 'run_file':
                run_file_flag = True

            elif arg == 'quick':
                books_inverted_index = load_index()
                count_flag = True
                gender_flag = True
                proximity_flag = True
                tf_idf_flag = False
                graph_flag = False
                graph = Graph()

            else:
                sys.exit("Wrong usage!")

    else:
        books_inverted_index = load_index()
        count_flag = True
        gender_flag = True
        proximity_flag = True
        tf_idf_flag = True
        graph_flag = True
        graph = Graph()

    classifier = Classifier()
    classifier.read_separate_train_test_files(evaluate=True)
    # classifier.split_data()

    # reading names for training and test sets
    training_names = classifier.get_names(training=True)
    test_names = classifier.get_names(test=True)

    # creating features for the training set
    features_index, training_features = create_features(
        training_names, books_inverted_index, graph, count_flag, gender_flag,
        proximity_flag, tf_idf_flag, graph_flag)
    # creating features for the test set
    features_index, test_features = create_features(
        test_names, books_inverted_index, graph, count_flag, gender_flag,
        proximity_flag, tf_idf_flag, graph_flag)

    classifier.set_features(training_features, test_features)
    classifier.save_features()

    y_pred_log = classifier.logistic_regression()
    # classifier.svc_polynomial()
    # classifier.svc_guassian_kernel()
    y_pred_svc = classifier.svc_sigmoid()
    y_pred_dt = classifier.decision_tree()
    y_pred_knn = classifier.k_nearest_neighbors()
    y_pred_nb = classifier.naive_base()

    # create run files from each classifier's predictions
    if run_file_flag:
        classifier.make_new_run_file(y_pred_dt, 'dt')
        classifier.make_new_run_file(y_pred_log, 'logit')
        classifier.make_new_run_file(y_pred_svc, 'svc')
        classifier.make_new_run_file(y_pred_knn, 'knn')
        classifier.make_new_run_file(y_pred_nb, 'naive')

    # classifier.feature_selection()

    classifier.plot_f1_scores(classifier.method_name,
                              classifier.f_scores,
                              plot_title='Death Prediction',
                              file_name='f1_scores')

    y_pred_list = [y_pred_log, y_pred_svc, y_pred_dt, y_pred_knn, y_pred_nb]

    classifier.plot_with_error_bars('death', y_pred_list,
                                    classifier.method_name, 'Death Prediction',
                                    'death_fscore_error')

    if gender_flag:
        gender_training_features = training_features[2]
        gender_test_features = test_features[2]
        classifier.evaluate_gender_prediction(gender_training_features,
                                              gender_test_features,
                                              print_flag=True)

    if ablation_flag:
        ablation_test(classifier, features_index, training_features,
                      test_features)

    if best_subset_flag:
        best_subset_selection(classifier, training_features, test_features)
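Since main() reads its options from sys.argv, a typical invocation might look like the sketch below; the script name and the flag combinations shown are illustrative assumptions, not taken from the project.

if __name__ == '__main__':
    # e.g.:  python main.py load_books all_features run_file
    #        python main.py quick
    main(sys.argv)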
Example #9
def run(config,     num_batches,      batch_size,
        model_name, class_model_name, ofile,
        threshold,  num_workers,      epochs,
        multi_gans, gan_weights,      trunc_norm,
        fixed_dset, transform,        filter_samples):

    # Instantiating the generator
    config['G_batch_size'] = batch_size

    generator = GeneratorWrapper(config, model_name, trunc_norm, multi_gans, gan_weights)
    generator_fn = generator.gen_batch
    if gan_weights:
        print('Using GAN weights (multi-GAN setting): ', str(gan_weights))

    # Instantiating the filtering classifier
    if filter_samples:
        print('Using ResNet20 weights: %s.pth' % class_model_name)
        filter_net = Classifier('resnet20', config['n_classes'])
        filter_net.load(class_model_name)
        filter_fn = filter_net.filter
    else:
        filter_fn = None

    # Creating a filtered loader using the classifier
    num_classes = config['n_classes']
    loader = FilteredLoader(generator_fn,
                            filter_fn,
                            num_classes,
                            num_batches,
                            batch_size,
                            threshold,
                            num_workers,
                            fixed_dset,
                            transform)

    print('Training using %d generated images per epoch'
          % loader.train_length())

    # Creating a blank ResNet
    net = resnet20(config['n_classes'], width=64).to('cuda')

    # Initializing loss functions, optimizer, learning rate scheduler
    cross_entropy = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=0.1, momentum=0.9, weight_decay=0.0001)
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[100, 150])

    # Creating the test loader used to evaluate the model
    test_loader = utils.make_test_loader(config['dataset'],
                                         batch_size,
                                         transforms.Normalize(*utils.norm_vals))

    # Training the model
    t1 = utils.ctime()
    best_acc = 0.0
    for epoch in range(epochs):
        print('Epoch: %3d' % (epoch+1), end="  ")

        train(net, loader, batch_size, optimizer, cross_entropy)
        scheduler.step()

        acc = evaluate(net, test_loader)
        best_acc = max(acc, best_acc)
        loader.reset()
        print('Val acc: %4.2f %% ' % acc,
              ' | Best acc: %4.2f %%\n' % best_acc)

    tt = utils.ctime() - t1
    print('Finished training, total time: %4.2fs' % tt)
    print('Best accuracy achieved: %4.5f %%' % best_acc)

    # Saving output model
    output = './output/%s.pth' % ofile
    print('Saving trained classifier in %s' % output)
    torch.save(net.state_dict(), output)
""" This file maps the evaluation metrics for various splits in the K-fold space"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from classifiers import Classifier
from evaluation import evaluate
from dataloader import get_X_y
from IPython.display import HTML

""" for now KNeighbors will be used as it gave the highest accuracy """
model = Classifier()

columns = ['Accuracy %', 'Precision %', 'Recall', 'F1_Score']
df = pd.DataFrame(columns=['K_Fold'] + columns)

""" Cross Validation using K-Fold """
def KFold_validation(kf, split_no):
    X, y = get_X_y()
    n_splits = kf.get_n_splits(X)
    accuracy = precision = recall = f_score = 0.0
    for train, test in kf.split(X, y):
        X_train, X_test = X.iloc[train], X.iloc[test]
        y_train, y_test = y[train], y[test]

        classifier = model.KNeighbors(X_train, y_train)
        acc, prec, rec, f1, y_score = evaluate(classifier, X_test, y_test)
        # accumulate metrics across folds
        accuracy += acc
        precision += prec
        recall += rec
        f_score += f1

    # report the average over all folds
    return (accuracy / n_splits * 100, precision / n_splits * 100,
            recall / n_splits, f_score / n_splits)

""" Evaluation metric for different K-folds are shown in tabular format """
def tabulate_kfold():