def main(train_path, test_path):
    # Load the train/test splits and prepend an intercept column to the features.
    train_x, train_t = util.load_dataset(train_path, label_col='class')
    test_x, test_t = util.load_dataset(test_path, label_col='class')
    train_x_inter = util.add_intercept(train_x)
    test_x_inter = util.add_intercept(test_x)

    # Fit the logistic regression classifier on the training split.
    classifier_t = LogisticRegression(max_iter=10000,
                                      step_size=0.0001,
                                      verbose=False)
    classifier_t.fit(train_x_inter, train_t)

    # Report predicted probabilities against the true labels.
    pred_t_prob = classifier_t.predict(test_x_inter)
    for i in range(len(test_t)):
        print(f"Predicted value: {pred_t_prob[i]} True Value: {test_t[i]}")
Example #2
def report_average_cost(model, criterion, optimizer, list_seq_num, list_loss,
                        list_cost, config_obj):
    list_avg_loss = []
    list_avg_cost = []
    list_T_num = []
    for T in range(10, 110, 10):
        print("Evaluating {0} model on sequence size {1}".format(
            config['model_type'], T))
        config_obj.config_dict['num_batches'] = 1
        config_obj.config_dict['batch_size'] = 1
        seqs_loader = utility.load_dataset(config_obj, max=T, min=T)
        if config['model_type'] == 'LSTM':
            avg_loss, avg_cost = evaluate_lstm(model, criterion, optimizer,
                                               seqs_loader, config)
        else:
            avg_loss, avg_cost = evaluate_ntm(model, criterion, optimizer,
                                              seqs_loader, config)
        list_avg_loss.append(avg_loss)
        list_avg_cost.append(avg_cost)
        list_T_num.append(T)
    saveCheckpoint(model,
                   list_T_num,
                   list_avg_loss,
                   list_avg_cost,
                   path="{0}_Ts".format(config['filename']))
    model, list_T_num, list_avg_loss, list_avg_cost = loadCheckpoint(
        path="{0}_Ts".format(config['filename']))
    plt.plot(list_T_num, list_avg_cost)
    plt.xlabel('T')
    plt.ylabel('average cost')
    plt.savefig('{0}_average_cost.pdf'.format(config['filename']))
Example #3
def question_3():
    logger.info(
        "EXECUTING: QUESTION 3 - Recommendation Systems with threshold limits")
    data, R_mat, _ = utility.load_dataset()  # load the dataset
    utility.perform_cross_validation(
        data, R_mat,
        threshold=True)  # perform cross validation with threshold limit
def question_4():
    logger.info("EXECUTING: QUESTION 4 - Recommendation Systems with Regularization")
    data, R_mat, W_mat = utility.load_dataset()
    logger.info("R & W Matrix - Interchanged")
    utility.compute_least_squared_error(W_mat, R_mat)  # interchange R & W matrix

    logger.info("R & W Matrix - Regularization")
    utility.compute_least_squared_error(R_mat, W_mat, regularized=True)
def question_c():
    logger.info("EXECUTING: QUESTION C")
    # get training data for every category and terms with their frequency for every class
    all_categories = train_all_dataset.target_names

    freq_words_all_categories = []
    words_all_categories = []
    all_data_category = []
    words_in_classes = defaultdict(list)

    find_for_classes_list = [train_all_dataset.target_names.index("comp.sys.ibm.pc.hardware"),
                             train_all_dataset.target_names.index("comp.sys.mac.hardware"),
                             train_all_dataset.target_names.index("misc.forsale"),
                             train_all_dataset.target_names.index("soc.religion.christian")]

    logger.info("Collecting data for each category")

    for category in all_categories:
        train_category = utility.load_dataset([category])[0]
        data_category = train_category.data
        temp = ''
        for document in data_category:
            temp += ' ' + document
        all_data_category.append(temp)

    logger.info("Cleaning Data and Forming Frequency List for each Class")

    # pre-process all the docs
    for data, pos in zip(all_data_category, range(len(all_data_category))):
        logger.info("Forming Frequency List for Class: {}".format(train_all_dataset.target_names[pos]))
        processed_data = utility.preprocess_data(data)
        count = Counter(processed_data)
        freq_words_all_categories.append(count)
        unique_words = set(processed_data)
        words_all_categories.append(list(unique_words))
        for word in unique_words:
            words_in_classes[word].append(train_all_dataset.target_names[pos])

    # calculating tf-icf
    for category in find_for_classes_list:
        logger.info("Fetching top 10 significant terms for class: {}".format(train_all_dataset.target_names[category]))
        terms_of_class = words_all_categories[category]
        freq_of_all_terms = freq_words_all_categories[category]
        number_of_terms = len(terms_of_class)
        tficf = {}
        for each_term in range(number_of_terms):
            term = terms_of_class[each_term]  # term for which we are finding tf-icf
            frequency_of_term = freq_of_all_terms.get(term)
            number_of_class_with_term = len(words_in_classes[term])  # number of classes with term t
            # tf-icf for term t
            calc = 0.5 + ((0.5 * frequency_of_term / number_of_terms) * math.log(len(train_all_dataset.target_names) / number_of_class_with_term))
            tficf[term] = calc

        # print the top 10 significant terms for this class
        significant_terms = dict(sorted(tficf.items(), key=operator.itemgetter(1), reverse=True)[:10])  # get 10 significant terms

        logger.info(significant_terms.keys())
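# A minimal standalone sketch of the tf-icf weighting used in question_c() above,
# run on made-up counts. All names and numbers below are illustrative assumptions,
# not values from the 20-newsgroups data; the formula matches the one in the code:
# tficf(t, c) = 0.5 + 0.5 * (tf_tc / |terms_c|) * log(C / c_t).
import math

num_classes = 20                                            # total number of classes C
class_term_freqs = {'disk': 14, 'price': 9, 'faith': 2}     # term frequencies within one class
num_terms_in_class = 25                                     # number of distinct terms in that class
classes_containing = {'disk': 3, 'price': 12, 'faith': 5}   # number of classes containing each term (c_t)

tficf_toy = {}
for term, tf in class_term_freqs.items():
    icf = math.log(num_classes / classes_containing[term])
    tficf_toy[term] = 0.5 + 0.5 * (tf / num_terms_in_class) * icf

# rank terms by tf-icf, highest first
for term, score in sorted(tficf_toy.items(), key=lambda kv: kv[1], reverse=True):
    print(term, round(score, 3))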
Example #6
def visualize_read_write(model, criterion, optimizer, config_obj):
    T = 10
    config_obj.config_dict['num_batches'] = 20
    config_obj.config_dict['batch_size'] = 1
    seqs_loader = utility.load_dataset(config_obj, max=T, min=T)

    for batch_num, X, Y, act in seqs_loader:
        result = evaluate_single_batch(model, criterion, X, Y)
        plot_visualization(X, result, model.N)
Example #7
def question_4():
    logger.info(
        "EXECUTING: QUESTION 4 - Recommendation Systems with Regularization")
    data, R_mat, W_mat = utility.load_dataset()
    logger.info("R & W Matrix - Interchanged")
    utility.compute_least_squared_error(W_mat,
                                        R_mat)  # interchange R & W matrix

    logger.info("R & W Matrix - Regularization")
    utility.compute_least_squared_error(R_mat, W_mat, regularized=True)
def question_i():
    logger.info("EXECUTING: QUESTION I")
    logger.info("Multi-Class Classification")

    category = ['comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'misc.forsale', 'soc.religion.christian']
    train, test = utility.load_dataset(category)

    logger.info("Processing Training Dataset")
    for data, pos in zip(train.data, range(len(train.data))):
        processedData = utility.preprocess_data(data)
        train.data[pos] = ' '.join(processedData)

    logger.info("Processing Testing Dataset")
    for data, pos in zip(test.data, range(len(test.data))):
        processedData = utility.preprocess_data(data)
        test.data[pos] = ' '.join(processedData)

    logger.info("Creating TFxIDF Vector Representations")

    stop_words = text.ENGLISH_STOP_WORDS  # omit stop words

    # using CountVectorizer and TFxIDF Transformer
    count_vect = CountVectorizer(stop_words=stop_words, lowercase=True)
    train_counts = count_vect.fit_transform(train.data)
    test_counts = count_vect.transform(test.data)
    tfidf_transformer = TfidfTransformer(norm='l2', sublinear_tf=True)
    train_idf = tfidf_transformer.fit_transform(train_counts)
    test_idf = tfidf_transformer.transform(test_counts)

    logger.info("Performing LSI on TFxIDF Matrices")
    # apply LSI to TDxIDF matrices
    svd = TruncatedSVD(n_components=50)
    train_lsi = svd.fit_transform(train_idf)
    test_lsi = svd.transform(test_idf)

    logger.info("TFxIDF Matrices Transformed")

    logger.info("Size of Transformed Training Dataset: {0}".format(train_lsi.shape))
    logger.info("Size of Transformed Testing Dataset: {0}".format(test_lsi.shape))

    clf_list = [OneVsOneClassifier(GaussianNB()), OneVsOneClassifier(svm.SVC(kernel='linear')), OneVsRestClassifier(GaussianNB()), OneVsRestClassifier(svm.SVC(kernel='linear'))]
    clf_name = ['OneVsOneClassifier Naive Bayes', 'OneVsOneClassifier SVM', 'OneVsRestClassifier Naive Bayes', 'OneVsRestClassifier SVM']

    # perform classification
    for clf, clf_n in zip(clf_list, clf_name):
        logger.info("Training {0} Classifier ".format(clf_n))
        clf.fit(train_lsi, train.target)
        logger.info("Testing {0} Classifier".format(clf_n))
        test_predicted = clf.predict(test_lsi)
        utility.calculate_statistics(test.target, test_predicted)
Example #9
def train_model(model, config, criterion, optimizer):
    losses = 0
    iter = 0
    # record the performance for this epoch
    train_data, valid_data, test_data = utility.load_dataset("mnist.pkl", config)
    record_performance(train_data, model, criterion, "train")
    record_performance(valid_data, model, criterion, "valid")
    record_performance(test_data, model, criterion, "test")
    for epoch in range(config.num_epochs):
        # iterate over batches
        # the shape of train_data[0] must be 500 x 100 x 784
        # the shape of train_data[1] must be 500 x 100
        for i in range(train_data[0].shape[0]):
            optimizer.zero_grad()
            x = Variable(torch.from_numpy(train_data[0][i]))
            y = Variable(torch.from_numpy(train_data[1][i]))
            # compute loss
            loss = criterion(model(x), y)
            # compute gradients
            loss.backward()
            # take one SGD step to update the parameters
            optimizer.step()
        # record the performance for this epoch
        train_loss, train_acc = record_performance(train_data, model, criterion, "train")
        valid_loss, valid_acc = record_performance(valid_data, model, criterion, "valid")
        test_loss, test_acc = record_performance(test_data, model, criterion, "test")
        # (optional) print the per-epoch results
        # print("Epoch {0} \nLoss : {1:.3f} \nAcc : {2:.3f}".format(epoch, train_loss, train_acc))
    # report the final results
    if int(config.filename.split("Q1_")[1]) < 6:
        print("Validation Results:\nLoss : {0:.3f} Acc : {1:.3f}".format(valid_loss, valid_acc))
        data_to_plot = (records["train"], records["valid"])
    #    data_to_plot = (records["train"])
        utility.plot_sample_data(data_to_plot, config, "Training Loss using {0} initialization".format(config.init_type), True)
    #    utility.plot_sample_data(data_to_plot, config, "Training Loss using {0} initialization".format(config.init_type), True, True)
    else:

        max_indx = np.argmax(list(records["valid"][1]))
        train_acc_data = list(records["train"][1])[max_indx]
        test_acc_data = list(records["test"][1])[max_indx]
        print("Best validation set was at epoch {0}".format(max_indx))
        print("Generalization Gap : {0} - {1} = {2}".format(train_acc_data, test_acc_data, train_acc_data-test_acc_data))
Example #10
def create_labeled_data():
    """
    Creates the labeled data set.
    First we load the unlabeled data with utility.py
    Then applying labeling and transformation functions.
    """
    df_train, sentences_number = utility.load_dataset()

    df_dev = pd.read_csv(r'data\dev_22.12.csv')
    df_train.to_csv(r'data\df_train.csv', index=False)

    df_train_labeled = apply_lf_on_data(df_train, df_dev, sentences_number)
    df_train_labeled.to_csv(r'data\labeled_data.csv', index=False)

    augmented = apply_tf_on_data(df_train_labeled)

    # Splitting to test and train:
    df_train_augmented, df_test = train_test_split(augmented, test_size=TEST_RATIO)
    df_test.to_csv(r'data\df_test.csv', index=False)
    df_train_augmented.to_csv(r'data\labeled_data_augmented.csv', index=False)

    return df_train_labeled, df_train_augmented, df_test
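# Minimal usage sketch (an assumption about how the function is driven; the
# actual entry point lives elsewhere in that repo): build the labeled and
# augmented splits and report their sizes.
if __name__ == '__main__':
    df_labeled, df_augmented, df_test = create_labeled_data()
    print('labeled: {}, augmented train: {}, test: {}'.format(
        len(df_labeled), len(df_augmented), len(df_test)))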
Example #11
def main(args):
    print >> sys.stderr, "Running Autumn NER model training module"
    print >> sys.stderr, args
    random.seed(args.seed)

    trainset = []
    devset = []

    print >> sys.stderr, "Loading dataset.."
    assert (os.path.isdir(args.datapath))

    word_vocab = []
    for fname in sorted(os.listdir(args.datapath)):
        if os.path.isdir(fname):
            continue

        if fname.endswith('train.ner.txt') or fname.endswith('dev.ner.txt'):
            dataset, vocab = load_dataset(os.path.join(args.datapath, fname))
            word_vocab += vocab

            if fname.endswith('train.ner.txt'):
                trainset += dataset
            if fname.endswith('dev.ner.txt'):
                devset += dataset

            print >> sys.stderr, "Loaded {} instances with a vocab size of {} from {}".format(
                len(dataset), len(vocab), fname)

    if args.embeddings_path:
        embeddings = load_embeddings(args.embeddings_path, word_vocab, 300)
    else:
        embeddings = None

    print "Loaded {} instances from data set".format(len(trainset))

    random.shuffle(trainset)

    X_train, y_train = zip(*trainset)
    X_dev, y_dev = zip(*devset)
    print "Training on {}, tuning on {}".format(len(X_train), len(X_dev))

    labels = []
    for lb in y_train + y_dev:
        labels += lb

    if os.path.exists('./saved_model'):
        os.rename('./saved_model',
                  './scratch/saved_model_{}'.format(time.time()))

    os.mkdir('./saved_model')

    word_vocab = sorted(set(word_vocab))
    with open(os.path.join('saved_model', 'word_vocab.pickle'), 'w') as f:
        pickle.dump(word_vocab, f)

    labels = sorted(set(labels))
    with open(os.path.join('saved_model', 'label_space.pickle'), 'w') as f:
        pickle.dump(labels, f)

    # Create the model, passing in relevant parameters
    bilstm = AutumnNER(labels=labels,
                       word_vocab=word_vocab,
                       word_embeddings=embeddings,
                       optimizer=args.optimizer,
                       embedding_size=300,
                       char_embedding_size=32,
                       lstm_dim=200,
                       num_cores=args.num_cores,
                       embedding_factor=args.embedding_factor,
                       learning_rate=args.learning_rate,
                       decay_rate=args.decay_rate,
                       dropout_keep=args.keep_prob)

    if not os.path.exists('./scratch'):
        os.mkdir('./scratch')

    print "Training.."
    bilstm.fit(X_train,
               y_train,
               X_dev,
               y_dev,
               num_epoch=args.num_epoch,
               batch_size=args.batch_size,
               seed=args.seed)

    bilstm.save('./saved_model/main')
    print "Training complete"

    print "Reporting performance on devset.."
    report_performance(bilstm, X_dev, y_dev, 'saved_model/devset_outcome.txt')
Example #12
if __name__ == '__main__':
    # pdb.set_trace()
    args = utility.parse_args()
    config_type = args['configtype']
    config_file = args['configfile']
    load_checkpoint = args['load_checkpoint']
    plot_all_average_flag = args['plot_all_average']
    visualize_read_write_flag = args['visualize_read_write']
    if plot_all_average_flag:
        plot_all_average_costs()
    else:
        config_obj = config.Configuration(config_type, config_file)
        config = config_obj.config_dict
        model, criterion, optimizer = models.build_model(config)
        seqs_loader = utility.load_dataset(config_obj)
        if visualize_read_write_flag:
            model, list_seq_num, list_loss, list_cost = loadCheckpoint(
                path=config['filename'])
            visualize_read_write(model, criterion, optimizer, config_obj)
        else:
            if not load_checkpoint:
                str_info = "{0}number of parameters: {1}\n".format(
                    config_obj.get_config_str(), model.calculate_num_params())
                print(str_info)
            if config['model_type'] == "LSTM":
                if not load_checkpoint:
                    list_seq_num, list_loss, list_cost = train_lstm_model(
                        config, model, criterion, optimizer, seqs_loader)
                    report_result(model, criterion, optimizer, list_seq_num,
                                  list_loss, list_cost, config_obj,
def question_2():
    logger.info("EXECUTING: QUESTION 2 - 10-fold Cross-Validation on Recommendation System")
    data, R_mat, _ = utility.load_dataset()  # load the dataset
    utility.perform_cross_validation(data, R_mat)  # perform cross validation
def question_i():
    logger.info("EXECUTING: QUESTION I")
    logger.info("Multi-Class Classification")

    category = [
        'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'misc.forsale',
        'soc.religion.christian'
    ]
    train, test = utility.load_dataset(category)

    logger.info("Processing Training Dataset")
    for data, pos in zip(train.data, range(len(train.data))):
        processedData = utility.preprocess_data(data)
        train.data[pos] = ' '.join(processedData)

    logger.info("Processing Testing Dataset")
    for data, pos in zip(test.data, range(len(test.data))):
        processedData = utility.preprocess_data(data)
        test.data[pos] = ' '.join(processedData)

    logger.info("Creating TFxIDF Vector Representations")

    stop_words = text.ENGLISH_STOP_WORDS  # omit stop words

    # using CountVectorizer and TFxIDF Transformer
    count_vect = CountVectorizer(stop_words=stop_words, lowercase=True)
    train_counts = count_vect.fit_transform(train.data)
    test_counts = count_vect.transform(test.data)
    tfidf_transformer = TfidfTransformer(norm='l2', sublinear_tf=True)
    train_idf = tfidf_transformer.fit_transform(train_counts)
    test_idf = tfidf_transformer.transform(test_counts)

    logger.info("Performing LSI on TFxIDF Matrices")
    # apply LSI to TDxIDF matrices
    svd = TruncatedSVD(n_components=50)
    train_lsi = svd.fit_transform(train_idf)
    test_lsi = svd.transform(test_idf)

    logger.info("TFxIDF Matrices Transformed")

    logger.info("Size of Transformed Training Dataset: {0}".format(
        train_lsi.shape))
    logger.info("Size of Transformed Testing Dataset: {0}".format(
        test_lsi.shape))

    clf_list = [
        OneVsOneClassifier(GaussianNB()),
        OneVsOneClassifier(svm.SVC(kernel='linear')),
        OneVsRestClassifier(GaussianNB()),
        OneVsRestClassifier(svm.SVC(kernel='linear'))
    ]
    clf_name = [
        'OneVsOneClassifier Naive Bayes', 'OneVsOneClassifier SVM',
        'OneVsRestClassifier Naive Bayes', 'OneVsRestClassifier SVM'
    ]

    # perform classification
    for clf, clf_n in zip(clf_list, clf_name):
        logger.info("Training {0} Classifier ".format(clf_n))
        clf.fit(train_lsi, train.target)
        logger.info("Testing {0} Classifier".format(clf_n))
        test_predicted = clf.predict(test_lsi)
        utility.calculate_statistics(test.target, test_predicted)
def question_c():
    logger.info("EXECUTING: QUESTION C")
    # get training data for every category and terms with their frequency for every class
    all_categories = train_all_dataset.target_names

    freq_words_all_categories = []
    words_all_categories = []
    all_data_category = []
    words_in_classes = defaultdict(list)

    find_for_classes_list = [
        train_all_dataset.target_names.index("comp.sys.ibm.pc.hardware"),
        train_all_dataset.target_names.index("comp.sys.mac.hardware"),
        train_all_dataset.target_names.index("misc.forsale"),
        train_all_dataset.target_names.index("soc.religion.christian")
    ]

    logger.info("Collecting data for each category")

    for category in all_categories:
        train_category = utility.load_dataset([category])[0]
        data_category = train_category.data
        temp = ''
        for document in data_category:
            temp += ' ' + document
        all_data_category.append(temp)

    logger.info("Cleaning Data and Forming Frequency List for each Class")

    # pre-process all the docs
    for data, pos in zip(all_data_category, range(len(all_data_category))):
        logger.info("Forming Frequency List for Class: {}".format(
            train_all_dataset.target_names[pos]))
        processed_data = utility.preprocess_data(data)
        count = Counter(processed_data)
        freq_words_all_categories.append(count)
        unique_words = set(processed_data)
        words_all_categories.append(list(unique_words))
        for word in unique_words:
            words_in_classes[word].append(train_all_dataset.target_names[pos])

    # calculating tf-icf
    for category in find_for_classes_list:
        logger.info("Fetching top 10 significant terms for class: {}".format(
            train_all_dataset.target_names[category]))
        terms_of_class = words_all_categories[category]
        freq_of_all_terms = freq_words_all_categories[category]
        number_of_terms = len(terms_of_class)
        tficf = {}
        for each_term in range(number_of_terms):
            term = terms_of_class[
                each_term]  # term for which we are finding tf-icf
            frequency_of_term = freq_of_all_terms.get(term)
            number_of_class_with_term = len(
                words_in_classes[term])  # number of classes with term t
            # tficf for term t
            calc = 0.5 + (
                (0.5 * frequency_of_term / number_of_terms) * math.log(
                    len(train_all_dataset.target_names) /
                    number_of_class_with_term))
            tficf[term] = calc

        # print top 10 significant term for this class
        significant_terms = dict(
            sorted(tficf.items(), key=operator.itemgetter(1),
                   reverse=True)[:10])  # get 10 significant terms

        logger.info(significant_terms.keys())
def question_1():
    logger.info("EXECUTING: QUESTION 1 - Least Square Factorization")
    data, R_mat, W_mat = utility.load_dataset()  # load the dataset
    utility.compute_least_squared_error(R_mat, W_mat)  # compute the least squared error without regularization
            plt.subplot(n, n, j + 1)
            feature = subset[subset.columns[j]]

            if y is None:   # Plot the feature distribution (full)
                sns.distplot(feature)
            else:   # Plot feature distributions separated by class
                for label in labels:
                    feature_label = feature[y == label]
                    sns.distplot(feature_label, label=label, hist_kws={"alpha": 0.4})
                plt.legend()


if __name__ == "__main__":
    utility.setup_logging(params['results_dir'])

    data_df = utility.load_dataset(params['data_file'])

    X_all = data_df.drop(columns=['Series', 'Class'], errors='ignore')
    y_all = data_df['Class']

    plot_class_distribution(y_all)

    X_pca_2d = do_pca(X_all, num_components=2)

    plot_dataset_2d(X_pca_2d, y_all)

    plot_correlation_matrix(X_all)

    plot_feature_distributions_nxn_grid(X_all, n=3)

    plot_feature_distributions_nxn_grid(X_all, y_all, n=3)
Example #18
# -*- coding: utf-8 -*-
"""
Created on Thu Mar  1 11:41:39 2018

@author: jaydeep thik
"""

import tensorflow as tf
import numpy as np
import math
import h5py
import matplotlib.pyplot as plt
from PIL import Image
from utility import load_dataset, encode_one_hot

X_train, X_test, y_train, y_test, classes = load_dataset()

#plt.imshow(X_train[0])
#print(y_train[:,0])

X_train, X_test = X_train / 255., X_test / 255.
y_train = encode_one_hot(y_train, len(classes))
y_test = encode_one_hot(y_test, len(classes))


def random_mini_batches(X, Y, mini_batch_size=64, seed=0):

    m = X.shape[0]
    mini_batches = []

    permutation = list(np.random.permutation(m))
Example #19
def question_5():
    logger.info("EXECUTING: QUESTION 5 - Recommendation System")
    data, R_mat, W_mat = utility.load_dataset()  # load the dataset
    utility.perform_cross_validation_question5(data, R_mat, W_mat)
def question_3():
    logger.info("EXECUTING: QUESTION 3 - Recommendation Systems with threshold limits")
    data, R_mat, _ = utility.load_dataset()  # load the dataset
    utility.perform_cross_validation(data, R_mat, threshold=True)  # perform cross validation with threshold limit
from sklearn import svm
import logging as logger
from collections import Counter
from sklearn import cross_validation
from sklearn.feature_extraction import text
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

logger.basicConfig(level=logger.INFO, format='%(message)s')

categories = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
              'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey']

train_dataset, test_dataset = utility.load_dataset(categories)  # load CT & RA dataset

# combine the classes
processed_train_dataset = copy.deepcopy(train_dataset)
processed_test_dataset = copy.deepcopy(test_dataset)

def combine_classes():
    for i, j in enumerate(processed_train_dataset.target):
        if j >= 0 and j < 4:
            processed_train_dataset.target[i] = 0
        else:
            processed_train_dataset.target[i] = 1

    for i, j in enumerate(processed_test_dataset.target):
        if j >= 0 and j < 4:
            processed_test_dataset.target[i] = 0
Example #22
        test_accuracy = test_model(test_loader)
        scheduler.step()
        # import ipdb as pdb; pdb.set_trace()
        print(
            '[{0}] Test Accuracy of the model on the 10000 test images: {1} , lr:{2}, loss:{3}'
            .format(epoch, test_accuracy, get_lr(optimizer), loss.item()))
        # print('Test Accuracy of the model on the 10000 test images: {0}'.format(test_accuracy))


if __name__ == '__main__':
    args = utility.parse_args()
    model_type = args['modelype']
    config_file = args['configfile']
    config = config.Configuration(model_type, config_file)
    print(config.get_config_str())
    config = config.config_dict
    model, criterion, optimizer, scheduler = build_model(config)
    # import ipdb as pdb; pdb.set_trace()
    if torch.cuda.is_available():
        model = model.cuda()
    train_loader, test_loader, train_dataset, test_dataset = utility.load_dataset(
        config)
    if config['operation_mode'] == "inference":
        model_inference(test_loader, config)
    else:
        train_model(model, criterion, optimizer, scheduler, train_loader,
                    train_dataset, test_loader, config)
    # test_model(test_loader)
    # Save the Trained Model
    utility.save_model(config=config, model=model)
Example #23
def main(args):
    print >> sys.stderr, "Running Autumn NER model testing module"
    print >> sys.stderr, args
    random.seed(args.seed)

    trainset = []
    devset = []
    testset_standalone = {}
    word_vocab = []

    print "Loading dataset.."
    assert (os.path.isdir(args.datapath))
    for fname in sorted(os.listdir(args.datapath)):
        if os.path.isdir(fname):
            continue

        if fname.endswith('.ner.txt'):
            dataset, vocab = load_dataset(os.path.join(args.datapath, fname))
            word_vocab += vocab
            if fname.endswith('train.ner.txt'):
                trainset += dataset
            if fname.endswith('dev.ner.txt'):
                devset += dataset
            if fname.endswith('test.ner.txt'):
                testset_standalone[fname] = dataset

            print "Loaded {} instances with a vocab size of {} from {}".format(
                len(dataset), len(vocab), fname)

    word_vocab = sorted(set(word_vocab))
    if args.embeddings_path:
        embeddings = load_embeddings(args.embeddings_path, word_vocab, 300)
    else:
        embeddings = None

    print "Loaded {}/{} instances from training/dev set".format(
        len(trainset), len(devset))

    X_train, y_train = zip(*trainset)
    X_dev, y_dev = zip(*devset)

    labels = []
    for lb in y_train + y_dev:
        labels += lb

    labels = sorted(set(labels))

    # Create the model, passing in relevant parameters
    bilstm = AutumnNER(labels=labels,
                       word_vocab=word_vocab,
                       word_embeddings=embeddings,
                       optimizer=args.optimizer,
                       embedding_size=300,
                       char_embedding_size=32,
                       lstm_dim=200,
                       num_cores=args.num_cores,
                       embedding_factor=args.embedding_factor,
                       learning_rate=args.learning_rate,
                       decay_rate=args.decay_rate,
                       dropout_keep=args.keep_prob)

    model_path = './scratch/saved_model_d{}_s{}'.format(
        hash(args.datapath), args.seed)
    if not os.path.exists(model_path + '.meta'):
        if not os.path.exists('./scratch'):
            os.mkdir('./scratch')

        print "Training.."
        bilstm.fit(X_train,
                   y_train,
                   X_dev,
                   y_dev,
                   num_epoch=args.num_epoch,
                   batch_size=args.batch_size,
                   seed=args.seed)

        bilstm.save(model_path)
    else:
        print "Loading saved model.."
        bilstm.restore(model_path)

    print "Evaluating.."
    print "Performance on DEV set ----------------------------"

    report_performance(bilstm, X_dev, y_dev,
                       'evaluation/devset_predictions.txt')

    print "Performance on TEST set(s) ----------------------------"

    overall_testset = []
    for key, testset in testset_standalone.items():
        X_test, y_test = zip(*testset)
        report_performance(bilstm, X_test, y_test,
                           'evaluation/testset_{}_predictions.txt'.format(key))
        overall_testset += testset

    X_test, y_test = zip(*overall_testset)
    report_performance(bilstm, X_test, y_test,
                       'evaluation/testset_overall_predictions.txt')
Example #24
def question_2():
    logger.info(
        "EXECUTING: QUESTION 2 - 10-fold Cross-Validation on Recommendation System"
    )
    data, R_mat, _ = utility.load_dataset()  # load the dataset
    utility.perform_cross_validation(data, R_mat)  # perform cross validation
Example #25
    GenNet, DiscNet, space = select_models(opt.dataset, opt.image_size,
                                           opt.weight_decay)
    generator, discriminator, g_snapshot, d_snapshot = init_networks(
        GenNet, DiscNet, opt)

    # Data loading. #######################
    # - As SVRG requires a full GD step, an additional data loader is instantiated
    # which uses a larger batch size (opt.large_batch_size). The same holds for the
    # noise data.
    # - To ensure that the noise vanishes in expectation (which reduces SVRG to SGD
    # [*]), svrg_noise_sampler & noise_sampler use the same noise tensor. This
    # noise tensor is re-sampled from p_z by noise_sampler after a full traverse.
    #
    # [*] Accelerating stochastic gradient descent using predictive variance reduction,
    # Johnson & Zhang, Advances in Neural Information Processing Systems, 2013.

    dataset = load_dataset(opt.dataset, opt.dataroot, opt.verbose)
    data_sampler = dataset_generator(dataset,
                                     opt.batch_size,
                                     num_workers=opt.n_workers,
                                     drop_last=True)
    _n_batches = len(dataset) // opt.batch_size
    svrg_freq_sampler = bernoulli.Bernoulli(torch.tensor([1 / _n_batches]))
    noise_dataset = torch.FloatTensor(2 * len(dataset),
                                      _NOISE_DIM).normal_(0, 1)
    noise_sampler = noise_generator(noise_dataset,
                                    opt.batch_size,
                                    drop_last=True,
                                    resample=True)
    logger.info(
        "{} loaded. Found {} samples, resulting in {} mini-batches.".format(
            opt.dataset, len(dataset), _n_batches))
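# A minimal sketch of the noise-sampling scheme described in the comment block
# above (an assumption; the repo's noise_generator is defined elsewhere): yield
# fixed mini-batches from one shared noise tensor, and re-draw the whole tensor
# from p_z only after a full traverse, so both samplers see identical noise
# within a pass.
import torch

def noise_generator_sketch(noise, batch_size, resample=True):
    while True:
        # iterate the shared tensor in order; an incomplete final batch is dropped
        for start in range(0, noise.size(0) - batch_size + 1, batch_size):
            yield noise[start:start + batch_size]
        if resample:
            noise.normal_(0, 1)  # re-sample from p_z after the full pass

# usage on a toy tensor (dimensions are placeholders)
toy_noise = torch.FloatTensor(8, 100).normal_(0, 1)
sampler = noise_generator_sketch(toy_noise, batch_size=4)
z = next(sampler)  # one mini-batch of noise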
from sklearn import cross_validation
from sklearn.feature_extraction import text
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

logger.basicConfig(level=logger.INFO, format='%(message)s')

categories = [
    'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware', 'rec.autos', 'rec.motorcycles',
    'rec.sport.baseball', 'rec.sport.hockey'
]

train_dataset, test_dataset = utility.load_dataset(
    categories)  # load CT & RA dataset

# combine the classes
processed_train_dataset = copy.deepcopy(train_dataset)
processed_test_dataset = copy.deepcopy(test_dataset)


def combine_classes():
    for i, j in enumerate(processed_train_dataset.target):
        if j >= 0 and j < 4:
            processed_train_dataset.target[i] = 0
        else:
            processed_train_dataset.target[i] = 1

    for i, j in enumerate(processed_test_dataset.target):
        if j >= 0 and j < 4:
def question_5():
    logger.info("EXECUTING: QUESTION 5 - Recommendation System")
    data, R_mat, W_mat = utility.load_dataset()  # load the dataset
    utility.perform_cross_validation_question5(data, R_mat, W_mat)
Example #28
def question_1():
    logger.info("EXECUTING: QUESTION 1 - Least Square Factorization")
    data, R_mat, W_mat = utility.load_dataset()  # load the dataset
    utility.compute_least_squared_error(
        R_mat, W_mat)  # compute the least squared error without regularization
def experiment_BeamSearch(input_dir,
                          output_fname,
                          train_size,
                          thinking_budget,
                          beam_width,
                          children_count,
                          loss_type,
                          seed,
                          homogeneous=False,
                          print_vector=False,
                          num_cpus=1):
    """
    perform beam search on a given dataset using logistic
    regression learner
    :param input_dir: string, path to the input directory
    :param output_fname: string, path to the output file
    :param train_size: int, size of the train set
    :param thinking_budget: int, the thinking budget
    :param beam_width: int, the width of the beam
    :param children_count: int, number of children to consider for each
    sequence
    :param loss_type: string, type of loss to use
    :param seed: int, the random seed to use
    :param homogeneous: boolean, if the learner is homogeneous or not,
    default False
    :param print_vector: boolean, if True then print the final vector,
    default False
    :param num_cpus: int, the number of CPUs to use
    """
    # load the data
    cover_X, cover_Y, val_X, val_Y, test_X, test_Y, kernel_matrix = \
        load_dataset(input_dir)

    loss_function = loss_01 if loss_type == '01' else loss_logistic

    # parameters
    max_K, alpha = 1, 0.05
    learner_params = {'random_state': seed, 'fit_intercept': not homogeneous}

    search_alg = BeamSearch(load_lr_learner,
                            fit_lr_learner,
                            learner_params,
                            select_instances,
                            cover_X,
                            cover_Y,
                            val_X,
                            val_Y,
                            kernel_matrix,
                            max_K,
                            loss_function,
                            train_size,
                            beam_width,
                            children_count,
                            thinking_budget=thinking_budget,
                            alpha=alpha,
                            output_fname=output_fname,
                            seed=seed,
                            num_cpus=num_cpus)
    results = search_alg.search_optimum_teaching_seq()

    print('\n\nInput Arguments')
    print('Input Dir: {}'.format(input_dir))
    print('Output File: {}'.format(output_fname))
    print('Size of train set: {}'.format(train_size))
    print('Thinking budget: {}'.format(thinking_budget))
    print('Beam width: {}'.format(beam_width))
    print('Children count: {}'.format(children_count))
    print('Loss Function: {}'.format(loss_type))
    print('Seed: {}'.format(seed))
    print('Homogeneous: {}'.format(homogeneous))
    print('Print Vector: {}'.format(print_vector))

    print('\n\nOutput')
    print('Length of optimal sequence: {}'.format(len(results['opt_indices'])))

    # get the error after training
    learner = load_lr_learner(learner_params)
    opt_indices = results['opt_indices']
    train_X, train_Y = select_instances(cover_X, cover_Y, opt_indices)
    learner.fit(train_X, train_Y)
    cover_error, val_error, test_error = \
        1.0 - learner.score(cover_X, cover_Y), \
        1.0 - learner.score(val_X, val_Y), \
        1.0 - learner.score(test_X, test_Y)
    print('01 loss: Cover error: {}, Validation error: {}, Test error: {}'.
          format(cover_error, val_error, test_error))
    cover_error, val_error, test_error = \
        loss_logistic(learner, cover_X, cover_Y), \
        loss_logistic(learner, val_X, val_Y), \
        loss_logistic(learner, test_X, test_Y)
    print(
        'logistic loss: Cover error: {}, Validation error: {}, Test error: {}'.
        format(cover_error, val_error, test_error))

    # count positive and negative instances
    pos_count = 0
    for y in train_Y:
        if y == 1:
            pos_count += 1
    print('Pos count: {}, Neg count: {}'.format(pos_count,
                                                len(train_Y) - pos_count))

    # print vectors
    if print_vector:
        if not homogeneous:
            weights = np.zeros(len(learner.coef_[0]) + 1)
            weights[0] = learner.intercept_[0]
            weights[1:] = learner.coef_[0]
            print('Weights: {}'.format(weights))
        else:
            print('Weights: {}'.format(learner.coef_[0]))
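# Hypothetical invocation of experiment_BeamSearch; the directory layout and
# every argument value here are assumptions based only on the docstring above,
# not on the repo's actual experiment scripts.
if __name__ == '__main__':
    experiment_BeamSearch(input_dir='data/synthetic',
                          output_fname='results/beam_search.log',
                          train_size=30,
                          thinking_budget=1000,
                          beam_width=5,
                          children_count=10,
                          loss_type='01',
                          seed=42,
                          homogeneous=False,
                          print_vector=True,
                          num_cpus=4)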