def main(train_path, test_path):
    train_x, train_t = util.load_dataset(train_path, label_col='class')
    test_x, test_t = util.load_dataset(test_path, label_col='class')
    train_x_inter = util.add_intercept(train_x)
    test_x_inter = util.add_intercept(test_x)
    # print(test_x_inter)
    classifier_t = LogisticRegression(max_iter=10000, step_size=0.0001, verbose=False)
    classifier_t.fit(train_x_inter, train_t)
    pred_t_prob = classifier_t.predict(test_x_inter)
    for i in range(len(test_t)):
        print(f"Predicted value: {pred_t_prob[i]} True Value: {test_t[i]}")
def report_average_cost(model, criterion, optimizer, list_seq_num, list_loss,
                        list_cost, config_obj):
    list_avg_loss = []
    list_avg_cost = []
    list_T_num = []
    for T in range(10, 110, 10):
        print("Evaluating {0} model on sequence size {1}".format(
            config['model_type'], T))
        config_obj.config_dict['num_batches'] = 1
        config_obj.config_dict['batch_size'] = 1
        seqs_loader = utility.load_dataset(config_obj, max=T, min=T)
        if config['model_type'] == 'LSTM':
            avg_loss, avg_cost = evaluate_lstm(model, criterion, optimizer,
                                               seqs_loader, config)
        else:
            avg_loss, avg_cost = evaluate_ntm(model, criterion, optimizer,
                                              seqs_loader, config)
        list_avg_loss.append(avg_loss)
        list_avg_cost.append(avg_cost)
        list_T_num.append(T)
    saveCheckpoint(model, list_T_num, list_avg_loss, list_avg_cost,
                   path="{0}_Ts".format(config['filename']))
    model, list_T_num, list_avg_loss, list_avg_cost = loadCheckpoint(
        path="{0}_Ts".format(config['filename']))
    plt.plot(list_T_num, list_avg_cost)
    plt.xlabel('T')
    plt.ylabel('average cost')
    plt.savefig('{0}_average_cost.pdf'.format(config['filename']))
def question_3():
    logger.info("EXECUTING: QUESTION 3 - Recommendation Systems with threshold limits")
    data, R_mat, _ = utility.load_dataset()  # load the dataset
    utility.perform_cross_validation(data, R_mat, threshold=True)  # perform cross validation with threshold limit
def question_4():
    logger.info("EXECUTING: QUESTION 4 - Recommendation Systems with Regularization")
    data, R_mat, W_mat = utility.load_dataset()
    logger.info("R & W Matrix - Interchanged")
    utility.compute_least_squared_error(W_mat, R_mat)  # interchange R & W matrix
    logger.info("R & W Matrix - Regularization")
    utility.compute_least_squared_error(R_mat, W_mat, regularized=True)
def question_c():
    logger.info("EXECUTING: QUESTION C")
    # get training data for every category and terms with their frequency for every class
    all_categories = train_all_dataset.target_names
    freq_words_all_categories = []
    words_all_categories = []
    all_data_category = []
    words_in_classes = defaultdict(list)
    find_for_classes_list = [
        train_all_dataset.target_names.index("comp.sys.ibm.pc.hardware"),
        train_all_dataset.target_names.index("comp.sys.mac.hardware"),
        train_all_dataset.target_names.index("misc.forsale"),
        train_all_dataset.target_names.index("soc.religion.christian")
    ]
    logger.info("Collecting data for each category")
    for category in all_categories:
        train_category = utility.load_dataset([category])[0]
        data_category = train_category.data
        temp = ''
        for document in data_category:
            temp += ' ' + document
        all_data_category.append(temp)
    logger.info("Cleaning Data and Forming Frequency List for each Class")
    # pre-process all the docs
    for data, pos in zip(all_data_category, range(len(all_data_category))):
        logger.info("Forming Frequency List for Class: {}".format(
            train_all_dataset.target_names[pos]))
        processed_data = utility.preprocess_data(data)
        count = Counter(processed_data)
        freq_words_all_categories.append(count)
        unique_words = set(processed_data)
        words_all_categories.append(list(unique_words))
        for word in unique_words:
            words_in_classes[word].append(train_all_dataset.target_names[pos])
    # calculating tf-icf
    for category in find_for_classes_list:
        logger.info("Fetching top 10 significant terms for class: {}".format(
            train_all_dataset.target_names[category]))
        terms_of_class = words_all_categories[category]
        freq_of_all_terms = freq_words_all_categories[category]
        number_of_terms = len(terms_of_class)
        tficf = {}
        for each_term in range(number_of_terms):
            term = terms_of_class[each_term]  # term for which we are finding tf-icf
            frequency_of_term = freq_of_all_terms.get(term)
            number_of_class_with_term = len(words_in_classes[term])  # number of classes with term t
            # tf-icf for term t
            calc = 0.5 + ((0.5 * frequency_of_term / number_of_terms) *
                          math.log(len(train_all_dataset.target_names) /
                                   number_of_class_with_term))
            tficf[term] = calc
        # print top 10 significant terms for this class
        significant_terms = dict(
            sorted(tficf.items(), key=operator.itemgetter(1),
                   reverse=True)[:10])  # get 10 significant terms
        logger.info(significant_terms.keys())
def visualize_read_write(model, criterion, optimizer, config_obj):
    T = 10
    config_obj.config_dict['num_batches'] = 20
    config_obj.config_dict['batch_size'] = 1
    seqs_loader = utility.load_dataset(config_obj, max=T, min=T)
    for batch_num, X, Y, act in seqs_loader:
        result = evaluate_single_batch(model, criterion, X, Y)
        plot_visualization(X, result, model.N)
def question_i():
    logger.info("EXECUTING: QUESTION I")
    logger.info("Multi-Class Classification")
    category = ['comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
                'misc.forsale', 'soc.religion.christian']
    train, test = utility.load_dataset(category)
    logger.info("Processing Training Dataset")
    for data, pos in zip(train.data, range(len(train.data))):
        processedData = utility.preprocess_data(data)
        train.data[pos] = ' '.join(processedData)
    logger.info("Processing Testing Dataset")
    for data, pos in zip(test.data, range(len(test.data))):
        processedData = utility.preprocess_data(data)
        test.data[pos] = ' '.join(processedData)
    logger.info("Creating TFxIDF Vector Representations")
    stop_words = text.ENGLISH_STOP_WORDS  # omit stop words
    # using CountVectorizer and TFxIDF Transformer
    count_vect = CountVectorizer(stop_words=stop_words, lowercase=True)
    train_counts = count_vect.fit_transform(train.data)
    test_counts = count_vect.transform(test.data)
    tfidf_transformer = TfidfTransformer(norm='l2', sublinear_tf=True)
    train_idf = tfidf_transformer.fit_transform(train_counts)
    test_idf = tfidf_transformer.transform(test_counts)
    logger.info("Performing LSI on TFxIDF Matrices")
    # apply LSI to TFxIDF matrices
    svd = TruncatedSVD(n_components=50)
    train_lsi = svd.fit_transform(train_idf)
    test_lsi = svd.transform(test_idf)
    logger.info("TFxIDF Matrices Transformed")
    logger.info("Size of Transformed Training Dataset: {0}".format(train_lsi.shape))
    logger.info("Size of Transformed Testing Dataset: {0}".format(test_lsi.shape))
    clf_list = [OneVsOneClassifier(GaussianNB()),
                OneVsOneClassifier(svm.SVC(kernel='linear')),
                OneVsRestClassifier(GaussianNB()),
                OneVsRestClassifier(svm.SVC(kernel='linear'))]
    clf_name = ['OneVsOneClassifier Naive Bayes', 'OneVsOneClassifier SVM',
                'OneVsRestClassifier Naive Bayes', 'OneVsRestClassifier SVM']
    # perform classification
    for clf, clf_n in zip(clf_list, clf_name):
        logger.info("Training {0} Classifier".format(clf_n))
        clf.fit(train_lsi, train.target)
        logger.info("Testing {0} Classifier".format(clf_n))
        test_predicted = clf.predict(test_lsi)
        utility.calculate_statistics(test.target, test_predicted)
def train_model(model, config, criterion, optimizer):
    losses = 0
    iter = 0
    # record the initial performance before training
    train_data, valid_data, test_data = utility.load_dataset("mnist.pkl", config)
    record_performance(train_data, model, criterion, "train")
    record_performance(valid_data, model, criterion, "valid")
    record_performance(test_data, model, criterion, "test")
    for epoch in range(config.num_epochs):
        # iterate over batches
        # the shape of train_data[0] must be 500 x 100 x 784
        # the shape of train_data[1] must be 500 x 100
        for i in range(train_data[0].shape[0]):
            optimizer.zero_grad()
            x = Variable(torch.from_numpy(train_data[0][i]))
            y = Variable(torch.from_numpy(train_data[1][i]))
            # compute loss
            loss = criterion(model(x), y)
            # compute gradients
            loss.backward()
            # take one SGD step
            optimizer.step()
        # record the performance for this epoch
        train_loss, train_acc = record_performance(train_data, model, criterion, "train")
        valid_loss, valid_acc = record_performance(valid_data, model, criterion, "valid")
        test_loss, test_acc = record_performance(test_data, model, criterion, "test")
        # print the results for this epoch
        # print("Epoch {0} \nLoss : {1:.3f} \nAcc : {2:.3f}".format(epoch, train_loss, train_acc))
    # report the final results
    if int(config.filename.split("Q1_")[1]) < 6:
        print("Validation Results:\nLoss : {0:.3f} Acc : {1:.3f}".format(valid_loss, valid_acc))
        data_to_plot = (records["train"], records["valid"])
        # data_to_plot = (records["train"])
        utility.plot_sample_data(data_to_plot, config,
                                 "Training Loss using {0} initialization".format(config.init_type),
                                 True)
        # utility.plot_sample_data(data_to_plot, config, "Training Loss using {0} initialization".format(config.init_type), True, True)
    else:
        max_indx = np.argmax(list(records["valid"][1]))
        train_acc_data = list(records["train"][1])[max_indx]
        test_acc_data = list(records["test"][1])[max_indx]
        print("Best validation set was at epoch {0}".format(max_indx))
        print("Generalization Gap : {0} - {1} = {2}".format(
            train_acc_data, test_acc_data, train_acc_data - test_acc_data))
def create_labeled_data():
    """
    Creates the labeled data set.
    First we load the unlabeled data with utility.py,
    then apply the labeling and transformation functions.
    """
    df_train, sentences_number = utility.load_dataset()
    df_dev = pd.read_csv(r'data\dev_22.12.csv')
    df_train.to_csv(r'data\df_train.csv', index=False)
    df_train_labeled = apply_lf_on_data(df_train, df_dev, sentences_number)
    df_train_labeled.to_csv(r'data\labeled_data.csv', index=False)
    augmented = apply_tf_on_data(df_train_labeled)
    # Splitting to test and train:
    df_train_augmented, df_test = train_test_split(augmented, test_size=TEST_RATIO)
    df_test.to_csv(r'data\df_test.csv', index=False)
    df_train_augmented.to_csv(r'data\labeled_data_augmented.csv', index=False)
    return df_train_labeled, df_train_augmented, df_test
def main(args):
    print >> sys.stderr, "Running Autumn NER model training module"
    print >> sys.stderr, args

    random.seed(args.seed)

    trainset = []
    devset = []

    print >> sys.stderr, "Loading dataset.."
    assert (os.path.isdir(args.datapath))

    word_vocab = []
    for fname in sorted(os.listdir(args.datapath)):
        if os.path.isdir(fname):
            continue
        if fname.endswith('train.ner.txt') or fname.endswith('dev.ner.txt'):
            dataset, vocab = load_dataset(os.path.join(args.datapath, fname))
            word_vocab += vocab
            if fname.endswith('train.ner.txt'):
                trainset += dataset
            if fname.endswith('dev.ner.txt'):
                devset += dataset
            print >> sys.stderr, "Loaded {} instances with a vocab size of {} from {}".format(
                len(dataset), len(vocab), fname)

    if args.embeddings_path:
        embeddings = load_embeddings(args.embeddings_path, word_vocab, 300)
    else:
        embeddings = None

    print "Loaded {} instances from data set".format(len(trainset))

    random.shuffle(trainset)
    X_train, y_train = zip(*trainset)
    X_dev, y_dev = zip(*devset)

    print "Training on {}, tuning on {}".format(len(X_train), len(X_dev))

    labels = []
    for lb in y_train + y_dev:
        labels += lb

    if os.path.exists('./saved_model'):
        os.rename('./saved_model', './scratch/saved_model_{}'.format(time.time()))
    os.mkdir('./saved_model')

    word_vocab = sorted(set(word_vocab))
    with open(os.path.join('saved_model', 'word_vocab.pickle'), 'w') as f:
        pickle.dump(word_vocab, f)

    labels = sorted(set(labels))
    with open(os.path.join('saved_model', 'label_space.pickle'), 'w') as f:
        pickle.dump(labels, f)

    # Create the model, passing in relevant parameters
    bilstm = AutumnNER(labels=labels,
                       word_vocab=word_vocab,
                       word_embeddings=embeddings,
                       optimizer=args.optimizer,
                       embedding_size=300,
                       char_embedding_size=32,
                       lstm_dim=200,
                       num_cores=args.num_cores,
                       embedding_factor=args.embedding_factor,
                       learning_rate=args.learning_rate,
                       decay_rate=args.decay_rate,
                       dropout_keep=args.keep_prob)

    if not os.path.exists('./scratch'):
        os.mkdir('./scratch')

    print "Training.."
    bilstm.fit(X_train, y_train, X_dev, y_dev,
               num_epoch=args.num_epoch,
               batch_size=args.batch_size,
               seed=args.seed)

    bilstm.save('./saved_model/main')
    print "Training complete"

    print "Reporting performance on devset.."
    report_performance(bilstm, X_dev, y_dev, 'saved_model/devset_outcome.txt')
if __name__ == '__main__':
    # pdb.set_trace()
    args = utility.parse_args()
    config_type = args['configtype']
    config_file = args['configfile']
    load_checkpoint = args['load_checkpoint']
    plot_all_average_flag = args['plot_all_average']
    visualize_read_write_flag = args['visualize_read_write']
    if plot_all_average_flag:
        plot_all_average_costs()
    else:
        config_obj = config.Configuration(config_type, config_file)
        config = config_obj.config_dict
        model, criterion, optimizer = models.build_model(config)
        seqs_loader = utility.load_dataset(config_obj)
        if visualize_read_write_flag:
            model, list_seq_num, list_loss, list_cost = loadCheckpoint(
                path=config['filename'])
            visualize_read_write(model, criterion, optimizer, config_obj)
        else:
            if not load_checkpoint:
                str_info = "{0}number of parameters: {1}\n".format(
                    config_obj.get_config_str(), model.calculate_num_params())
                print(str_info)
            if config['model_type'] == "LSTM":
                if not load_checkpoint:
                    list_seq_num, list_loss, list_cost = train_lstm_model(
                        config, model, criterion, optimizer, seqs_loader)
                report_result(model, criterion, optimizer, list_seq_num,
                              list_loss, list_cost, config_obj,
def question_2():
    logger.info("EXECUTING: QUESTION 2 - 10-fold Cross-Validation on Recommendation System")
    data, R_mat, _ = utility.load_dataset()  # load the dataset
    utility.perform_cross_validation(data, R_mat)  # perform cross validation
def question_1():
    logger.info("EXECUTING: QUESTION 1 - Least Square Factorization")
    data, R_mat, W_mat = utility.load_dataset()  # load the dataset
    utility.compute_least_squared_error(R_mat, W_mat)  # compute the least squared error without regularization
        plt.subplot(n, n, j + 1)
        feature = subset[subset.columns[j]]
        if y is None:
            # Plot the feature distribution (full)
            sns.distplot(feature)
        else:
            # Plot feature distributions separated by class
            for label in labels:
                feature_label = feature[y == label]
                sns.distplot(feature_label, label=label, hist_kws={"alpha": 0.4})
            plt.legend()


if __name__ == "__main__":
    utility.setup_logging(params['results_dir'])
    data_df = utility.load_dataset(params['data_file'])
    X_all = data_df.drop(columns=['Series', 'Class'], errors='ignore')
    y_all = data_df['Class']
    plot_class_distribution(y_all)
    X_pca_2d = do_pca(X_all, num_components=2)
    plot_dataset_2d(X_pca_2d, y_all)
    plot_correlation_matrix(X_all)
    plot_feature_distributions_nxn_grid(X_all, n=3)
    plot_feature_distributions_nxn_grid(X_all, y_all, n=3)
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 1 11:41:39 2018

@author: jaydeep thik
"""

import tensorflow as tf
import numpy as np
import math
import h5py
import matplotlib.pyplot as plt
from PIL import Image
from utility import load_dataset, encode_one_hot

X_train, X_test, y_train, y_test, classes = load_dataset()

# plt.imshow(X_train[0])
# print(y_train[:,0])

X_train, X_test = X_train / 255., X_test / 255.
y_train = encode_one_hot(y_train, len(classes))
y_test = encode_one_hot(y_test, len(classes))


def random_mini_batches(X, Y, mini_batch_size=64, seed=0):
    m = X.shape[0]
    mini_batches = []
    permutation = list(np.random.permutation(m))
def question_5():
    logger.info("EXECUTING: QUESTION 5 - Recommendation System")
    data, R_mat, W_mat = utility.load_dataset()  # load the dataset
    utility.perform_cross_validation_question5(data, R_mat, W_mat)
from sklearn import svm
import logging as logger
from collections import Counter
from sklearn import cross_validation
from sklearn.feature_extraction import text
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

logger.basicConfig(level=logger.INFO, format='%(message)s')

categories = ['comp.graphics', 'comp.os.ms-windows.misc',
              'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
              'rec.autos', 'rec.motorcycles',
              'rec.sport.baseball', 'rec.sport.hockey']

train_dataset, test_dataset = utility.load_dataset(categories)  # load CT & RA dataset

# combine the classes
processed_train_dataset = copy.deepcopy(train_dataset)
processed_test_dataset = copy.deepcopy(test_dataset)


def combine_classes():
    for i, j in enumerate(processed_train_dataset.target):
        if j >= 0 and j < 4:
            processed_train_dataset.target[i] = 0
        else:
            processed_train_dataset.target[i] = 1
    for i, j in enumerate(processed_test_dataset.target):
        if j >= 0 and j < 4:
            processed_test_dataset.target[i] = 0
        test_accuracy = test_model(test_loader)
        scheduler.step()
        # import ipdb as pdb; pdb.set_trace()
        print('[{0}] Test Accuracy of the model on the 10000 test images: {1} , lr:{2}, loss:{3}'
              .format(epoch, test_accuracy, get_lr(optimizer), loss.item()))
        # print('Test Accuracy of the model on the 10000 test images: {0}'.format(test_accuracy))


if __name__ == '__main__':
    args = utility.parse_args()
    model_type = args['modelype']
    config_file = args['configfile']
    config = config.Configuration(model_type, config_file)
    print(config.get_config_str())
    config = config.config_dict
    model, criterion, optimizer, scheduler = build_model(config)
    # import ipdb as pdb; pdb.set_trace()
    if torch.cuda.is_available():
        model = model.cuda()
    train_loader, test_loader, train_dataset, test_dataset = utility.load_dataset(config)
    if config['operation_mode'] == "inference":
        model_inference(test_loader, config)
    else:
        train_model(model, criterion, optimizer, scheduler, train_loader,
                    train_dataset, test_loader, config)
        # test_model(test_loader)
        # Save the Trained Model
        utility.save_model(config=config, model=model)
def main(args):
    print >> sys.stderr, "Running Autumn NER model testing module"
    print >> sys.stderr, args

    random.seed(args.seed)

    trainset = []
    devset = []
    testset_standalone = {}
    word_vocab = []

    print "Loading dataset.."
    assert (os.path.isdir(args.datapath))

    for fname in sorted(os.listdir(args.datapath)):
        if os.path.isdir(fname):
            continue
        if fname.endswith('.ner.txt'):
            dataset, vocab = load_dataset(os.path.join(args.datapath, fname))
            word_vocab += vocab
            if fname.endswith('train.ner.txt'):
                trainset += dataset
            if fname.endswith('dev.ner.txt'):
                devset += dataset
            if fname.endswith('test.ner.txt'):
                testset_standalone[fname] = dataset
            print "Loaded {} instances with a vocab size of {} from {}".format(
                len(dataset), len(vocab), fname)

    word_vocab = sorted(set(word_vocab))

    if args.embeddings_path:
        embeddings = load_embeddings(args.embeddings_path, word_vocab, 300)
    else:
        embeddings = None

    print "Loaded {}/{} instances from training/dev set".format(
        len(trainset), len(devset))

    X_train, y_train = zip(*trainset)
    X_dev, y_dev = zip(*devset)

    labels = []
    for lb in y_train + y_dev:
        labels += lb
    labels = sorted(set(labels))

    # Create the model, passing in relevant parameters
    bilstm = AutumnNER(labels=labels,
                       word_vocab=word_vocab,
                       word_embeddings=embeddings,
                       optimizer=args.optimizer,
                       embedding_size=300,
                       char_embedding_size=32,
                       lstm_dim=200,
                       num_cores=args.num_cores,
                       embedding_factor=args.embedding_factor,
                       learning_rate=args.learning_rate,
                       decay_rate=args.decay_rate,
                       dropout_keep=args.keep_prob)

    model_path = './scratch/saved_model_d{}_s{}'.format(
        hash(args.datapath), args.seed)

    if not os.path.exists(model_path + '.meta'):
        if not os.path.exists('./scratch'):
            os.mkdir('./scratch')

        print "Training.."
        bilstm.fit(X_train, y_train, X_dev, y_dev,
                   num_epoch=args.num_epoch,
                   batch_size=args.batch_size,
                   seed=args.seed)
        bilstm.save(model_path)
    else:
        print "Loading saved model.."
        bilstm.restore(model_path)

    print "Evaluating.."
    print "Performance on DEV set ----------------------------"
    report_performance(bilstm, X_dev, y_dev, 'evaluation/devset_predictions.txt')

    print "Performance on TEST set(s) ----------------------------"
    overall_testset = []
    # iterate over (filename, dataset) pairs of the standalone test sets
    for key, testset in testset_standalone.items():
        X_test, y_test = zip(*testset)
        report_performance(bilstm, X_test, y_test,
                           'evaluation/testset_{}_predictions.txt'.format(key))
        overall_testset += testset

    X_test, y_test = zip(*overall_testset)
    report_performance(bilstm, X_test, y_test,
                       'evaluation/testset_overall_predictions.txt')
GenNet, DiscNet, space = select_models(opt.dataset, opt.image_size, opt.weight_decay)
generator, discriminator, g_snapshot, d_snapshot = init_networks(GenNet, DiscNet, opt)

# Data loading.
#######################
# - As SVRG requires a GD step, an additional data loader is instantiated which
#   uses a larger batch size (opt.large_batch_size). The analogous holds for noise data.
# - To ensure that in expectation the noise vanishes (which reduces SVRG to SGD
#   [*]), svrg_noise_sampler & noise_sampler use the same noise tensor. This
#   noise tensor is re-sampled from p_z by noise_sampler after its full traverse.
#
# [*] Accelerating stochastic gradient descent using predictive variance reduction,
#     Johnson & Zhang, Advances in Neural Information Processing Systems, 2013.
dataset = load_dataset(opt.dataset, opt.dataroot, opt.verbose)
data_sampler = dataset_generator(dataset, opt.batch_size,
                                 num_workers=opt.n_workers, drop_last=True)
_n_batches = len(dataset) // opt.batch_size
svrg_freq_sampler = bernoulli.Bernoulli(torch.tensor([1 / _n_batches]))
noise_dataset = torch.FloatTensor(2 * len(dataset), _NOISE_DIM).normal_(0, 1)
noise_sampler = noise_generator(noise_dataset, opt.batch_size,
                                drop_last=True, resample=True)
logger.info("{} loaded. Found {} samples, resulting in {} mini-batches.".format(
    opt.dataset, len(dataset), _n_batches))
def experiment_BeamSearch(input_dir, output_fname, train_size, thinking_budget,
                          beam_width, children_count, loss_type, seed,
                          homogeneous=False, print_vector=False, num_cpus=1):
    """
    Perform beam search on a given dataset using a logistic regression learner.

    :param input_dir: string, path to the input directory
    :param output_fname: string, path to the output file
    :param train_size: int, size of the train set
    :param thinking_budget: int, the thinking budget
    :param beam_width: int, the width of the beam
    :param children_count: int, number of children to consider for each sequence
    :param loss_type: string, type of loss to use
    :param seed: int, the random seed to use
    :param homogeneous: boolean, if the learner is homogeneous or not, default False
    :param print_vector: boolean, if True then print the final vector, default False
    :param num_cpus: int, the number of CPUs to use
    """
    # load the data
    cover_X, cover_Y, val_X, val_Y, test_X, test_Y, kernel_matrix = \
        load_dataset(input_dir)
    loss_function = loss_01 if loss_type == '01' else loss_logistic

    # parameters
    max_K, alpha = 1, 0.05
    learner_params = {'random_state': seed, 'fit_intercept': not homogeneous}

    search_alg = BeamSearch(load_lr_learner, fit_lr_learner, learner_params,
                            select_instances, cover_X, cover_Y, val_X, val_Y,
                            kernel_matrix, max_K, loss_function, train_size,
                            beam_width, children_count,
                            thinking_budget=thinking_budget, alpha=alpha,
                            output_fname=output_fname, seed=seed,
                            num_cpus=num_cpus)
    results = search_alg.search_optimum_teaching_seq()

    print('\n\nInput Arguments')
    print('Input Dir: {}'.format(input_dir))
    print('Output File: {}'.format(output_fname))
    print('Size of train set: {}'.format(train_size))
    print('Thinking budget: {}'.format(thinking_budget))
    print('Beam width: {}'.format(beam_width))
    print('Children count: {}'.format(children_count))
    print('Loss Function: {}'.format(loss_type))
    print('Seed: {}'.format(seed))
    print('Homogeneous: {}'.format(homogeneous))
    print('Print Vector: {}'.format(print_vector))

    print('\n\nOutput')
    print('Length of optimal sequence: {}'.format(len(results['opt_indices'])))

    # get the error after training on the optimal sequence
    learner = load_lr_learner(learner_params)
    opt_indices = results['opt_indices']
    train_X, train_Y = select_instances(cover_X, cover_Y, opt_indices)
    learner.fit(train_X, train_Y)
    cover_error, val_error, test_error = \
        1.0 - learner.score(cover_X, cover_Y), \
        1.0 - learner.score(val_X, val_Y), \
        1.0 - learner.score(test_X, test_Y)
    print('01 loss: Cover error: {}, Validation error: {}, Test error: {}'.format(
        cover_error, val_error, test_error))
    cover_error, val_error, test_error = \
        loss_logistic(learner, cover_X, cover_Y), \
        loss_logistic(learner, val_X, val_Y), \
        loss_logistic(learner, test_X, test_Y)
    print('logistic loss: Cover error: {}, Validation error: {}, Test error: {}'.format(
        cover_error, val_error, test_error))

    # count positive and negative instances
    pos_count = 0
    for y in train_Y:
        if y == 1:
            pos_count += 1
    print('Pos count: {}, Neg count: {}'.format(pos_count, len(train_Y) - pos_count))

    # print vectors
    if print_vector:
        if not homogeneous:
            weights = np.zeros(len(learner.coef_[0]) + 1)
            weights[0] = learner.intercept_[0]
            weights[1:] = learner.coef_[0]
            print('Weights: {}'.format(weights))
        else:
            print('Weights: {}'.format(learner.coef_[0]))