def _test_regression(dataset='mauna_loa', k=1, dist_metric='l2', d=2):
    """
    compute test loss on regression dataset
    Inputs:
        dataset: (str) name of dataset
        k: (int) number of nearest neighbours to test on
        dist_metric: (str) 'l1' or 'l2'
        d: (int, optional) if name='rosenbrock' then specify the dataset dimensionality
    Outputs:
        RMSE on test set of the dataset
    """
    if dataset == 'rosenbrock':
        x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
            'rosenbrock', n_train=5000, d=d)
    else:
        x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
            dataset)
    x_train = np.vstack([x_valid, x_train])
    y_train = np.vstack([y_valid, y_train])
    return _eval_knn([k, k + 1],
                     x_train,
                     y_train,
                     x_test,
                     y_test,
                     dist_metric,
                     compute_loss=True)
def convert_h5(data_dir,
               label_dir,
               data_split,
               train_volumes,
               test_volumes,
               f,
               data_id,
               remap_config='Neo',
               orientation=preprocessor.ORIENTATION['coronal']):
    # Data splitting
    if data_split:
        train_file_paths, test_file_paths = apply_split(
            data_split, data_dir, label_dir)
    elif train_volumes and test_volumes:
        train_file_paths = du.load_file_paths(data_dir, label_dir, data_id,
                                              train_volumes)
        test_file_paths = du.load_file_paths(data_dir, label_dir, data_id,
                                             test_volumes)
    else:
        raise ValueError(
            'You must either provide the split ratio or a train and test dataset list'
        )

    reduce_slices = False  # True #BORIS

    print("Train dataset size: %d, Test dataset size: %d" %
          (len(train_file_paths), len(test_file_paths)))

    # loading, pre-processing and writing train data
    print("===Train data===")
    data_train, label_train, class_weights_train, weights_train, _ = du.load_dataset(
        train_file_paths,
        orientation,
        remap_config=remap_config,
        return_weights=True,
        reduce_slices=reduce_slices,  # BORIS
        remove_black=True)

    _write_h5(data_train,
              label_train,
              class_weights_train,
              weights_train,
              f,
              mode='train')

    # loading, pre-processing and writing test data
    print("===Test data===")
    data_test, label_test, class_weights_test, weights_test, _ = du.load_dataset(
        test_file_paths,
        orientation,
        remap_config=remap_config,
        return_weights=True,
        reduce_slices=reduce_slices,  # BORIS
        remove_black=True)

    _write_h5(data_test,
              label_test,
              class_weights_test,
              weights_test,
              f,
              mode='test')
def main():
    print("\nParameters:")
    for attr, value in args.__dict__.items():
        print("\t{}={}".format(attr.upper(), value))

    # load data
    strain_data, sd_train_data, sdev_data, stest_data, embeddings = \
        data_utils.load_dataset(args, 'askubuntu-master', dtrain=True)
    dtrain_data, ddev_data, dtest_data, _ = \
        data_utils.load_dataset(args, 'Android-master')

    # initialize necessary parameters
    args.embed_num = embeddings.shape[0]
    args.kernel_sizes = [int(k) for k in args.kernel_sizes.split(',')]

    # load model
    if args.snapshot is None:
        # initialize model
        task_model = None
        if args.model == 'lstm':
            if args.bidirectional and (args.hidden_layer > 1):
                args.hidden_layer = 1
                print('\nMultilayer bidirectional LSTM not supported yet, '
                      'layer set to 1.\n')
            task_model = model.LSTM(args, embeddings)
        elif args.model == 'cnn':
            task_model = model.CNN(args, embeddings)
        domain_model = model.DomainClassifier(args, embeddings)

        # train models
        res = train2.train_model(strain_data, sd_train_data, sdev_data,
                                 stest_data, dtrain_data, ddev_data,
                                 dtest_data, task_model, domain_model, args)
    else:
        print('\nLoading model from [%s]...' % args.snapshot)
        try:
            mod = torch.load(args.snapshot)
        except Exception:
            print("Sorry, this snapshot doesn't exist.")
            exit()
        print(mod)

        # evaluate
        print('\nEvaluating on target dev')
        evaluate.q_evaluate(mod, ddev_data, args)
        print('Evaluating on target test')
        evaluate.q_evaluate(mod, dtest_data, args)
def _cross_val(dataset='mauna_loa', k=[1, 10], dist_metric='l1', v=5):
    """
    cross validation technique on knn
    Inputs:
        dataset: (str) name of dataset
        k: (list) k[0]: lower bound of number of nearest neighbours;
                  k[1]: upper bound of number of nearest neighbours
        dist_metric: (str) 'l1' or 'l2'
        v: (int) cross validation parameter, number of cross folds
    Outputs:
        averaged validation loss
    """
    print('------Processing Dataset ' + dataset + ' ------')
    if dataset == 'rosenbrock':
        x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
            'rosenbrock', n_train=5000, d=2)
    else:
        x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
            dataset)
    x_train = np.vstack([x_valid, x_train])
    y_train = np.vstack([y_valid, y_train])
    np.random.seed(42)
    np.random.shuffle(x_train)
    np.random.seed(42)
    np.random.shuffle(y_train)
    data_partition = _partition_fold(v=v, data=x_train)
    loss = np.empty((0, k[1] - k[0]))
    for fold in range(v):
        print('------Processing Fold ' + str(fold + 1) + ' ------')
        train_x = np.delete(x_train, list(data_partition[fold]), axis=0)
        train_y = np.delete(y_train, list(data_partition[fold]), axis=0)
        query_x = np.take(x_train, list(data_partition[fold]), axis=0)
        query_y = np.take(y_train, list(data_partition[fold]), axis=0)
        curr_loss = _eval_knn(k,
                              train_x,
                              train_y,
                              query_x,
                              query_y,
                              dist_metric=dist_metric)
        loss = np.append(loss, [curr_loss], axis=0)
    loss = loss.mean(axis=0)
    return loss
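# The helpers _partition_fold and _eval_knn are assumed to be defined elsewhere
# in this module. As a rough illustration only, a minimal _partition_fold could
# look like the sketch below (hypothetical, not the author's implementation):
# it splits the row indices of `data` into v nearly equal folds.
def _partition_fold_sketch(v, data):
    n = data.shape[0]
    indices = np.arange(n)
    # np.array_split allows folds of unequal size when n is not divisible by v
    return np.array_split(indices, v)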
def main(_):
    data_path = 'data/new-dataset-cornell-length10-filter1-vocabSize40000.pkl'
    word2id, id2word, trainingSamples = load_dataset(data_path)
    hparam = Config()
    hparam.is_training = False

    with tf.Session() as sess:
        model = Seq2SeqModel(hparam, word2id)
        ckpt = tf.train.get_checkpoint_state(hparam.save_path)
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
            print("Restoring model parameters from %s." %
                  ckpt.model_checkpoint_path)
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print("Creating model with fresh parameters.")
            sess.run(model.init)

        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            batch = sentence_preprocess(sentence, word2id)
            outputs = model.infer_session(sess, batch)
            predicted_ids = outputs["predicted_ids"]
            out_sents = [id2word[idx] for idx in predicted_ids[0][0].tolist()]
            print(" ".join(out_sents))
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
def evaluate_model(model_path,
                   dataset_path='emnist/emnist-balanced-test.csv'):
    raw_test_x, raw_test_y, class_map = data_utils.load_dataset(dataset_path)
    test_x, test_y, _ = data_utils.prepare_data(raw_test_x, raw_test_y,
                                                class_map)

    best_model = load_model(model_path)
    print(best_model.evaluate(test_x, test_y))
    data_utils.print_confusion_matrix(test_x, test_y, model_path, class_map)
def _test_predict(l=0):
    x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
        'mauna_loa')
    x_total = np.vstack([x_train, x_valid])
    y_total = np.vstack([y_train, y_valid])
    phi_train = _construct_phi(x_total)
    phi_test = _construct_phi(x_test)

    U, S, Vh = np.linalg.svd(phi_train)

    # Invert Sigma: rebuild the rectangular Sigma from the singular values
    sig = np.diag(S)
    filler = np.zeros([phi_train.shape[0] - len(S), len(S)])
    sig = np.vstack([sig, filler])

    # regularized least-squares weights: w = V (Sigma^T Sigma + l*I)^{-1} Sigma^T U^T y
    inv = np.linalg.inv(sig.T @ sig + l * np.eye(sig.shape[1]))
    w = Vh.T @ inv @ sig.T @ (U.T @ y_total)

    prediction = phi_test @ w
    plot(xlabel='x',
         ylabel='y',
         name='mauna_loa_predict',
         x=x_test,
         y=[prediction, y_test],
         legend=['Predicted', 'GroundTruth'])
    return _RMSE(prediction, y_test)
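# Sanity-check sketch (not part of the original code): for l > 0 the SVD-based
# weights above should agree with the ridge normal-equations solution
# w = (Phi^T Phi + l*I)^{-1} Phi^T y computed directly with numpy.
def _ridge_weights_check(phi_train, y_total, l=1e-3):
    m = phi_train.shape[1]
    w_direct = np.linalg.solve(phi_train.T @ phi_train + l * np.eye(m),
                               phi_train.T @ y_total)
    return w_direct  # compare against the SVD-based w with np.allclose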
def predict_test(dataset='mauna_loa', k=2, dist_metric='l2'):
    """
    run knn and output predicted values on regression test data
    Inputs:
        dataset: (str) name of dataset
        k: (int) number of nearest neighbours
        dist_metric: (str) 'l1' or 'l2'
    Outputs:
        [predict_x, GroundTruth_y, predicted_y]
    """
    x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(dataset)
    x_train = np.vstack([x_valid, x_train])
    y_train = np.vstack([y_valid, y_train])
    predicted_y = np.empty((0, y_train.shape[-1]))
    curr_predict = _eval_knn([k, k + 1],
                             x_train,
                             y_train,
                             x_test,
                             y_test,
                             dist_metric=dist_metric,
                             compute_loss=False)
    predicted_y = np.append(predicted_y, curr_predict['k=' + str(k)], axis=0)
    rval = []
    for idx in range(x_test.shape[0]):
        rval.append((x_test[idx], y_test[idx], predicted_y[idx]))
    rval.sort(key=lambda tup: tup[0])
    return [i[0] for i in rval], [i[1] for i in rval], [i[2] for i in rval]
def run_Q5():
    theta_list, test_loss = [0.01, 0.1, 1.0], []
    x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
        'rosenbrock', n_train=200, d=2)
    for theta in theta_list:
        print('----- Processing Theta = ' + str(theta) + ' -----')
        # greedily select a sparse set of basis points and their weights
        I_selected, w = _greedy_alg(x_train, y_train, theta=theta)

        # build the test kernel matrix one test point at a time
        big_K = np.empty((0, len(I_selected)))
        for i in range(x_test.shape[0]):
            build_kernel = _test_kernel(basis=I_selected,
                                        x_train=x_train,
                                        test_pt=x_test[i],
                                        theta=theta)
            big_K = np.append(big_K, [build_kernel], axis=0)

        predicted_y = np.dot(big_K, w)
        loss = _RMSE(predicted_y, y_test)
        test_loss.append(loss)
        print('Test Loss: ' + str(loss))
    return test_loss
def run_example():
    """
    This example demonstrates computation of the negative log likelihood (nll)
    as well as the gradient of the nll with respect to all weights and biases
    of the neural network. We will use 50 neurons per hidden layer and will
    initialize all weights and biases to zero.
    """
    # load the MNIST_small dataset
    from data_utils import load_dataset
    x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
        'mnist_small')

    # initialize the weights and biases of the network
    M = 50  # 50 neurons per hidden layer
    W1 = np.zeros((M, 784))  # weights of first (hidden) layer
    W2 = np.zeros((M, M))  # weights of second (hidden) layer
    W3 = np.zeros((10, M))  # weights of third (output) layer
    b1 = np.zeros((M, 1))  # biases of first (hidden) layer
    b2 = np.zeros((M, 1))  # biases of second (hidden) layer
    b3 = np.zeros((10, 1))  # biases of third (output) layer

    # considering the first 250 points in the training set,
    # compute the negative log likelihood and its gradients
    (nll, (W1_grad, W2_grad, W3_grad, b1_grad, b2_grad, b3_grad)) = \
        nll_gradients(W1, W2, W3, b1, b2, b3, x_train[:250], y_train[:250])
    print("negative log likelihood: %.5f" % nll)
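# A minimal finite-difference sketch (not from the original code) for spot
# checking one entry of W1_grad returned by nll_gradients: perturb a single
# weight, recompute the nll, and compare the forward-difference slope against
# the analytic gradient. Assumes the nll_gradients signature used in run_example.
def _check_one_gradient_entry(W1, W2, W3, b1, b2, b3, x, y, i=0, j=0,
                              eps=1e-6):
    W1_plus = W1.copy()
    W1_plus[i, j] += eps
    nll0, (W1_grad, _, _, _, _, _) = nll_gradients(W1, W2, W3, b1, b2, b3,
                                                   x, y)
    nll1, _ = nll_gradients(W1_plus, W2, W3, b1, b2, b3, x, y)
    numeric = (nll1 - nll0) / eps  # forward-difference estimate
    return numeric, W1_grad[i, j]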
def run_Q3(l=0.1):
    x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
        'mauna_loa')
    x_total = np.vstack([x_train, x_valid])
    y_total = np.vstack([y_train, y_valid])

    # kernel ridge regression solved via the Cholesky factor of (K + l*I):
    # alpha = (K + l*I)^{-1} y, prediction = K_test @ alpha
    K = _Q3_construct_K(x_total)
    R = np.linalg.cholesky(K + l * np.eye(len(K)))
    R_inv = np.linalg.inv(R)
    alpha = R_inv.T @ R_inv @ y_total

    K_test = _Q3_construct_test_K(x_total, x_test)
    prediction = K_test @ alpha
    plot(xlabel='x',
         ylabel='y',
         name='mauna_loa_predict_CH',
         x=x_test,
         y=[prediction, y_test],
         legend=['Predicted', 'GroundTruth'])

    # visualize the kernel around x = 0 and x = 1
    z = np.linspace(-0.1, 0.1, 100)
    x = [0] * len(z)
    _visualize_kernel(x, z, 'k(0,z)')
    z = np.linspace(-0.1 + 1, 0.1 + 1, 100)
    x = [1] * len(z)
    _visualize_kernel(x, z, 'k(1,z+1)')
    return _RMSE(prediction, y_test)
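# A minimal alternative sketch (assumption: scipy is available) that avoids
# forming an explicit inverse of the Cholesky factor; cho_solve performs the
# two triangular solves for alpha = (K + l*I)^{-1} y directly.
from scipy.linalg import cho_factor, cho_solve

def _solve_alpha_cholesky(K, y, l=0.1):
    c, low = cho_factor(K + l * np.eye(len(K)), lower=True)
    return cho_solve((c, low), y)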
def model_06():
    # load the dataset
    X_train, Y_train, X_test, Y_test = load_dataset()  # data

    # set the parameters
    layers_dims = [X_train.shape[0], 1]
    num_iter = 2000
    learning_rate = 0.5
    print_cost = False
    initialization = "he"
    parameters, costs = basic_model(X_train,
                                    Y_train,
                                    layers_dims=layers_dims,
                                    num_iter=num_iter,
                                    lr=learning_rate,
                                    print_cost=print_cost,
                                    initialization=initialization)

    # predict and evaluate
    prediction_train = predict(parameters, X_train)
    prediction_test = predict(parameters, X_test)
    print("Train accuracy: {}".format(evaluate(prediction_train, Y_train)))
    print("Test accuracy: {}".format(evaluate(prediction_test, Y_test)))

    plt.title("Model with He initialization")
    axes = plt.gca()
    axes.set_xlim([-1.5, 1.5])
    axes.set_ylim([-1.5, 1.5])
    plot_decision_boundary(lambda x: predict(parameters, x.T), X_train,
                           Y_train)
    plt.show()
def train(self,
          epoch=25,
          batch_size=1,
          learning_rate=0.0002,
          momentum=0.9,
          decay=0.95,
          data_dir="data",
          dataset_name="cnn",
          vocab_size=1000000):
    if not self.vocab:
        self.vocab, self.rev_vocab = load_vocab(data_dir, dataset_name,
                                                vocab_size)

    self.opt = tf.train.RMSPropOptimizer(learning_rate,
                                         decay=decay,
                                         momentum=momentum)

    for epoch_idx in xrange(epoch):
        data_loader = load_dataset(data_dir, dataset_name, vocab_size)

        contexts, questions, answers = [], [], []
        for batch_idx in xrange(batch_size):
            _, context, question, answer, _ = data_loader.next()
            contexts.append(context)
            questions.append(question)
            answers.append(answer)
def main():
    config = get_config()
    config = init_env(config)

    datasets = data_utils.load_dataset(config)
    eval_metric = FewShotMetrics(config, datasets)
    if config.eval:
        model = Model.load(config, config.load_checkpoint)
    else:
        if config.load_checkpoint:
            model = Model.load(config, config.load_checkpoint)
        else:
            word_dict = datasets['train'].word_dict
            classes = datasets['train'].classes
            model = Model(config, word_dict, classes)
        model.train(datasets['train'], datasets['dev'], eval_metric)
        model.load_best()

    test_loader = data_utils.get_dataset_loader(config,
                                                datasets['test'],
                                                train=False)
    evaluate(config,
             model,
             test_loader,
             eval_metric,
             split='test',
             dump=not config.eval)
def main(_):
    data_path = 'data/new-dataset-cornell-length10-filter1-vocabSize40000.pkl'
    word2id, id2word, trainingSamples = load_dataset(data_path)
    hparam = Config()

    with tf.Session() as sess:
        model = Seq2SeqModel(hparam, word2id)
        ckpt = tf.train.get_checkpoint_state(hparam.save_path)
        if FLAGS.resume and ckpt and tf.train.checkpoint_exists(
                ckpt.model_checkpoint_path):
            print("Restoring model parameters from %s." %
                  ckpt.model_checkpoint_path)
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print("Creating model with fresh parameters.")
            sess.run(model.init)

        train_writer = tf.summary.FileWriter(hparam.save_path,
                                             graph=sess.graph)
        for epoch in range(hparam.num_epoch):
            print("Starting Epoch {}/{}:".format(epoch, hparam.num_epoch))
            batches = get_batches(trainingSamples, hparam.batch_size)
            total_loss = 0.0
            total_count = 0
            for nextBatch in tqdm(batches, desc="training"):
                outputs = model.train_session(sess, nextBatch)
                loss = outputs["loss"]
                summary = outputs["summary"]
                step = outputs["step"]
                train_writer.add_summary(summary, step)
                total_loss += loss
                total_count += 1
                if step % hparam.display_per_step == 0:
                    perplexity = math.exp(
                        float(total_loss / total_count)
                    ) if total_loss / total_count < 300 else float('inf')
                    tqdm.write(
                        " Step %d | Per-word Loss %.4f | Perplexity %.4f" %
                        (step, total_loss / total_count, perplexity))

            # save a checkpoint and report the epoch-average loss
            checkpoint_path = os.path.join(hparam.save_path, hparam.model_name)
            model.saver.save(sess, checkpoint_path)
            tqdm.write("\n")
            epoch_loss = total_loss / total_count
            perplexity = math.exp(
                float(epoch_loss)) if epoch_loss < 300 else float('inf')
            tqdm.write(" Epoch %d | Per-word Loss %.4f | Perplexity %.4f" %
                       (epoch, epoch_loss, perplexity))
            tqdm.write("\n")
def log_reg_GD(dataset='iris', lr_rates=[0.1], method='SGD', total_iter=2000):
    x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(dataset)
    # binary task: keep only the second class column and cast booleans to 0/1
    y_train, y_valid, y_test = y_train[:, (1, )], y_valid[:, (1, )], y_test[:, (1, )]
    y_train, y_valid, y_test = _cast_TF(y_train), _cast_TF(y_valid), _cast_TF(
        y_test)
    x_train = np.vstack([x_train, x_valid])
    y_train = np.vstack([y_train, y_valid])

    # prepend a bias column of ones to the design matrices
    X = np.ones((len(x_train), len(x_train[0]) + 1))
    X[:, 1:] = x_train
    X_test = np.ones((len(x_test), len(x_test[0]) + 1))
    X_test[:, 1:] = x_test

    test_accuracies = []
    test_logs = []
    neg_log = {}
    for rate in lr_rates:
        w = np.zeros(np.shape(X[0, :]))
        neg_log[rate] = []
        bar = tqdm.tqdm(total=total_iter, desc='Iter', position=0)
        for iteration in range(total_iter):
            bar.update(1)
            estimates = X @ w
            estimates = estimates.reshape(np.shape(y_train))
            if method == 'SGD':
                # stochastic gradient: one randomly chosen training point
                i = random.randint(0, len(y_train) - 1)
                grad_L = (y_train[i] - _sigmoid(estimates[i])) * X[i, :]
            elif method == 'GD':
                # full-batch gradient: sum over all training points
                grad_L = np.zeros(np.shape(w))
                for i in range(len(y_train)):
                    grad_L += (y_train[i] - _sigmoid(estimates[i])) * X[i, :]
            w = w + (rate * grad_L)
            L = _log_likelihood(estimates, y_train)
            neg_log[rate].append(-L)

        test_estimates = np.dot(X_test, w)
        test_estimates = test_estimates.reshape(np.shape(y_test))
        predictions = np.zeros(np.shape(y_test))
        for i in range(len(predictions)):
            p = _sigmoid(test_estimates[i])
            predictions[i] = (p >= 1 / 2)
        test_accuracies.append(_Q1_compute_acc(y_test, predictions))
        test_logs.append(_log_likelihood(test_estimates, y_test))
    return neg_log, test_accuracies, test_logs
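# The helpers _sigmoid and _log_likelihood are assumed to be defined elsewhere.
# Below is a minimal, numerically stable sketch of what they could compute
# (hypothetical, not the author's implementation): the Bernoulli log likelihood
# is evaluated through np.logaddexp to avoid overflow in exp(-z).
def _sigmoid_sketch(z):
    return 1.0 / (1.0 + np.exp(-np.clip(z, -500, 500)))

def _log_likelihood_sketch(z, y):
    # sum_i [ y_i*log(sigmoid(z_i)) + (1 - y_i)*log(1 - sigmoid(z_i)) ]
    return float(np.sum(-np.logaddexp(0, -z) * y - np.logaddexp(0, z) * (1 - y)))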
def test():
    parser = argparse.ArgumentParser()
    parser.add_argument('--target', choices=['vitB1', 'vitB12', 'folate'])
    parser.add_argument('--modelType', choices=['lr', 'svc', 'rf', 'knn'])
    parser.add_argument('--reverse', action='store_true')
    opt = parser.parse_args()

    # threshold
    th_dict = dict()
    th_dict['vitB1'] = 30
    th_dict['vitB12'] = 180
    th_dict['folate'] = 4

    # load the dataset
    x_df, y_df, date = data_utils.load_dataset(target=opt.target)

    # preprocess the dataset
    x_data, y_data, weight = data_utils.preprocess_dataset(
        x_df, y_df, th=th_dict[opt.target])

    # split into train and test
    n_train = np.sum(date < 20170000)
    if opt.reverse:
        x_data, y_data = x_data[::-1], y_data[::-1]
    x_data, x_test, y_data, y_test = train_test_split(x_data,
                                                      y_data,
                                                      train_size=n_train,
                                                      shuffle=False)

    # model
    if opt.modelType == 'lr':
        model = LogisticRegression(C=1e1,
                                   random_state=42,
                                   class_weight={1: weight})
    elif opt.modelType == 'svc':
        model = SVC(kernel='rbf',
                    C=1e6,
                    gamma=1e-9,
                    class_weight={1: weight},
                    probability=True,
                    random_state=42)
    elif opt.modelType == 'rf':
        model = RandomForestClassifier(n_estimators=50,
                                       min_samples_split=2,
                                       max_depth=10,
                                       class_weight={1: weight},
                                       random_state=42)
    elif opt.modelType == 'knn':
        model = KNeighborsClassifier(algorithm='auto',
                                     leaf_size=1,
                                     metric='minkowski',
                                     metric_params=None,
                                     n_jobs=1,
                                     n_neighbors=37,
                                     p=1,
                                     weights='uniform')

    # fit and predict
    model.fit(x_data, y_data)
    prob_test = model.predict_proba(x_test)[:, 1]

    # evaluation
    auc_value = roc_auc_score(y_test, prob_test)
    print('AUC: {:.4f}'.format(auc_value))
    draw_roc(y_test, prob_test, opt.modelType)
def TimeTaken(d):
    x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
        'rosenbrock', n_train=5000, d=d)
    print(np.shape(x_test[0]))
    print(np.shape(x_train[0]))
    time_init = time.time()
    Test_Error_Tree(x_train, y_train, x_test, y_test)
    time_end = time.time()
    return time_end - time_init
def main(_):
    pp.pprint(flags.FLAGS.__flags)

    if not FLAGS.model_dir:
        print(" [-] Error: Model dir is not set!")
        exit(-1)

    if not os.path.exists(FLAGS.model_dir):
        print(" [*] Creating model directory...")
        os.makedirs(FLAGS.model_dir)
    with open(os.path.join(FLAGS.model_dir, "config.json"),
              'w') as config_file:
        config_file.write("%s" % (pp.pformat(flags.FLAGS.__flags)))

    # build model
    model = model_dict[FLAGS.model](vocab_size=FLAGS.vocab_size,
                                    size=FLAGS.cell_size,
                                    cell_type=FLAGS.cell)

    # load data
    print(" [*] Loading dataset...")
    train_data = data_utils.load_dataset(FLAGS.data_dir,
                                         FLAGS.dataset,
                                         FLAGS.vocab_size,
                                         FLAGS.max_nsteps,
                                         part="training")
    dev_data = data_utils.load_dataset(FLAGS.data_dir,
                                       FLAGS.dataset,
                                       FLAGS.vocab_size,
                                       FLAGS.max_nsteps,
                                       part="validation")
    print(" [+] Finish loading. Train set: %d, Dev set: %d" %
          (len(train_data), len(dev_data)))

    # model.train(train_data, dev_data, nb_epoch=FLAGS.epoch,
    #             batch_size=FLAGS.batch_size, model_dir=FLAGS.model_dir)
    model.batch_train(train_data,
                      dev_data,
                      nb_epoch=FLAGS.epoch,
                      batch_size=FLAGS.batch_size,
                      model_dir=FLAGS.model_dir,
                      evaluate_every=FLAGS.evaluate_every,
                      checkpoint_every=FLAGS.checkpoint_every)
def convert_h5(data_dir, label_dir, data_split, f):
    if data_split:
        train_file_paths, test_file_paths = apply_split(
            data_split, data_dir, label_dir)
    else:
        raise ValueError('Please provide the split ratio')

    print("Training dataset size: ", len(train_file_paths))
    print("Testing dataset size: ", len(test_file_paths))

    # data_train = list of 3D numpy array of training volumes
    # label_train = list of 3D numpy array of training labels
    # _ = list of header of training volumes
    print("Loading and pre-processing Training data...")
    data_train, label_train, _ = du.load_dataset(train_file_paths)
    _write_h5(data_train, label_train, f, mode="train")

    print("Loading and pre-processing Testing data...")
    data_test, label_test, _ = du.load_dataset(test_file_paths)
    _write_h5(data_test, label_test, f, mode="test")
def _svd_classification(dataset='mnist_small'):
    """
    svd on classification dataset
    Inputs:
        dataset: (str) name of dataset
    Outputs:
        accuracy on predicted values
    """
    if dataset == 'rosenbrock':
        x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
            'rosenbrock', n_train=5000, d=2)
    else:
        x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
            dataset)
    x_total = np.vstack([x_train, x_valid])
    y_total = np.vstack([y_train, y_valid])
    X = np.ones((len(x_total), len(x_total[0]) + 1))
    X[:, 1:] = x_total
    U, S, Vh = np.linalg.svd(X)

    # Invert Sigma
    sig = np.diag(S)
    filler = np.zeros([len(x_total) - len(S), len(S)])
    sig_inv = np.linalg.pinv(np.vstack([sig, filler]))

    # Compute weights
    w = Vh.T @ (sig_inv @ (U.T @ y_total))

    # Make test predictions
    X_test = np.ones((len(x_test), len(x_test[0]) + 1))
    X_test[:, 1:] = x_test
    predictions = np.argmax(X_test @ w, axis=1)
    y_test = np.argmax(1 * y_test, axis=1)
    return (predictions == y_test).sum() / len(y_test)
def _svd_regression(dataset='mauna_loa'):
    """
    svd on regression dataset
    Inputs:
        dataset: (str) name of dataset
    Outputs:
        RMSE on predicted values
    """
    if dataset == 'rosenbrock':
        x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
            'rosenbrock', n_train=5000, d=2)
    else:
        x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
            dataset)
    x_total = np.vstack([x_train, x_valid])
    y_total = np.vstack([y_train, y_valid])
    X = np.ones((len(x_total), len(x_total[0]) + 1))
    X[:, 1:] = x_total
    U, S, Vh = np.linalg.svd(X)

    # Invert Sigma
    sig = np.diag(S)
    filler = np.zeros([len(x_total) - len(S), len(S)])
    sig_inv = np.linalg.pinv(np.vstack([sig, filler]))

    # Compute weights
    w = Vh.T @ (sig_inv @ (U.T @ y_total))

    # Make test predictions
    X_test = np.ones((len(x_test), len(x_test[0]) + 1))
    X_test[:, 1:] = x_test
    predictions = X_test @ w
    return _RMSE(y_test, predictions)
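# Sanity-check sketch (not part of the original code): the SVD-based weights in
# _svd_regression and _svd_classification are the minimum-norm least-squares
# solution, so they should match numpy's built-in solvers on the same design
# matrix X and targets y.
def _least_squares_check(X, y):
    w_pinv = np.linalg.pinv(X) @ y
    w_lstsq, _, _, _ = np.linalg.lstsq(X, y, rcond=None)
    return np.allclose(w_pinv, w_lstsq)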
def predict_cross_val(dataset='mauna_loa', k=2, dist_metric='l2', v=5):
    """
    cross validation technique on knn and output predicted values
    Inputs:
        dataset: (str) name of dataset
        k: (int) number of nearest neighbours
        dist_metric: (str) 'l1' or 'l2'
        v: (int) cross validation parameter, number of cross folds
    Outputs:
        [predict_x, GroundTruth_y, predicted_y]
    """
    x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(dataset)
    x_train = np.vstack([x_valid, x_train])
    y_train = np.vstack([y_valid, y_train])
    np.random.seed(42)
    np.random.shuffle(x_train)
    np.random.seed(42)
    np.random.shuffle(y_train)
    data_partition = _partition_fold(v=v, data=x_train)
    predicted_y = np.empty((0, y_train.shape[-1]))
    for fold in range(v):
        print('------Processing Fold ' + str(fold + 1) + ' ------')
        train_x = np.delete(x_train, data_partition[fold], axis=0)
        train_y = np.delete(y_train, data_partition[fold], axis=0)
        query_x = np.take(x_train, data_partition[fold], axis=0)
        query_y = np.take(y_train, data_partition[fold], axis=0)
        curr_predict = _eval_knn([k, k + 1],
                                 train_x,
                                 train_y,
                                 query_x,
                                 query_y,
                                 dist_metric=dist_metric,
                                 compute_loss=False)
        predicted_y = np.append(predicted_y,
                                curr_predict['k=' + str(k)],
                                axis=0)
    rval = []
    for idx in range(x_train.shape[0]):
        rval.append((x_train[idx], y_train[idx], predicted_y[idx]))
    rval.sort(key=lambda tup: tup[0])
    return [i[0] for i in rval], [i[1] for i in rval], [i[2] for i in rval]
def question1a():
    x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset('iris')
    y_train, y_valid, y_test = y_train[:, (1,)], y_valid[:, (1,)], y_test[:, (1,)]
    learningRate = 0.0001
    maxIterations = 1000
    x_train, x_test = np.vstack((x_train, x_valid)), x_test
    y_train, y_test = np.vstack((y_train, y_valid)), y_test
    varianceList = [0.5, 1, 2]

    print("\nResults for question 1:\n")
    for variance in varianceList:
        margLikelihood, iterations, w, H = laplaceApproximation(
            x_train, x_test, y_train, y_test, learningRate, variance,
            maxIterations)
        print("For a variance of {}:".format(variance))
        print("Iterations = {}".format(iterations))
        print("Marginal log likelihood = {}\n".format(margLikelihood))
def load_initial_dataset():
    dataset_folder = Path("../datasets/")
    try:
        # Try to load a cached version of the dataframe
        print("Trying to load the cached dataframe...")
        df = pd.read_pickle(dataset_folder / 'cached_dataframe.pkl2')
        print("Done")
    except Exception:
        print("No cached dataframe, loading the dataset from disk")
        path_file = dataset_folder / 'Cell_Phones_and_Accessories_5.json'
        df = load_dataset(path_file)
        # Store the dataframe on disk
        print("Caching the dataframe")
        df.to_pickle(dataset_folder / 'cached_dataframe.pkl2')
    return df
def loadData(datasetName, d=2):
    '''
    Loads the dataset and normalizes the x_ sets
    INPUT:
        datasetName: a string of the name of the file to be loaded. Note that
                     this file must be in the same path as this file
    OUTPUT:
        an index list plus x_all, x_test, y_all, y_test, where the x arrays
        are normalized
    '''
    if datasetName == 'rosenbrock':
        x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
            datasetName, n_train=1000, d=d)
    else:
        x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
            datasetName)
    x_all = np.concatenate([x_train, x_valid])
    y_all = np.concatenate([y_train, y_valid])
    index_all = list(range(np.shape(x_all)[0]))
    random.shuffle(index_all)

    # Normalization of each x data
    mean = x_all.mean(axis=0, keepdims=True)
    stddev = x_all.std(axis=0, keepdims=True)
    x_all = normalization(x_all, mean, stddev)
    x_test = normalization(x_test, mean, stddev)

    return index_all, x_all, x_test, y_all, y_test
def _kd_tree(dataset='rosenbrock', dist_metric='l2', k=5, d=2):
    """
    knn using kd_tree
    Inputs:
        dataset: (str) name of dataset
        k: (int) number of nearest neighbours
        dist_metric: (str) 'l1' or 'l2'
        d: (int) data dimensionality
    Outputs:
        RMSE on predicted values
    """
    x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
        dataset, n_train=5000, d=d)
    # build the tree with the requested metric ('l2' -> euclidean, 'l1' -> manhattan)
    kdt = neighbors.KDTree(
        x_train, metric='euclidean' if dist_metric == 'l2' else 'manhattan')
    _, index = kdt.query(x_test, k=k)
    predictions = np.sum(y_train[index], axis=1) / k
    return _RMSE(y_test, predictions)
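# Usage sketch (assumes the rosenbrock loader above): kd-tree queries are fast
# for low-dimensional data but their advantage over brute-force knn shrinks as
# the dimensionality d grows, so timing _kd_tree for a few values of d
# illustrates when the tree stops helping.
import time

def _time_kd_tree(d_values=(2, 4, 8)):
    timings = {}
    for d in d_values:
        start = time.time()
        rmse = _kd_tree(dataset='rosenbrock', dist_metric='l2', k=5, d=d)
        timings[d] = (time.time() - start, rmse)
    return timings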
def _test_classification(dataset='iris', k_range=[1, 2], dist_metric='l1'):
    """
    run knn and output predicted values on classification test data
    Inputs:
        dataset: (str) name of dataset
        k_range: (list) k_range[0]: lower bound of number of nearest neighbours;
                 k_range[1]: upper bound of number of nearest neighbours
        dist_metric: (str) 'l1' or 'l2'
    Outputs:
        accuracy of predicted values referred to GroundTruth
    """
    print('------Processing Dataset ' + dataset + ' ------')
    x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(dataset)
    if y_train.dtype == np.dtype('bool'):
        y_train = _cast_TF(y_train)
        y_valid = _cast_TF(y_valid)
        y_test = _cast_TF(y_test)
    acc = []
    predicted = _eval_knn(k_range,
                          x_train,
                          y_train,
                          x_test,
                          y_test,
                          dist_metric,
                          compute_loss=False)
    for k in range(k_range[0], k_range[1]):
        curr_predict = predicted['k=' + str(k)]
        result = np.argmax(curr_predict, axis=1)
        gt = np.where(y_test == True, 1, 0)
        gt = np.argmax(gt, axis=1)
        # count predictions that match the ground truth (difference == 0)
        unique, counts = np.unique(result - gt, return_counts=True)
        correct = dict(zip(unique, counts)).get(0, 0)
        acc.append(correct / y_test.shape[0])
    return acc
def create_test_train_fold(fold_num):
    """Splits the dataset into training and held-out test set."""
    data_x, data_y, _ = data_utils.load_dataset(FLAGS.dataset_name)
    tf.logging.info('Dataset: %s, Size: %d', FLAGS.dataset_name,
                    data_x.shape[0])
    tf.logging.info('Cross-val fold: %d/%d', FLAGS.fold_num, _N_FOLDS)
    # Get the training and test set based on the StratifiedKFold split
    (x_train_all, y_train_all), test_dataset = data_utils.get_train_test_fold(
        data_x,
        data_y,
        fold_num=fold_num,
        num_folds=_N_FOLDS,
        stratified=not FLAGS.regression)
    data_gen = data_utils.split_training_dataset(
        x_train_all,
        y_train_all,
        FLAGS.num_splits,
        stratified=not FLAGS.regression)
    return data_gen, test_dataset
def run_Q1a(dataset='iris', lr=0.001):
    x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(dataset)
    y_train, y_valid, y_test = y_train[:, (1, )], y_valid[:, (1, )], y_test[:, (1, )]
    x_train, x_test = np.vstack((x_train, x_valid)), x_test
    y_train, y_test = np.vstack((y_train, y_valid)), y_test
    var_list = [0.5, 1, 2]
    X_train = _generate_X(x_train)
    X_test = _generate_X(x_test)
    marginal_likelihoods, rval_w = {}, None
    for variance in var_list:
        # gradient ascent on the log posterior until the gradient is small
        w = np.zeros(np.shape(X_train[0]))
        x_prod = np.reshape(X_train @ w, np.shape(y_train))
        posterior_grad = _likelihood_grad(X_train, x_prod,
                                          y_train) + _prior_grad(w, variance)
        while max(posterior_grad) >= 10**(-2):
            x_prod = X_train @ w
            posterior_grad = _likelihood_grad(
                X_train, x_prod, y_train) + _prior_grad(w, variance)
            w = w + (lr * posterior_grad)

        # Laplace approximation to the marginal likelihood at the MAP estimate
        hessian = _likelihood_hess(X_train, x_prod) + _prior_hess(w, variance)
        marginal_likelihoods[variance] = _log_likelihood(
            x_prod, y_train) + _log_prior(w, variance) - _log_g(hessian)
        if variance == 1:
            rval_w = w
    print(marginal_likelihoods)
    print(rval_w)
    return marginal_likelihoods, rval_w
def train(
    self,
    sess,
    vocab_size,
    epoch=25,
    learning_rate=0.0002,
    momentum=0.9,
    decay=0.95,
    data_dir="data",
    dataset_name="cnn",
):
    self.prepare_model(data_dir, dataset_name, vocab_size)

    start = time.clock()
    print(" [*] Calculating gradient and loss...")
    self.optim = tf.train.AdamOptimizer(learning_rate, 0.9).minimize(self.loss)
    print(" [*] Calculating gradient and loss finished. Took %.2fs" %
          (time.clock() - start))

    # Could not use RMSPropOptimizer because the sparse update of RMSPropOptimizer
    # is not implemented yet (2016.01.24).
    # self.optim = tf.train.RMSPropOptimizer(learning_rate,
    #                                        decay=decay,
    #                                        momentum=momentum).minimize(self.loss)

    sess.run(tf.initialize_all_variables())

    if self.load(sess, self.checkpoint_dir, dataset_name):
        print(" [*] Deep LSTM checkpoint is loaded.")
    else:
        print(" [*] There is no checkpoint for this model.")

    y = np.zeros([self.batch_size, self.vocab_size])

    merged = tf.merge_all_summaries()
    writer = tf.train.SummaryWriter("/tmp/deep", sess.graph_def)

    counter = 0
    start_time = time.time()
    for epoch_idx in xrange(epoch):
        data_loader = load_dataset(data_dir, dataset_name, vocab_size)

        batch_stop = False
        while True:
            y.fill(0)
            inputs, nstarts, answers = [], [], []
            batch_idx = 0
            while True:
                try:
                    (_, document, question, answer, _), data_idx, \
                        data_max_idx = data_loader.next()
                except StopIteration:
                    batch_stop = True
                    break

                # [0] means splitter between d and q
                data = [int(d) for d in document.split()] + [0] + \
                       [int(q) for q in question.split()]

                if len(data) > self.max_nsteps:
                    continue

                inputs.append(data)
                nstarts.append(len(inputs[-1]) - 1)
                y[batch_idx][int(answer)] = 1

                batch_idx += 1
                if batch_idx == self.batch_size:
                    break
            if batch_stop:
                break

            FORCE = False
            if FORCE:
                inputs = array_pad(inputs, self.max_nsteps, pad=-1, force=FORCE)
                nstarts = np.where(inputs == -1)[1]
                inputs[inputs == -1] = 0
            else:
                inputs = array_pad(inputs, self.max_nsteps, pad=0)
            nstarts = [[nstart, idx, 0] for idx, nstart in enumerate(nstarts)]

            _, summary_str, cost, accuracy = sess.run(
                [self.optim, merged, self.loss, self.accuracy],
                feed_dict={
                    self.inputs: inputs,
                    self.nstarts: nstarts,
                    self.y: y
                })
            if counter % 10 == 0:
                writer.add_summary(summary_str, counter)
                print(
                    "Epoch: [%2d] [%4d/%4d] time: %4.4f, loss: %.8f, accuracy: %.8f"
                    % (epoch_idx, data_idx, data_max_idx,
                       time.time() - start_time, np.mean(cost), accuracy))
            counter += 1
    self.save(sess, self.checkpoint_dir, dataset_name)