def _test_regression(dataset='mauna_loa', k=1, dist_metric='l2', d=2):
    """
    compute test loss on regression dataset
    Inputs:
        dataset: (str) name of dataset
        k: (int) number of nearest neighbours to test on
        dist_metric: (str) 'l1' or 'l2'
        d: (int, optional) if name='rosenbrock' then specify the dataset dimensionality
    Outputs:
        RMSE on test set of the dataset
    """
    if dataset == 'rosenbrock':
        x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
            'rosenbrock', n_train=5000, d=d)
    else:
        x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
            dataset)
    x_train = np.vstack([x_valid, x_train])
    y_train = np.vstack([y_valid, y_train])
    return _eval_knn([k, k + 1],
                     x_train,
                     y_train,
                     x_test,
                     y_test,
                     dist_metric,
                     compute_loss=True)
def convert_h5(data_dir,
               label_dir,
               data_split,
               train_volumes,
               test_volumes,
               f,
               data_id,
               remap_config='Neo',
               orientation=preprocessor.ORIENTATION['coronal']):
    # Data splitting
    if data_split:
        train_file_paths, test_file_paths = apply_split(
            data_split, data_dir, label_dir)
    elif train_volumes and test_volumes:
        train_file_paths = du.load_file_paths(data_dir, label_dir, data_id,
                                              train_volumes)
        test_file_paths = du.load_file_paths(data_dir, label_dir, data_id,
                                             test_volumes)
    else:
        raise ValueError(
            'You must either provide the split ratio or a train and test dataset list'
        )

    reduce_slices = False  # True #BORIS

    print("Train dataset size: %d, Test dataset size: %d" %
          (len(train_file_paths), len(test_file_paths)))

    # loading, pre-processing and writing train data
    print("===Train data===")
    data_train, label_train, class_weights_train, weights_train, _ = du.load_dataset(
        train_file_paths,
        orientation,
        remap_config=remap_config,
        return_weights=True,
        reduce_slices=reduce_slices,  # BORIS
        remove_black=True)

    _write_h5(data_train,
              label_train,
              class_weights_train,
              weights_train,
              f,
              mode='train')

    # loading, pre-processing and writing test data
    print("===Test data===")
    data_test, label_test, class_weights_test, weights_test, _ = du.load_dataset(
        test_file_paths,
        orientation,
        remap_config=remap_config,
        return_weights=True,
        reduce_slices=reduce_slices,  # BORIS
        remove_black=True)

    _write_h5(data_test,
              label_test,
              class_weights_test,
              weights_test,
              f,
              mode='test')
def main():
    print("\nParameters:")
    for attr, value in args.__dict__.items():
        print("\t{}={}".format(attr.upper(), value))

    # load data
    strain_data, sd_train_data, sdev_data, stest_data, embeddings = \
        data_utils.load_dataset(args, 'askubuntu-master', dtrain=True)
    dtrain_data, ddev_data, dtest_data, _ = \
        data_utils.load_dataset(args, 'Android-master')

    # initialize necessary parameters
    args.embed_num = embeddings.shape[0]
    args.kernel_sizes = [int(k) for k in args.kernel_sizes.split(',')]

    # load model
    if args.snapshot is None:
        # initialize model
        task_model = None
        if args.model == 'lstm':
            if args.bidirectional and (args.hidden_layer > 1):
                args.hidden_layer = 1
                print('\nMultilayer bidirectional LSTM not supported yet, '
                      'layer set to 1.\n')
            task_model = model.LSTM(args, embeddings)
        elif args.model == 'cnn':
            task_model = model.CNN(args, embeddings)
        domain_model = model.DomainClassifier(args, embeddings)

        # train models
        res = train2.train_model(strain_data, sd_train_data, sdev_data,
                                 stest_data, dtrain_data, ddev_data,
                                 dtest_data, task_model, domain_model, args)
    else:
        print('\nLoading model from [%s]...' % args.snapshot)
        try:
            mod = torch.load(args.snapshot)
        except Exception:
            print("Sorry, this snapshot doesn't exist.")
            exit()
        print(mod)

        # evaluate
        print('\nEvaluating on target dev')
        evaluate.q_evaluate(mod, ddev_data, args)
        print('Evaluating on target test')
        evaluate.q_evaluate(mod, dtest_data, args)
def _cross_val(dataset='mauna_loa', k=[1, 10], dist_metric='l1', v=5):
    """
    cross validation technique on knn
    Inputs:
        dataset: (str) name of dataset
        k: (list) k[0]: lower bound of number of nearest neighbours;
                  k[1]: upper bound of number of nearest neighbours
        dist_metric: (str) 'l1' or 'l2'
        v: (int) cross validation parameter, number of cross folds
    Outputs:
        averaged validation loss
    """
    print('------Processing Dataset ' + dataset + ' ------')
    if dataset == 'rosenbrock':
        x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
            'rosenbrock', n_train=5000, d=2)
    else:
        x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
            dataset)
    x_train = np.vstack([x_valid, x_train])
    y_train = np.vstack([y_valid, y_train])
    np.random.seed(42)
    np.random.shuffle(x_train)
    np.random.seed(42)
    np.random.shuffle(y_train)
    data_partition = _partition_fold(v=v, data=x_train)
    loss = np.empty((0, k[1] - k[0]))
    for fold in range(v):
        print('------Processing Fold ' + str(fold + 1) + ' ------')
        train_x = np.delete(x_train, list(data_partition[fold]), axis=0)
        train_y = np.delete(y_train, list(data_partition[fold]), axis=0)
        query_x = np.take(x_train, list(data_partition[fold]), axis=0)
        query_y = np.take(y_train, list(data_partition[fold]), axis=0)
        curr_loss = _eval_knn(k,
                              train_x,
                              train_y,
                              query_x,
                              query_y,
                              dist_metric=dist_metric)
        loss = np.append(loss, [curr_loss], axis=0)
    loss = loss.mean(axis=0)
    return loss
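# The helpers _partition_fold and _eval_knn are assumed to be defined elsewhere
# in this module. As a rough illustration only, a minimal _partition_fold could
# look like the sketch below (hypothetical, not the author's implementation):
# it splits the row indices of `data` into v nearly equal folds.
def _partition_fold_sketch(v, data):
    n = data.shape[0]
    indices = np.arange(n)
    # np.array_split allows folds of unequal size when n is not divisible by v
    return np.array_split(indices, v)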
def main(_):
    data_path = 'data/new-dataset-cornell-length10-filter1-vocabSize40000.pkl'
    word2id, id2word, trainingSamples = load_dataset(data_path)
    hparam = Config()
    hparam.is_training = False

    with tf.Session() as sess:
        model = Seq2SeqModel(hparam, word2id)
        ckpt = tf.train.get_checkpoint_state(hparam.save_path)
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
            print("Restoring model parameters from %s." %
                  ckpt.model_checkpoint_path)
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print("Creating model with fresh parameters.")
            sess.run(model.init)

        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            batch = sentence_preprocess(sentence, word2id)
            outputs = model.infer_session(sess, batch)
            predicted_ids = outputs["predicted_ids"]
            out_sents = [id2word[idx] for idx in predicted_ids[0][0].tolist()]
            print(" ".join(out_sents))
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
def evaluate_model(model_path,
                   dataset_path='emnist/emnist-balanced-test.csv'):
    raw_test_x, raw_test_y, class_map = data_utils.load_dataset(dataset_path)
    test_x, test_y, _ = data_utils.prepare_data(raw_test_x, raw_test_y,
                                                class_map)

    best_model = load_model(model_path)
    print(best_model.evaluate(test_x, test_y))
    data_utils.print_confusion_matrix(test_x, test_y, model_path, class_map)
def _test_predict(l=0):
    x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
        'mauna_loa')
    x_total = np.vstack([x_train, x_valid])
    y_total = np.vstack([y_train, y_valid])
    phi_train = _construct_phi(x_total)
    phi_test = _construct_phi(x_test)

    U, S, Vh = np.linalg.svd(phi_train)

    # Invert Sigma: rebuild the rectangular Sigma from the singular values
    sig = np.diag(S)
    filler = np.zeros([phi_train.shape[0] - len(S), len(S)])
    sig = np.vstack([sig, filler])

    # regularized least-squares weights: w = V (Sigma^T Sigma + l*I)^{-1} Sigma^T U^T y
    inv = np.linalg.inv(sig.T @ sig + l * np.eye(sig.shape[1]))
    w = Vh.T @ inv @ sig.T @ (U.T @ y_total)

    prediction = phi_test @ w
    plot(xlabel='x',
         ylabel='y',
         name='mauna_loa_predict',
         x=x_test,
         y=[prediction, y_test],
         legend=['Predicted', 'GroundTruth'])
    return _RMSE(prediction, y_test)
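# Sanity-check sketch (not part of the original code): for l > 0 the SVD-based
# weights above should agree with the ridge normal-equations solution
# w = (Phi^T Phi + l*I)^{-1} Phi^T y computed directly with numpy.
def _ridge_weights_check(phi_train, y_total, l=1e-3):
    m = phi_train.shape[1]
    w_direct = np.linalg.solve(phi_train.T @ phi_train + l * np.eye(m),
                               phi_train.T @ y_total)
    return w_direct  # compare against the SVD-based w with np.allclose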
def predict_test(dataset='mauna_loa', k=2, dist_metric='l2'):
    """
    run knn and output predicted values on regression test data
    Inputs:
        dataset: (str) name of dataset
        k: (int) number of nearest neighbours
        dist_metric: (str) 'l1' or 'l2'
    Outputs:
        [predict_x, GroundTruth_y, predicted_y]
    """
    x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(dataset)
    x_train = np.vstack([x_valid, x_train])
    y_train = np.vstack([y_valid, y_train])
    predicted_y = np.empty((0, y_train.shape[-1]))
    curr_predict = _eval_knn([k, k + 1],
                             x_train,
                             y_train,
                             x_test,
                             y_test,
                             dist_metric=dist_metric,
                             compute_loss=False)
    predicted_y = np.append(predicted_y, curr_predict['k=' + str(k)], axis=0)
    rval = []
    for idx in range(x_test.shape[0]):
        rval.append((x_test[idx], y_test[idx], predicted_y[idx]))
    rval.sort(key=lambda tup: tup[0])
    return [i[0] for i in rval], [i[1] for i in rval], [i[2] for i in rval]
def run_Q5():
    theta_list, test_loss = [0.01, 0.1, 1.0], []
    x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
        'rosenbrock', n_train=200, d=2)
    for theta in theta_list:
        print('----- Processing Theta = ' + str(theta) + ' -----')
        # greedily select a sparse set of basis points and their weights
        I_selected, w = _greedy_alg(x_train, y_train, theta=theta)

        # build the test kernel matrix one test point at a time
        big_K = np.empty((0, len(I_selected)))
        for i in range(x_test.shape[0]):
            build_kernel = _test_kernel(basis=I_selected,
                                        x_train=x_train,
                                        test_pt=x_test[i],
                                        theta=theta)
            big_K = np.append(big_K, [build_kernel], axis=0)

        predicted_y = np.dot(big_K, w)
        loss = _RMSE(predicted_y, y_test)
        test_loss.append(loss)
        print('Test Loss: ' + str(loss))
    return test_loss
def run_example():
    """
    This example demonstrates computation of the negative log likelihood (nll)
    as well as the gradient of the nll with respect to all weights and biases
    of the neural network. We will use 50 neurons per hidden layer and will
    initialize all weights and biases to zero.
    """
    # load the MNIST_small dataset
    from data_utils import load_dataset
    x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
        'mnist_small')

    # initialize the weights and biases of the network
    M = 50  # 50 neurons per hidden layer
    W1 = np.zeros((M, 784))  # weights of first (hidden) layer
    W2 = np.zeros((M, M))  # weights of second (hidden) layer
    W3 = np.zeros((10, M))  # weights of third (output) layer
    b1 = np.zeros((M, 1))  # biases of first (hidden) layer
    b2 = np.zeros((M, 1))  # biases of second (hidden) layer
    b3 = np.zeros((10, 1))  # biases of third (output) layer

    # considering the first 250 points in the training set,
    # compute the negative log likelihood and its gradients
    (nll, (W1_grad, W2_grad, W3_grad, b1_grad, b2_grad, b3_grad)) = \
        nll_gradients(W1, W2, W3, b1, b2, b3, x_train[:250], y_train[:250])
    print("negative log likelihood: %.5f" % nll)
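# A minimal finite-difference sketch (not from the original code) for spot
# checking one entry of W1_grad returned by nll_gradients: perturb a single
# weight, recompute the nll, and compare the forward-difference slope against
# the analytic gradient. Assumes the nll_gradients signature used in run_example.
def _check_one_gradient_entry(W1, W2, W3, b1, b2, b3, x, y, i=0, j=0,
                              eps=1e-6):
    W1_plus = W1.copy()
    W1_plus[i, j] += eps
    nll0, (W1_grad, _, _, _, _, _) = nll_gradients(W1, W2, W3, b1, b2, b3,
                                                   x, y)
    nll1, _ = nll_gradients(W1_plus, W2, W3, b1, b2, b3, x, y)
    numeric = (nll1 - nll0) / eps  # forward-difference estimate
    return numeric, W1_grad[i, j]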
def run_Q3(l=0.1):
    x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
        'mauna_loa')
    x_total = np.vstack([x_train, x_valid])
    y_total = np.vstack([y_train, y_valid])

    # kernel ridge regression solved via the Cholesky factor of (K + l*I):
    # alpha = (K + l*I)^{-1} y, prediction = K_test @ alpha
    K = _Q3_construct_K(x_total)
    R = np.linalg.cholesky(K + l * np.eye(len(K)))
    R_inv = np.linalg.inv(R)
    alpha = R_inv.T @ R_inv @ y_total

    K_test = _Q3_construct_test_K(x_total, x_test)
    prediction = K_test @ alpha
    plot(xlabel='x',
         ylabel='y',
         name='mauna_loa_predict_CH',
         x=x_test,
         y=[prediction, y_test],
         legend=['Predicted', 'GroundTruth'])

    # visualize the kernel around x = 0 and x = 1
    z = np.linspace(-0.1, 0.1, 100)
    x = [0] * len(z)
    _visualize_kernel(x, z, 'k(0,z)')
    z = np.linspace(-0.1 + 1, 0.1 + 1, 100)
    x = [1] * len(z)
    _visualize_kernel(x, z, 'k(1,z+1)')
    return _RMSE(prediction, y_test)
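# A minimal alternative sketch (assumption: scipy is available) that avoids
# forming an explicit inverse of the Cholesky factor; cho_solve performs the
# two triangular solves for alpha = (K + l*I)^{-1} y directly.
from scipy.linalg import cho_factor, cho_solve

def _solve_alpha_cholesky(K, y, l=0.1):
    c, low = cho_factor(K + l * np.eye(len(K)), lower=True)
    return cho_solve((c, low), y)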
def model_06():
    # load the dataset
    X_train, Y_train, X_test, Y_test = load_dataset()  # data

    # set the parameters
    layers_dims = [X_train.shape[0], 1]
    num_iter = 2000
    learning_rate = 0.5
    print_cost = False
    initialization = "he"
    parameters, costs = basic_model(X_train,
                                    Y_train,
                                    layers_dims=layers_dims,
                                    num_iter=num_iter,
                                    lr=learning_rate,
                                    print_cost=print_cost,
                                    initialization=initialization)

    # predict and evaluate
    prediction_train = predict(parameters, X_train)
    prediction_test = predict(parameters, X_test)
    print("Train accuracy: {}".format(evaluate(prediction_train, Y_train)))
    print("Test accuracy: {}".format(evaluate(prediction_test, Y_test)))

    plt.title("Model with He initialization")
    axes = plt.gca()
    axes.set_xlim([-1.5, 1.5])
    axes.set_ylim([-1.5, 1.5])
    plot_decision_boundary(lambda x: predict(parameters, x.T), X_train,
                           Y_train)
    plt.show()
def train(self,
          epoch=25,
          batch_size=1,
          learning_rate=0.0002,
          momentum=0.9,
          decay=0.95,
          data_dir="data",
          dataset_name="cnn",
          vocab_size=1000000):
    if not self.vocab:
        self.vocab, self.rev_vocab = load_vocab(data_dir, dataset_name,
                                                vocab_size)

    self.opt = tf.train.RMSPropOptimizer(learning_rate,
                                         decay=decay,
                                         momentum=momentum)

    for epoch_idx in xrange(epoch):
        data_loader = load_dataset(data_dir, dataset_name, vocab_size)

        contexts, questions, answers = [], [], []
        for batch_idx in xrange(batch_size):
            _, context, question, answer, _ = data_loader.next()
            contexts.append(context)
            questions.append(question)
            answers.append(answer)
def main():
    config = get_config()
    config = init_env(config)

    datasets = data_utils.load_dataset(config)
    eval_metric = FewShotMetrics(config, datasets)
    if config.eval:
        model = Model.load(config, config.load_checkpoint)
    else:
        if config.load_checkpoint:
            model = Model.load(config, config.load_checkpoint)
        else:
            word_dict = datasets['train'].word_dict
            classes = datasets['train'].classes
            model = Model(config, word_dict, classes)
        model.train(datasets['train'], datasets['dev'], eval_metric)
        model.load_best()

    test_loader = data_utils.get_dataset_loader(config,
                                                datasets['test'],
                                                train=False)
    evaluate(config,
             model,
             test_loader,
             eval_metric,
             split='test',
             dump=not config.eval)
def main(_):
    data_path = 'data/new-dataset-cornell-length10-filter1-vocabSize40000.pkl'
    word2id, id2word, trainingSamples = load_dataset(data_path)
    hparam = Config()

    with tf.Session() as sess:
        model = Seq2SeqModel(hparam, word2id)
        ckpt = tf.train.get_checkpoint_state(hparam.save_path)
        if FLAGS.resume and ckpt and tf.train.checkpoint_exists(
                ckpt.model_checkpoint_path):
            print("Restoring model parameters from %s." %
                  ckpt.model_checkpoint_path)
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print("Creating model with fresh parameters.")
            sess.run(model.init)

        train_writer = tf.summary.FileWriter(hparam.save_path,
                                             graph=sess.graph)
        for epoch in range(hparam.num_epoch):
            print("Starting Epoch {}/{}:".format(epoch, hparam.num_epoch))
            batches = get_batches(trainingSamples, hparam.batch_size)
            total_loss = 0.0
            total_count = 0
            for nextBatch in tqdm(batches, desc="training"):
                outputs = model.train_session(sess, nextBatch)
                loss = outputs["loss"]
                summary = outputs["summary"]
                step = outputs["step"]
                train_writer.add_summary(summary, step)
                total_loss += loss
                total_count += 1
                if step % hparam.display_per_step == 0:
                    perplexity = math.exp(
                        float(total_loss / total_count)
                    ) if total_loss / total_count < 300 else float('inf')
                    tqdm.write(
                        " Step %d | Per-word Loss %.4f | Perplexity %.4f" %
                        (step, total_loss / total_count, perplexity))

            # save a checkpoint and report the epoch-average loss
            checkpoint_path = os.path.join(hparam.save_path, hparam.model_name)
            model.saver.save(sess, checkpoint_path)
            tqdm.write("\n")
            epoch_loss = total_loss / total_count
            perplexity = math.exp(
                float(epoch_loss)) if epoch_loss < 300 else float('inf')
            tqdm.write(" Epoch %d | Per-word Loss %.4f | Perplexity %.4f" %
                       (epoch, epoch_loss, perplexity))
            tqdm.write("\n")
def log_reg_GD(dataset='iris', lr_rates=[0.1], method='SGD', total_iter=2000):
    x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(dataset)
    # binary task: keep only the second class column and cast booleans to 0/1
    y_train, y_valid, y_test = y_train[:, (1, )], y_valid[:, (1, )], y_test[:, (1, )]
    y_train, y_valid, y_test = _cast_TF(y_train), _cast_TF(y_valid), _cast_TF(
        y_test)
    x_train = np.vstack([x_train, x_valid])
    y_train = np.vstack([y_train, y_valid])

    # prepend a bias column of ones to the design matrices
    X = np.ones((len(x_train), len(x_train[0]) + 1))
    X[:, 1:] = x_train
    X_test = np.ones((len(x_test), len(x_test[0]) + 1))
    X_test[:, 1:] = x_test

    test_accuracies = []
    test_logs = []
    neg_log = {}
    for rate in lr_rates:
        w = np.zeros(np.shape(X[0, :]))
        neg_log[rate] = []
        bar = tqdm.tqdm(total=total_iter, desc='Iter', position=0)
        for iteration in range(total_iter):
            bar.update(1)
            estimates = X @ w
            estimates = estimates.reshape(np.shape(y_train))
            if method == 'SGD':
                # stochastic gradient: one randomly chosen training point
                i = random.randint(0, len(y_train) - 1)
                grad_L = (y_train[i] - _sigmoid(estimates[i])) * X[i, :]
            elif method == 'GD':
                # full-batch gradient: sum over all training points
                grad_L = np.zeros(np.shape(w))
                for i in range(len(y_train)):
                    grad_L += (y_train[i] - _sigmoid(estimates[i])) * X[i, :]
            w = w + (rate * grad_L)
            L = _log_likelihood(estimates, y_train)
            neg_log[rate].append(-L)

        test_estimates = np.dot(X_test, w)
        test_estimates = test_estimates.reshape(np.shape(y_test))
        predictions = np.zeros(np.shape(y_test))
        for i in range(len(predictions)):
            p = _sigmoid(test_estimates[i])
            predictions[i] = (p >= 1 / 2)
        test_accuracies.append(_Q1_compute_acc(y_test, predictions))
        test_logs.append(_log_likelihood(test_estimates, y_test))
    return neg_log, test_accuracies, test_logs
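# The helpers _sigmoid and _log_likelihood are assumed to be defined elsewhere.
# Below is a minimal, numerically stable sketch of what they could compute
# (hypothetical, not the author's implementation): the Bernoulli log likelihood
# is evaluated through np.logaddexp to avoid overflow in exp(-z).
def _sigmoid_sketch(z):
    return 1.0 / (1.0 + np.exp(-np.clip(z, -500, 500)))

def _log_likelihood_sketch(z, y):
    # sum_i [ y_i*log(sigmoid(z_i)) + (1 - y_i)*log(1 - sigmoid(z_i)) ]
    return float(np.sum(-np.logaddexp(0, -z) * y - np.logaddexp(0, z) * (1 - y)))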
def test():
    parser = argparse.ArgumentParser()
    parser.add_argument('--target', choices=['vitB1', 'vitB12', 'folate'])
    parser.add_argument('--modelType', choices=['lr', 'svc', 'rf', 'knn'])
    parser.add_argument('--reverse', action='store_true')
    opt = parser.parse_args()

    # threshold
    th_dict = dict()
    th_dict['vitB1'] = 30
    th_dict['vitB12'] = 180
    th_dict['folate'] = 4

    # load the dataset
    x_df, y_df, date = data_utils.load_dataset(target=opt.target)

    # preprocess the dataset
    x_data, y_data, weight = data_utils.preprocess_dataset(
        x_df, y_df, th=th_dict[opt.target])

    # split into train and test
    n_train = np.sum(date < 20170000)
    if opt.reverse:
        x_data, y_data = x_data[::-1], y_data[::-1]
    x_data, x_test, y_data, y_test = train_test_split(x_data,
                                                      y_data,
                                                      train_size=n_train,
                                                      shuffle=False)

    # model
    if opt.modelType == 'lr':
        model = LogisticRegression(C=1e1,
                                   random_state=42,
                                   class_weight={1: weight})
    elif opt.modelType == 'svc':
        model = SVC(kernel='rbf',
                    C=1e6,
                    gamma=1e-9,
                    class_weight={1: weight},
                    probability=True,
                    random_state=42)
    elif opt.modelType == 'rf':
        model = RandomForestClassifier(n_estimators=50,
                                       min_samples_split=2,
                                       max_depth=10,
                                       class_weight={1: weight},
                                       random_state=42)
    elif opt.modelType == 'knn':
        model = KNeighborsClassifier(algorithm='auto',
                                     leaf_size=1,
                                     metric='minkowski',
                                     metric_params=None,
                                     n_jobs=1,
                                     n_neighbors=37,
                                     p=1,
                                     weights='uniform')

    # fit and predict
    model.fit(x_data, y_data)
    prob_test = model.predict_proba(x_test)[:, 1]

    # evaluation
    auc_value = roc_auc_score(y_test, prob_test)
    print('AUC: {:.4f}'.format(auc_value))
    draw_roc(y_test, prob_test, opt.modelType)
def TimeTaken(d):
    x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
        'rosenbrock', n_train=5000, d=d)
    print(np.shape(x_test[0]))
    print(np.shape(x_train[0]))
    time_init = time.time()
    Test_Error_Tree(x_train, y_train, x_test, y_test)
    time_end = time.time()
    return time_end - time_init
def main(_):
    pp.pprint(flags.FLAGS.__flags)

    if not FLAGS.model_dir:
        print(" [-] Error: Model dir is not set!")
        exit(-1)

    if not os.path.exists(FLAGS.model_dir):
        print(" [*] Creating model directory...")
        os.makedirs(FLAGS.model_dir)
    with open(os.path.join(FLAGS.model_dir, "config.json"),
              'w') as config_file:
        config_file.write("%s" % (pp.pformat(flags.FLAGS.__flags)))

    # build model
    model = model_dict[FLAGS.model](vocab_size=FLAGS.vocab_size,
                                    size=FLAGS.cell_size,
                                    cell_type=FLAGS.cell)

    # load data
    print(" [*] Loading dataset...")
    train_data = data_utils.load_dataset(FLAGS.data_dir,
                                         FLAGS.dataset,
                                         FLAGS.vocab_size,
                                         FLAGS.max_nsteps,
                                         part="training")
    dev_data = data_utils.load_dataset(FLAGS.data_dir,
                                       FLAGS.dataset,
                                       FLAGS.vocab_size,
                                       FLAGS.max_nsteps,
                                       part="validation")
    print(" [+] Finish loading. Train set: %d, Dev set: %d" %
          (len(train_data), len(dev_data)))

    # model.train(train_data, dev_data, nb_epoch=FLAGS.epoch,
    #             batch_size=FLAGS.batch_size, model_dir=FLAGS.model_dir)
    model.batch_train(train_data,
                      dev_data,
                      nb_epoch=FLAGS.epoch,
                      batch_size=FLAGS.batch_size,
                      model_dir=FLAGS.model_dir,
                      evaluate_every=FLAGS.evaluate_every,
                      checkpoint_every=FLAGS.checkpoint_every)
def convert_h5(data_dir, label_dir, data_split, f):
    if data_split:
        train_file_paths, test_file_paths = apply_split(
            data_split, data_dir, label_dir)
    else:
        raise ValueError('Please provide the split ratio')

    print("Training dataset size: ", len(train_file_paths))
    print("Testing dataset size: ", len(test_file_paths))

    # data_train = list of 3D numpy array of training volumes
    # label_train = list of 3D numpy array of training labels
    # _ = list of header of training volumes
    print("Loading and pre-processing Training data...")
    data_train, label_train, _ = du.load_dataset(train_file_paths)
    _write_h5(data_train, label_train, f, mode="train")

    print("Loading and pre-processing Testing data...")
    data_test, label_test, _ = du.load_dataset(test_file_paths)
    _write_h5(data_test, label_test, f, mode="test")
def _svd_classification(dataset='mnist_small'):
    """
    svd on classification dataset
    Inputs:
        dataset: (str) name of dataset
    Outputs:
        accuracy on predicted values
    """
    if dataset == 'rosenbrock':
        x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
            'rosenbrock', n_train=5000, d=2)
    else:
        x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
            dataset)
    x_total = np.vstack([x_train, x_valid])
    y_total = np.vstack([y_train, y_valid])
    X = np.ones((len(x_total), len(x_total[0]) + 1))
    X[:, 1:] = x_total
    U, S, Vh = np.linalg.svd(X)

    # Invert Sigma
    sig = np.diag(S)
    filler = np.zeros([len(x_total) - len(S), len(S)])
    sig_inv = np.linalg.pinv(np.vstack([sig, filler]))

    # Compute weights
    w = Vh.T @ (sig_inv @ (U.T @ y_total))

    # Make test predictions
    X_test = np.ones((len(x_test), len(x_test[0]) + 1))
    X_test[:, 1:] = x_test
    predictions = np.argmax(X_test @ w, axis=1)
    y_test = np.argmax(1 * y_test, axis=1)
    return (predictions == y_test).sum() / len(y_test)
def _svd_regression(dataset='mauna_loa'):
    """
    svd on regression dataset
    Inputs:
        dataset: (str) name of dataset
    Outputs:
        RMSE on predicted values
    """
    if dataset == 'rosenbrock':
        x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
            'rosenbrock', n_train=5000, d=2)
    else:
        x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
            dataset)
    x_total = np.vstack([x_train, x_valid])
    y_total = np.vstack([y_train, y_valid])
    X = np.ones((len(x_total), len(x_total[0]) + 1))
    X[:, 1:] = x_total
    U, S, Vh = np.linalg.svd(X)

    # Invert Sigma
    sig = np.diag(S)
    filler = np.zeros([len(x_total) - len(S), len(S)])
    sig_inv = np.linalg.pinv(np.vstack([sig, filler]))

    # Compute weights
    w = Vh.T @ (sig_inv @ (U.T @ y_total))

    # Make test predictions
    X_test = np.ones((len(x_test), len(x_test[0]) + 1))
    X_test[:, 1:] = x_test
    predictions = X_test @ w
    return _RMSE(y_test, predictions)
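# Sanity-check sketch (not part of the original code): the SVD-based weights in
# _svd_regression and _svd_classification are the minimum-norm least-squares
# solution, so they should match numpy's built-in solvers on the same design
# matrix X and targets y.
def _least_squares_check(X, y):
    w_pinv = np.linalg.pinv(X) @ y
    w_lstsq, _, _, _ = np.linalg.lstsq(X, y, rcond=None)
    return np.allclose(w_pinv, w_lstsq)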
def predict_cross_val(dataset='mauna_loa', k=2, dist_metric='l2', v=5):
    """
    cross validation technique on knn and output predicted values
    Inputs:
        dataset: (str) name of dataset
        k: (int) number of nearest neighbours
        dist_metric: (str) 'l1' or 'l2'
        v: (int) cross validation parameter, number of cross folds
    Outputs:
        [predict_x, GroundTruth_y, predicted_y]
    """
    x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(dataset)
    x_train = np.vstack([x_valid, x_train])
    y_train = np.vstack([y_valid, y_train])
    np.random.seed(42)
    np.random.shuffle(x_train)
    np.random.seed(42)
    np.random.shuffle(y_train)
    data_partition = _partition_fold(v=v, data=x_train)
    predicted_y = np.empty((0, y_train.shape[-1]))
    for fold in range(v):
        print('------Processing Fold ' + str(fold + 1) + ' ------')
        train_x = np.delete(x_train, data_partition[fold], axis=0)
        train_y = np.delete(y_train, data_partition[fold], axis=0)
        query_x = np.take(x_train, data_partition[fold], axis=0)
        query_y = np.take(y_train, data_partition[fold], axis=0)
        curr_predict = _eval_knn([k, k + 1],
                                 train_x,
                                 train_y,
                                 query_x,
                                 query_y,
                                 dist_metric=dist_metric,
                                 compute_loss=False)
        predicted_y = np.append(predicted_y,
                                curr_predict['k=' + str(k)],
                                axis=0)
    rval = []
    for idx in range(x_train.shape[0]):
        rval.append((x_train[idx], y_train[idx], predicted_y[idx]))
    rval.sort(key=lambda tup: tup[0])
    return [i[0] for i in rval], [i[1] for i in rval], [i[2] for i in rval]
def question1a():
    x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset('iris')
    y_train, y_valid, y_test = y_train[:, (1,)], y_valid[:, (1,)], y_test[:, (1,)]
    learningRate = 0.0001
    maxIterations = 1000
    x_train, x_test = np.vstack((x_train, x_valid)), x_test
    y_train, y_test = np.vstack((y_train, y_valid)), y_test
    varianceList = [0.5, 1, 2]

    print("\nResults for question 1:\n")
    for variance in varianceList:
        margLikelihood, iterations, w, H = laplaceApproximation(
            x_train, x_test, y_train, y_test, learningRate, variance,
            maxIterations)
        print("For a variance of {}:".format(variance))
        print("Iterations = {}".format(iterations))
        print("Marginal log likelihood = {}\n".format(margLikelihood))
def load_initial_dataset():
    dataset_folder = Path("../datasets/")
    try:
        # Try to load a cached version of the dataframe
        print("Trying to load the cached dataframe...")
        df = pd.read_pickle(dataset_folder / 'cached_dataframe.pkl2')
        print("Done")
    except Exception:
        print("No cached dataframe, loading the dataset from disk")
        path_file = dataset_folder / 'Cell_Phones_and_Accessories_5.json'
        df = load_dataset(path_file)
        # Store the dataframe on disk
        print("Caching the dataframe")
        df.to_pickle(dataset_folder / 'cached_dataframe.pkl2')
    return df
def loadData(datasetName, d=2):
    '''
    Loads the dataset and normalizes the x_ sets
    INPUT:
        datasetName: a string of the name of the file to be loaded. Note that
                     this file must be in the same path as this file
    OUTPUT:
        an index list plus x_all, x_test, y_all, y_test, where the x arrays
        are normalized
    '''
    if datasetName == 'rosenbrock':
        x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
            datasetName, n_train=1000, d=d)
    else:
        x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
            datasetName)
    x_all = np.concatenate([x_train, x_valid])
    y_all = np.concatenate([y_train, y_valid])
    index_all = list(range(np.shape(x_all)[0]))
    random.shuffle(index_all)

    # Normalization of each x data
    mean = x_all.mean(axis=0, keepdims=True)
    stddev = x_all.std(axis=0, keepdims=True)
    x_all = normalization(x_all, mean, stddev)
    x_test = normalization(x_test, mean, stddev)

    return index_all, x_all, x_test, y_all, y_test
def _kd_tree(dataset='rosenbrock', dist_metric='l2', k=5, d=2):
    """
    knn using kd_tree
    Inputs:
        dataset: (str) name of dataset
        k: (int) number of nearest neighbours
        dist_metric: (str) 'l1' or 'l2'
        d: (int) data dimensionality
    Outputs:
        RMSE on predicted values
    """
    x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
        dataset, n_train=5000, d=d)
    # build the tree with the requested metric ('l2' -> euclidean, 'l1' -> manhattan)
    kdt = neighbors.KDTree(
        x_train, metric='euclidean' if dist_metric == 'l2' else 'manhattan')
    _, index = kdt.query(x_test, k=k)
    predictions = np.sum(y_train[index], axis=1) / k
    return _RMSE(y_test, predictions)
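# Usage sketch (assumes the rosenbrock loader above): kd-tree queries are fast
# for low-dimensional data but their advantage over brute-force knn shrinks as
# the dimensionality d grows, so timing _kd_tree for a few values of d
# illustrates when the tree stops helping.
import time

def _time_kd_tree(d_values=(2, 4, 8)):
    timings = {}
    for d in d_values:
        start = time.time()
        rmse = _kd_tree(dataset='rosenbrock', dist_metric='l2', k=5, d=d)
        timings[d] = (time.time() - start, rmse)
    return timings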
def _test_classification(dataset='iris', k_range=[1, 2], dist_metric='l1'):
    """
    run knn and output predicted values on classification test data
    Inputs:
        dataset: (str) name of dataset
        k_range: (list) k_range[0]: lower bound of number of nearest neighbours;
                 k_range[1]: upper bound of number of nearest neighbours
        dist_metric: (str) 'l1' or 'l2'
    Outputs:
        accuracy of predicted values referred to GroundTruth
    """
    print('------Processing Dataset ' + dataset + ' ------')
    x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(dataset)
    if y_train.dtype == np.dtype('bool'):
        y_train = _cast_TF(y_train)
        y_valid = _cast_TF(y_valid)
        y_test = _cast_TF(y_test)
    acc = []
    predicted = _eval_knn(k_range,
                          x_train,
                          y_train,
                          x_test,
                          y_test,
                          dist_metric,
                          compute_loss=False)
    for k in range(k_range[0], k_range[1]):
        curr_predict = predicted['k=' + str(k)]
        result = np.argmax(curr_predict, axis=1)
        gt = np.where(y_test == True, 1, 0)
        gt = np.argmax(gt, axis=1)
        # count predictions that match the ground truth (difference == 0)
        unique, counts = np.unique(result - gt, return_counts=True)
        correct = dict(zip(unique, counts)).get(0, 0)
        acc.append(correct / y_test.shape[0])
    return acc
def create_test_train_fold(fold_num):
    """Splits the dataset into training and held-out test set."""
    data_x, data_y, _ = data_utils.load_dataset(FLAGS.dataset_name)
    tf.logging.info('Dataset: %s, Size: %d', FLAGS.dataset_name,
                    data_x.shape[0])
    tf.logging.info('Cross-val fold: %d/%d', FLAGS.fold_num, _N_FOLDS)
    # Get the training and test set based on the StratifiedKFold split
    (x_train_all, y_train_all), test_dataset = data_utils.get_train_test_fold(
        data_x,
        data_y,
        fold_num=fold_num,
        num_folds=_N_FOLDS,
        stratified=not FLAGS.regression)
    data_gen = data_utils.split_training_dataset(
        x_train_all,
        y_train_all,
        FLAGS.num_splits,
        stratified=not FLAGS.regression)
    return data_gen, test_dataset
def run_Q1a(dataset='iris', lr=0.001):
    x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(dataset)
    y_train, y_valid, y_test = y_train[:, (1, )], y_valid[:, (1, )], y_test[:, (1, )]
    x_train, x_test = np.vstack((x_train, x_valid)), x_test
    y_train, y_test = np.vstack((y_train, y_valid)), y_test
    var_list = [0.5, 1, 2]
    X_train = _generate_X(x_train)
    X_test = _generate_X(x_test)
    marginal_likelihoods, rval_w = {}, None
    for variance in var_list:
        # gradient ascent on the log posterior until the gradient is small
        w = np.zeros(np.shape(X_train[0]))
        x_prod = np.reshape(X_train @ w, np.shape(y_train))
        posterior_grad = _likelihood_grad(X_train, x_prod,
                                          y_train) + _prior_grad(w, variance)
        while max(posterior_grad) >= 10**(-2):
            x_prod = X_train @ w
            posterior_grad = _likelihood_grad(
                X_train, x_prod, y_train) + _prior_grad(w, variance)
            w = w + (lr * posterior_grad)

        # Laplace approximation to the marginal likelihood at the MAP estimate
        hessian = _likelihood_hess(X_train, x_prod) + _prior_hess(w, variance)
        marginal_likelihoods[variance] = _log_likelihood(
            x_prod, y_train) + _log_prior(w, variance) - _log_g(hessian)
        if variance == 1:
            rval_w = w
    print(marginal_likelihoods)
    print(rval_w)
    return marginal_likelihoods, rval_w
def train(
    self,
    sess,
    vocab_size,
    epoch=25,
    learning_rate=0.0002,
    momentum=0.9,
    decay=0.95,
    data_dir="data",
    dataset_name="cnn",
):
    self.prepare_model(data_dir, dataset_name, vocab_size)

    start = time.clock()
    print(" [*] Calculating gradient and loss...")
    self.optim = tf.train.AdamOptimizer(learning_rate, 0.9).minimize(self.loss)
    print(" [*] Calculating gradient and loss finished. Took %.2fs" %
          (time.clock() - start))

    # Could not use RMSPropOptimizer because the sparse update of RMSPropOptimizer
    # is not implemented yet (2016.01.24).
    # self.optim = tf.train.RMSPropOptimizer(learning_rate,
    #                                        decay=decay,
    #                                        momentum=momentum).minimize(self.loss)

    sess.run(tf.initialize_all_variables())

    if self.load(sess, self.checkpoint_dir, dataset_name):
        print(" [*] Deep LSTM checkpoint is loaded.")
    else:
        print(" [*] There is no checkpoint for this model.")

    y = np.zeros([self.batch_size, self.vocab_size])

    merged = tf.merge_all_summaries()
    writer = tf.train.SummaryWriter("/tmp/deep", sess.graph_def)

    counter = 0
    start_time = time.time()
    for epoch_idx in xrange(epoch):
        data_loader = load_dataset(data_dir, dataset_name, vocab_size)

        batch_stop = False
        while True:
            y.fill(0)
            inputs, nstarts, answers = [], [], []
            batch_idx = 0
            while True:
                try:
                    (_, document, question, answer, _), data_idx, \
                        data_max_idx = data_loader.next()
                except StopIteration:
                    batch_stop = True
                    break

                # [0] means splitter between d and q
                data = [int(d) for d in document.split()] + [0] + \
                       [int(q) for q in question.split()]

                if len(data) > self.max_nsteps:
                    continue

                inputs.append(data)
                nstarts.append(len(inputs[-1]) - 1)
                y[batch_idx][int(answer)] = 1

                batch_idx += 1
                if batch_idx == self.batch_size:
                    break
            if batch_stop:
                break

            FORCE = False
            if FORCE:
                inputs = array_pad(inputs, self.max_nsteps, pad=-1, force=FORCE)
                nstarts = np.where(inputs == -1)[1]
                inputs[inputs == -1] = 0
            else:
                inputs = array_pad(inputs, self.max_nsteps, pad=0)
            nstarts = [[nstart, idx, 0] for idx, nstart in enumerate(nstarts)]

            _, summary_str, cost, accuracy = sess.run(
                [self.optim, merged, self.loss, self.accuracy],
                feed_dict={
                    self.inputs: inputs,
                    self.nstarts: nstarts,
                    self.y: y
                })
            if counter % 10 == 0:
                writer.add_summary(summary_str, counter)
                print(
                    "Epoch: [%2d] [%4d/%4d] time: %4.4f, loss: %.8f, accuracy: %.8f"
                    % (epoch_idx, data_idx, data_max_idx,
                       time.time() - start_time, np.mean(cost), accuracy))
            counter += 1
    self.save(sess, self.checkpoint_dir, dataset_name)