Example #1
def main():
    train_file = os.path.join('../data', 'train.jsonl')
    df = data.get_train(train_file)
    #phase_1_evaluate_small_random_samples(df)
    train_set, _ = train_test_split(
        df, train_size=5,
        random_state=seed)  # need to look into NEI ratios and balance
    phase_1(train_set)
Example #2
def main():
    np.random.seed(0)
    inp, labels = get_train(True)
    N = len(labels)
    train_inp, train_labels = inp[:N * 4 // 5], labels[:N * 4 // 5]
    test_inp, test_labels = inp[N * 4 // 5:N], labels[N * 4 // 5:N]
    weights = initialize([2500, 1])
    # train_labels = np.zeros((2000))
    # weights = [np.ones((3,3)), np.ones((3,1))]
    # train_inp = np.ones((2,1,3))
    losses = []
    for i in range(10):
        out, train_pred = feed_forward(weights, train_inp)
        losses.append(get_error(train_labels, train_pred))
        weights = train_logreg(weights, out, train_labels, train_pred)
    print('***TRAIN***')
    evaluate(train_labels, train_pred)
    print('***TEST***')
    _, test_pred = feed_forward(weights, test_inp)
    evaluate(test_labels, test_pred)
    plot(losses, '0 hidden layers')
Example #3
def main():
    np.random.seed(0) # 0 42 50
    inp, labels = get_train(True)
    N = len(labels)
    train_inp, train_labels = inp[:N*4//5], labels[:N*4//5]
    test_inp, test_labels = inp[N*4//5:N], labels[N*4//5:N]
    for n_units in [30, 40, 50]:
        weights = initialize([2500, n_units, 1])
        # train_labels = np.zeros((2000))
        # weights = [np.ones((3,3)), np.ones((3,1))]
        # train_inp = np.ones((2,1,3))
        losses = []
        for i in range(150):
            out, train_pred = feed_forward(weights, train_inp)
            losses.append(get_error(train_labels, train_pred))
            weights = backpropagate(weights, out, train_labels, train_pred)
        print('***TRAIN***')
        evaluate(train_labels, train_pred)
        print('***TEST***')
        _, test_pred = feed_forward(weights, test_inp)
        evaluate(test_labels, test_pred)
        plot(losses, f'Hidden Units = {n_units}')
Example #4
metadata_path_all = glob.glob(sys.argv[1] + "*")

print("shape of metadata_path_all")
print(len(metadata_path_all))

if len(sys.argv) >= 3:
    subset = sys.argv[2]
    assert subset in ['train', 'valid', 'test', 'train_valid']
else:
    subset = 'test'

if subset == "test":
    X, mask, _, num_seq = data.get_test()
elif subset == "train":
    X_train, _, _, _, mask_train, _, num_seq = data.get_train()
elif subset == "train_valid":
    X_train, X_valid, _, _, mask_train, mask_valid, num_seq = data.get_train()
    X = np.concatenate((X_train[:-30], X_valid))
    mask = np.concatenate((mask_train[:-30], mask_valid))
else:
    _, X, _, _, _, mask, num_seq = data.get_train()

for metadata_path in metadata_path_all:

    print("Loading metadata file %s" % metadata_path)

    metadata = np.load(metadata_path)

    config_name = metadata['config_name']
Example #5
def main():
    sym_y = T.imatrix('target_output')
    sym_mask = T.matrix('mask')
    sym_x = T.tensor3()

    TOL = 1e-5
    num_epochs = config.epochs
    batch_size = config.batch_size

    #### DATA ####
    #    print "@@@@TESTING@@@@"
    #    l_in = nn.layers.InputLayer(shape=(None, 700, 42))
    #    l_dim_a = nn.layers.DimshuffleLayer(
    #        l_in, (0,2,1))
    #    l_conv_a = nn.layers.Conv1DLayer(
    #        incoming=l_dim_a, num_filters=42, border_mode='same',
    #        filter_size=3, stride=1, nonlinearity=nn.nonlinearities.rectify)
    #    l_dim_b = nn.layers.DimshuffleLayer(
    #        l_conv_a, (0,2,1))
    #    out = nn.layers.get_output(l_dim_b, sym_x)
    #    testvar = np.ones((128, 700, 42)).astype('float32')
    #    print "@@@@EVAL@@@@"
    #    john = out.eval({sym_x: testvar})
    #    print("Johns shape")
    #    print(john.shape)

    print("Building network ...")
    ##########################DEBUG##########################
    l_in, l_out = config.build_model()

    ##########################DEBUG##########################
    all_layers = nn.layers.get_all_layers(l_out)
    num_params = nn.layers.count_params(l_out)
    print("  number of parameters: %d" % num_params)
    print("  layer output shapes:")
    for layer in all_layers:
        name = layer.__class__.__name__
        print("    %s %s" % (name, nn.layers.get_output_shape(layer)))
    print("Creating cost function")
    # lasagne.layers.get_output produces a variable for the output of the net
    out_train = nn.layers.get_output(
        l_out, sym_x, deterministic=False)

    #    testvar = np.ones((128, 700, 42)).astype('float32')
    #    john = out_train.eval({sym_x: testvar})
    #    print("@@@@@JOHN@@@@@")
    #    print(john.shape)
    #    print(john.reshape((-1, num_classes)).shape)

    print("Creating eval function")
    out_eval = nn.layers.get_output(
        l_out, sym_x, deterministic=True)

    probs_flat = out_train.reshape((-1, num_classes))

    lambda_reg = config.lambda_reg
    all_params = nn.layers.get_all_params(l_out)
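    # Note: the loop below appears to zero out one half of every 3-D weight
    # tensor along its last (filter-width) axis and to keep only the other
    # half as the parameter slice, with the half selected by the module-level
    # `side` flag.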

    for i, p in enumerate(all_params):
        if p.ndim == 3:
            values = p.get_value()
            if side == 'right':
                values[..., int(values.shape[2] / 2.0 - 0.5):] = 0
                p.set_value(values)
                all_params[i] = p[..., : int(values.shape[2] / 2.0 - 0.5)]
            else:
                values[..., : int(values.shape[2] / 2.0 + 0.5)] = 0
                p.set_value(values)
                all_params[i] = p[..., int(values.shape[2] / 2.0 + 0.5):]

    params = [el for el in all_params if el.name == "W" or el.name == "gamma"]

    reg_term = sum(T.sum(p ** 2) for p in params)
    cost = T.nnet.categorical_crossentropy(T.clip(probs_flat, TOL, 1 - TOL), sym_y.flatten())
    cost = T.sum(cost * sym_mask.flatten()) / T.sum(sym_mask) + lambda_reg * reg_term
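    # i.e. the cross-entropy averaged over unmasked positions, plus an L2
    # penalty on the selected W/gamma parameters.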

    # Retrieve all parameters from the network
    all_params = [el for el in all_params if el.name == "W" or el.name == "gamma" or el.name == "beta"]

    # Setting the weights
    if hasattr(config, 'set_weights'):
        nn.layers.set_all_param_values(l_out, config.set_weights())
    # Compute SGD updates for training
    print("Computing updates ...")
    if hasattr(config, 'learning_rate_schedule'):
        learning_rate_schedule = config.learning_rate_schedule  # Import learning rate schedule
    else:
        learning_rate_schedule = {0: config.learning_rate}
    learning_rate = theano.shared(np.float32(learning_rate_schedule[0]))

    all_grads = T.grad(cost, all_params)

    cut_norm = config.cut_grad
    updates, norm_calc = nn.updates.total_norm_constraint(all_grads, max_norm=cut_norm, return_norm=True)

    if optimizer == "rmsprop":
        updates = nn.updates.rmsprop(updates, all_params, learning_rate)
    elif optimizer == "adadelta":
        updates = nn.updates.adadelta(updates, all_params, learning_rate)
    elif optimizer == "adagrad":
        updates = nn.updates.adagrad(updates, all_params, learning_rate)
    elif optimizer == "nag":
        momentum_schedule = config.momentum_schedule
        momentum = theano.shared(np.float32(momentum_schedule[0]))
        updates = nn.updates.nesterov_momentum(updates, all_params, learning_rate, momentum)
    else:
        sys.exit("please choose either <rmsprop/adagrad/adadelta/nag> in configfile")

    # Theano functions for training and computing cost
    print ("config.batch_size %d" % batch_size)
    print ("data.num_classes %d" % num_classes)
    if hasattr(config, 'build_model'):
        print("has build model")
    print("Compiling train ...")
    # Use this for training (see deterministic = False above)
    train = theano.function(
        [sym_x, sym_y, sym_mask], [cost, out_train, norm_calc], updates=updates)

    print("Compiling eval ...")
    # use this for eval (deterministic = True + no updates)
    eval = theano.function([sym_x, sym_y, sym_mask], [cost, out_eval])

    # Start timers
    start_time = time.time()
    prev_time = start_time

    all_losses_train = []
    all_accuracy_train = []
    all_losses_eval_train = []
    all_losses_eval_valid = []
    all_losses_eval_test = []
    all_accuracy_eval_train = []
    all_accuracy_eval_valid = []
    all_accuracy_eval_test = []
    all_mean_norm = []

    import data
    X_train, X_valid, y_train, y_valid, mask_train, mask_valid, num_seq_train \
        = data.get_train()
    X_train, X_valid = X_train[..., 21:], X_valid[..., 21:]  # Only train with pssm scores

    print("y shape")
    print(y_valid.shape)
    print("X shape")
    print(X_valid.shape)
    # Start training
    for i in range(y_train.shape[0]):
        for j in range(y_train.shape[1]):
            if y_train[i][j] == 5:
                y_train[i][j] = 1
            else:
                y_train[i][j] = 0

    for i in range(y_valid.shape[0]):
        for j in range(y_valid.shape[1]):
            if y_valid[i][j] == 5:
                y_valid[i][j] = 1
            else:
                y_valid[i][j] = 0

    for epoch in range(num_epochs):

        if (epoch % 10) == 0:
            print ("Epoch %d of %d" % (epoch + 1, num_epochs))

        if epoch in learning_rate_schedule:
            lr = np.float32(learning_rate_schedule[epoch])
            print ("  setting learning rate to %.7f" % lr)
            learning_rate.set_value(lr)
        if optimizer == "nag":
            if epoch in momentum_schedule:
                mu = np.float32(momentum_schedule[epoch])
                print ("  setting learning rate to %.7f" % mu)
                momentum.set_value(mu)
        #        print "Shuffling data"
        seq_names = np.arange(0, num_seq_train)
        np.random.shuffle(seq_names)
        X_train = X_train[seq_names]
        y_train = y_train[seq_names]
        mask_train = mask_train[seq_names]

        num_batches = num_seq_train // batch_size
        losses = []
        preds = []
        norms = []
        for i in range(num_batches):
            idx = range(i * batch_size, (i + 1) * batch_size)
            x_batch = X_train[idx]
            y_batch = y_train[idx]
            mask_batch = mask_train[idx]
            loss, out, batch_norm = train(x_batch, y_batch, mask_batch)
            #            print(batch_norm)
            norms.append(batch_norm)
            preds.append(out)
            losses.append(loss)

        #            if ((i+1) % config.write_every_batch == 0) | (i == 0):
        #                if i == 0:
        #                    start_place = 0
        #                else:
        #                    start_place = i-config.write_every_batch
        #                print "Batch %d of %d" % (i + 1, num_batches)
        #                print "  curbatch training loss: %.5f" % np.mean(losses[start_place:(i+1)])
        #                print "  curbatch training acc: %.5f" % np.mean(accuracy[start_place:(i+1)])
        predictions = np.concatenate(preds, axis=0)
        loss_train = np.mean(losses)
        all_losses_train.append(loss_train)

        acc_train = utils.proteins_acc(predictions, y_train[0:num_batches * batch_size],
                                       mask_train[0:num_batches * batch_size])
        all_accuracy_train.append(acc_train)

        mean_norm = np.mean(norms)
        all_mean_norm.append(mean_norm)

        if 1 == 1:
            print ("  average training loss: %.5f" % loss_train)
            print ("  average training accuracy: %.5f" % acc_train)
            print ("  average norm: %.5f" % mean_norm)

            sets = [  # ('train', X_train, y_train, mask_train, all_losses_eval_train, all_accuracy_eval_train),
                ('valid', X_valid, y_valid, mask_valid, all_losses_eval_valid, all_accuracy_eval_valid)]
            for subset, X, y, mask, all_losses, all_accuracy in sets:
                print ("  validating: %s loss" % subset)
                preds = []
                losses = []  # reset so the evaluation loss is not mixed with training losses
                num_batches = np.size(X, axis=0) // config.batch_size
                for i in range(num_batches):  ## +1 to get the "rest"
                    #                    print(i)
                    idx = range(i * batch_size, (i + 1) * batch_size)
                    x_batch = X[idx]
                    y_batch = y[idx]
                    mask_batch = mask[idx]
                    loss, out = eval(x_batch, y_batch, mask_batch)
                    preds.append(out)
                    #                    acc = utils.proteins_acc(out, y_batch, mask_batch)
                    losses.append(loss)
                #                    accuracy.append(acc)
                predictions = np.concatenate(preds, axis=0)
                #                print "  pred"
                #                print(predictions.shape)
                #                print(predictions.dtype)
                loss_eval = np.mean(losses)
                all_losses.append(loss_eval)

                #                acc_eval = np.mean(accuracy)
                acc_eval = utils.proteins_acc(predictions, y, mask)
                all_accuracy.append(acc_eval)

                print ("  average evaluation loss (%s): %.5f" % (subset, loss_eval))
                print ("  average evaluation accuracy (%s): %.5f" % (subset, acc_eval))

        now = time.time()
        time_since_start = now - start_time
        time_since_prev = now - prev_time
        prev_time = now
        est_time_left = time_since_prev * (num_epochs - epoch)
        eta = datetime.now() + timedelta(seconds=est_time_left)
        eta_str = eta.strftime("%c")
        print ("  %s since start (%.2f s)" % (utils.hms(time_since_start), time_since_prev))
        print ("  estimated %s to go (ETA: %s)" % (utils.hms(est_time_left), eta_str))
        print()

        if (epoch >= config.start_saving_at) and ((epoch % config.save_every) == 0):
            print ("  saving parameters and metadata")
            with open((metadata_path + side + "-%d" % (epoch) + ".pkl"), 'wb') as f:
                pickle.dump({
                    'config_name': config_name,
                    'param_values': nn.layers.get_all_param_values(l_out),
                    'losses_train': all_losses_train,
                    'accuracy_train': all_accuracy_train,
                    'losses_eval_train': all_losses_eval_train,
                    'losses_eval_valid': all_losses_eval_valid,
                    'losses_eval_test': all_losses_eval_test,
                    'accuracy_eval_valid': all_accuracy_eval_valid,
                    'accuracy_eval_train': all_accuracy_eval_train,
                    'accuracy_eval_test': all_accuracy_eval_test,
                    'mean_norm': all_mean_norm,
                    'time_since_start': time_since_start,
                    'i': i,
                }, f, pickle.HIGHEST_PROTOCOL)

            print ("  stored in %s" % metadata_path)

    print()
Example #6
# stack time-shifted copies of the stimuli column-wise (FIR-style delays,
# presumably to account for the hemodynamic lag)
deltrnstim = np.hstack([np.roll(trnstim, d, 0) for d in delays])
delvalstim = np.hstack([np.roll(valstim, d, 0) for d in delays])

#sdeltrnstim = scipy.sparse.csr_matrix(deltrnstim)
#sdelvalstim = scipy.sparse.csr_matrix(delvalstim)

# z-score each column (zero mean, unit variance per feature)
zs = lambda m: (m - m.mean(0)) / m.std(0)

sdeltrnstim = deltrnstim = np.nan_to_num(zs(deltrnstim))
sdelvalstim = delvalstim = np.nan_to_num(zs(delvalstim))

# Select some voxels
ebamask = cortex.get_roi_mask("MLfs", "20121210ML_auto1", roi="EBA")["EBA"] > 0

# Load training, test fMRI data
trndata = data.get_train(masked=ebamask)[:numtime]
valdata = data.get_val(masked=ebamask)

from ridge import _RidgeGridCV

ridge = _RidgeGridCV(alpha_min=1., alpha_max=1000., n_grid_points=5,
                     n_grid_refinements=2, cv=2)

ridge_coefs = ridge.fit(deltrnstim, trndata).coef_.T
Uridge, sridge, VridgeT = np.linalg.svd(ridge_coefs, full_matrices=False)
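# The SVD above factors the ridge coefficients as U @ diag(s) @ Vt; a rank-k
# approximation (for the values in `ranks` below) could be rebuilt as
# Uridge[:, :k] @ np.diag(sridge[:k]) @ VridgeT[:k].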

ranks = [1, 2, 5, 10]

results = []
corr_scores = []
r2_scores = []
Example #7
        batch,
        NUM_SENTS,
        RETRIEVER,
        SELECTOR,
        oracle_doc_ret=isinstance(RETRIEVER, data.OracleDocRetriever),
    )


if __name__ == "__main__":
    device = "cuda" if torch.cuda.is_available() else "cpu"

    ###########################################################################
    #                        Setup the datasets/loaders                       #
    ###########################################################################

    train = data.get_train(TRAIN_PATH)
    train, test = train_test_split(train)

    torch.cuda.empty_cache()

    train_dataset = data.FastDataset(train)
    test_dataset = data.TestDataset(test)

    train_loader = DataLoader(
        train_dataset,
        batch_size=64,
        shuffle=True,
        collate_fn=prepare,
        num_workers=0,  # doesn't work with more than 1
    )
    test_loader = DataLoader(
Example #8
def main():
    sym_y = T.imatrix('target_output')
    sym_mask = T.matrix('mask')
    sym_x = T.tensor3()

    TOL = 1e-5
    num_epochs = config.epochs
    batch_size = config.batch_size

#### DATA ####
#    print "@@@@TESTING@@@@"
#    l_in = nn.layers.InputLayer(shape=(None, 700, 42))
#    l_dim_a = nn.layers.DimshuffleLayer(
#        l_in, (0,2,1))
#    l_conv_a = nn.layers.Conv1DLayer(
#        incoming=l_dim_a, num_filters=42, border_mode='same',
#        filter_size=3, stride=1, nonlinearity=nn.nonlinearities.rectify)
#    l_dim_b = nn.layers.DimshuffleLayer(
#        l_conv_a, (0,2,1))
#    out = nn.layers.get_output(l_dim_b, sym_x)
#    testvar = np.ones((128, 700, 42)).astype('float32')
#    print "@@@@EVAL@@@@"
#    john = out.eval({sym_x: testvar})
#    print("Johns shape")
#    print(john.shape)


    print("Building network ...")
    ##########################DEBUG##########################
    l_in, l_out = config.build_model()
    
    ##########################DEBUG##########################
    all_layers = nn.layers.get_all_layers(l_out)
    num_params = nn.layers.count_params(l_out)
    print("  number of parameters: %d" % num_params)
    print("  layer output shapes:")
    for layer in all_layers:
        name = string.ljust(layer.__class__.__name__, 32)
        print("    %s %s" % (name, nn.layers.get_output_shape(layer)))
    print("Creating cost function")
    # lasagne.layers.get_output produces a variable for the output of the net
    out_train = nn.layers.get_output(
        l_out, sym_x, deterministic=False)

#    testvar = np.ones((128, 700, 42)).astype('float32')
#    john = out_train.eval({sym_x: testvar})
#    print("@@@@@JOHN@@@@@")
#    print(john.shape)
#    print(john.reshape((-1, num_classes)).shape)

    print("Creating eval function")
    out_eval = nn.layers.get_output(
        l_out, sym_x, deterministic=True)

    probs_flat = out_train.reshape((-1, num_classes))

    lambda_reg = config.lambda_reg
    params = nn.layers.get_all_params(l_out, regularizable=True)
    reg_term = sum(T.sum(p**2) for p in params)
    cost = T.nnet.categorical_crossentropy(T.clip(probs_flat, TOL, 1-TOL), sym_y.flatten())
    cost = T.sum(cost*sym_mask.flatten()) / T.sum(sym_mask) + lambda_reg * reg_term

    # Retrieve all parameters from the network
    all_params = nn.layers.get_all_params(l_out, trainable=True)
    # Setting the weights
    if hasattr(config, 'set_weights'):
        nn.layers.set_all_param_values(l_out, config.set_weights())
    # Compute SGD updates for training
    print("Computing updates ...")
    if hasattr(config, 'learning_rate_schedule'):
        learning_rate_schedule = config.learning_rate_schedule              # Import learning rate schedule
    else:
        learning_rate_schedule = { 0: config.learning_rate }
    learning_rate = theano.shared(np.float32(learning_rate_schedule[0]))

    all_grads = T.grad(cost, all_params)

    cut_norm = config.cut_grad
    updates, norm_calc = nn.updates.total_norm_constraint(all_grads, max_norm=cut_norm, return_norm=True)

    if optimizer == "rmsprop":
        updates = nn.updates.rmsprop(updates, all_params, learning_rate)
    elif optimizer == "adadelta":
        updates = nn.updates.adadelta(updates, all_params, learning_rate)
    elif optimizer == "adagrad":
        updates = nn.updates.adagrad(updates, all_params, learning_rate)
    elif optimizer == "nag":
        momentum_schedule = config.momentum_schedule
        momentum = theano.shared(np.float32(momentum_schedule[0]))
        updates = nn.updates.nesterov_momentum(updates, all_params, learning_rate, momentum)
    else:
        sys.exit("please choose either <rmsprop/adagrad/adadelta/nag> in configfile")
            
    # Theano functions for training and computing cost
    print "config.batch_size %d" %batch_size
    print "data.num_classes %d" %num_classes
    if hasattr(config, 'build_model'):
        print("has build model")
    print("Compiling train ...")
    # Use this for training (see deterministic = False above)
    train = theano.function(
        [sym_x, sym_y, sym_mask], [cost, out_train, norm_calc], updates=updates)

    print("Compiling eval ...")
    # use this for eval (deterministic = True + no updates)
    eval = theano.function([sym_x, sym_y, sym_mask], [cost, out_eval])

    # Start timers
    start_time = time.time()
    prev_time = start_time

    all_losses_train = []
    all_accuracy_train = []
    all_losses_eval_train = []
    all_losses_eval_valid = []
    all_losses_eval_test = []
    all_accuracy_eval_train = []
    all_accuracy_eval_valid = []
    all_accuracy_eval_test = []
    all_mean_norm = []


    import data
    X_train, X_valid, y_train, y_valid, mask_train, mask_valid, num_seq_train \
        = data.get_train()
    print("y shape")
    print(y_valid.shape)
    print("X shape")
    print(X_valid.shape)
    # Start training

    for epoch in range(num_epochs):

        if (epoch % 10) == 0:
            print "Epoch %d of %d" % (epoch + 1, num_epochs)

        if epoch in learning_rate_schedule:
            lr = np.float32(learning_rate_schedule[epoch])
            print "  setting learning rate to %.7f" % lr
            learning_rate.set_value(lr)
        if optimizer == "nag":
            if epoch in momentum_schedule:
                mu = np.float32(momentum_schedule[epoch])
                print "  setting learning rate to %.7f" % mu
                momentum.set_value(mu)
#        print "Shuffling data"
        seq_names = np.arange(0,num_seq_train)
        np.random.shuffle(seq_names)     
        X_train = X_train[seq_names]
        y_train = y_train[seq_names]
        mask_train = mask_train[seq_names]

        num_batches = num_seq_train // batch_size
        losses = []
        preds = []
        norms = []
        for i in range(num_batches):
            idx = range(i*batch_size, (i+1)*batch_size)
            x_batch = X_train[idx]
            y_batch = y_train[idx]
            mask_batch = mask_train[idx]
            loss, out, batch_norm = train(x_batch, y_batch, mask_batch)
#            print(batch_norm)
            norms.append(batch_norm)
            preds.append(out)
            losses.append(loss)

#            if ((i+1) % config.write_every_batch == 0) | (i == 0):
#                if i == 0:
#                    start_place = 0
#                else:
#                    start_place = i-config.write_every_batch
#                print "Batch %d of %d" % (i + 1, num_batches)
#                print "  curbatch training loss: %.5f" % np.mean(losses[start_place:(i+1)])
#                print "  curbatch training acc: %.5f" % np.mean(accuracy[start_place:(i+1)])
        predictions = np.concatenate(preds, axis = 0)
        loss_train = np.mean(losses)
        all_losses_train.append(loss_train)

        acc_train = utils.proteins_acc(predictions, y_train[0:num_batches*batch_size], mask_train[0:num_batches*batch_size])
        all_accuracy_train.append(acc_train)

        mean_norm = np.mean(norms)
        all_mean_norm.append(mean_norm)

        if 1==1:
            print "  average training loss: %.5f" % loss_train
            print "  average training accuracy: %.5f" % acc_train
            print "  average norm: %.5f" % mean_norm

            sets = [#('train', X_train, y_train, mask_train, all_losses_eval_train, all_accuracy_eval_train),
                    ('valid', X_valid, y_valid, mask_valid, all_losses_eval_valid, all_accuracy_eval_valid)]
            for subset, X, y, mask, all_losses, all_accuracy in sets:
                print "  validating: %s loss" % subset
                preds = []
                losses = []  # reset so the evaluation loss is not mixed with training losses
                num_batches = np.size(X,axis=0) // config.batch_size
                for i in range(num_batches): ## +1 to get the "rest"
#                    print(i)
                    idx = range(i*batch_size, (i+1)*batch_size)
                    x_batch = X[idx]
                    y_batch = y[idx]
                    mask_batch = mask[idx]
                    loss, out = eval(x_batch, y_batch, mask_batch)
                    preds.append(out)
#                    acc = utils.proteins_acc(out, y_batch, mask_batch)
                    losses.append(loss)
#                    accuracy.append(acc)
                predictions = np.concatenate(preds, axis = 0)
#                print "  pred"
#                print(predictions.shape)
#                print(predictions.dtype)
                loss_eval = np.mean(losses)
                all_losses.append(loss_eval)

#                acc_eval = np.mean(accuracy)
                acc_eval = utils.proteins_acc(predictions, y, mask)
                all_accuracy.append(acc_eval)

                print "  average evaluation loss (%s): %.5f" % (subset, loss_eval)
                print "  average evaluation accuracy (%s): %.5f" % (subset, acc_eval)

        now = time.time()
        time_since_start = now - start_time
        time_since_prev = now - prev_time
        prev_time = now
        est_time_left = time_since_prev * (num_epochs - epoch)
        eta = datetime.now() + timedelta(seconds=est_time_left)
        eta_str = eta.strftime("%c")
        print "  %s since start (%.2f s)" % (utils.hms(time_since_start), time_since_prev)
        print "  estimated %s to go (ETA: %s)" % (utils.hms(est_time_left), eta_str)
        print

        if (epoch >= config.start_saving_at) and ((epoch % config.save_every) == 0):
            print "  saving parameters and metadata"
            with open((metadata_path + "-%d" % (epoch) + ".pkl"), 'wb') as f:
                pickle.dump({
                        'config_name': config_name,
                        'param_values': nn.layers.get_all_param_values(l_out),
                        'losses_train': all_losses_train,
                        'accuracy_train': all_accuracy_train,
                        'losses_eval_train': all_losses_eval_train,
                        'losses_eval_valid': all_losses_eval_valid,
                        'losses_eval_test': all_losses_eval_test,
                        'accuracy_eval_valid': all_accuracy_eval_valid,
                        'accuracy_eval_train': all_accuracy_eval_train,
                        'accuracy_eval_test': all_accuracy_eval_test,
                        'mean_norm': all_mean_norm,
                        'time_since_start': time_since_start,
                        'i': i,
                    }, f, pickle.HIGHEST_PROTOCOL)

            print "  stored in %s" % metadata_path

    print
Example #9
# -*- coding: utf-8 -*-


import numpy as np
import data


#%% Simple Mean-based Model

# define the logmean    
def logmean(x):
    return np.exp(np.mean(np.log(x+1)))-1
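# Illustrative check (not part of the original script): logmean is the
# geometric mean of (x + 1) minus 1, e.g. logmean(np.array([0, 1, 3]))
# = (1 * 2 * 4) ** (1 / 3) - 1 ~= 1.0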

# load the training data
print('load training data')
df_train = data.get_train(nrows = 10000)

# compute the means for different configurations
print('compute means')
mean_tab = df_train.groupby('ProductId').agg({'AdjDemand': logmean})
mean_tab2 = df_train.groupby(['ProductId', 'ClientId']).agg({'AdjDemand': logmean})
global_mean = logmean(df_train['AdjDemand'])


# generate estimation for each ProductID-ClientID-pair
def estimate(key):
    key = tuple(key) # key needs to be a tuple
    try:
        est = mean_tab2.at[key,'AdjDemand']
    except KeyError:
        try :
Example #10
def main():
    #unzip raw_content file
    os.system("unzip zipRawcontent; mkdir data; mv raw_content data/")
    os.system("cp trainfile data/train.tsv")
    os.system("cp testfile data/test.tsv")
    os.system("mkdir ../generated ")
 
    data = get_train() + get_test()

    f = file('extracted_text', 'w')

    for i, item in enumerate(data):
        # status update
        if (i % 500) == 0:
            print i, datetime.datetime.now().time()

        #  parse file
        data = {}
        soup = boil_soup(item['urlid'])

        # given boilerplate
        data['boilerplate'] = [item['title'], item['body']]
       

        # extract text
        extractor = Extractor(extractor='ArticleExtractor', html=unicode(soup))
        data['boilerpipe'] = [extractor.getText()]

        # remove non-text tags
        for tag in ['script', 'style']:
            for el in soup.find_all(tag):
                el.extract()

        # extract text for each tag
        for tag in TAGS:
            items = []
            for el in soup.find_all(tag):
                el.extract()

                if tag == 'img':
                    try:
                        items.append(el['alt'])
                    except KeyError:
                        pass
                    try:
                        items.append(el['title'])
                    except KeyError:
                        pass
                else:
                    items.append(el.text)

            data[tag] = items

        # extract meta tags
        meta = soup.find_all('meta')
        for el in meta:
            prop = el.get('property') if el.get('property') else el.get('name')
            if not prop:
                continue
            prop = prop.lower()
            try:
                s = unicode(el['content'])
            except:
                continue

            data['meta-'+prop] = s.split(u',') if prop == 'keywords' else [s]

        # preprocess string
        for item in data:
            data[item] = map(clean_string, data[item])
            data[item] = filter(None, data[item])

        print >>f, json.dumps(data)

    f.close()
Example #11
from multi_task_ridge import _multi_target_ridge as multi_task_ridge
beta_old = betas_indep
# beta_old = np.load("/auto/k8/meickenberg/cache/thresh_-1.00_0.80_mt_ridge_with_corr_gamma_300.00.npz")['beta']

import os
cachedir = os.environ["DEFAULT_CACHE_DIR"]

from delayed import make_delayed
from ridge import _multi_corr_score

X_train_raw = data.get_wordnet(mode="train")
X_train = make_delayed(X_train_raw, [2, 3, 4])
X_val_raw = data.get_wordnet(mode="val")
X_val = make_delayed(X_val_raw, [2, 3, 4])

Y_train = data.get_train()
Y_val = data.get_val()

print "Starting loop"
import time

for gamma in [100, 500, 1000, 5000, 10000]:
    t = time.time()
    print "evaluating gamma=%f" % gamma
    beta_new = multi_task_ridge(X_train, Y_train, 
                            M=M_matrix, gamma=gamma,
                            A=A, alpha=1.,
                            warmstart=beta_old,
                            maxiter=61)

    y_pred_new = X_val.dot(beta_new)
Example #12
"""

import numpy as np
import data

#%% Simple Mean-based Model


# define the logmean
def logmean(x):
    return np.exp(np.mean(np.log(x + 1))) - 1


# load the training data
print('load training data')
df_train = data.get_train(nrows=10000)

# compute the means for different configurations
print('compute means')
mean_tab = df_train.groupby('ProductId').agg({'AdjDemand': logmean})
mean_tab2 = df_train.groupby(['ProductId',
                              'ClientId']).agg({'AdjDemand': logmean})
global_mean = logmean(df_train['AdjDemand'])


# generate estimation for each ProductID-ClientID-pair
def estimate(key):
    key = tuple(key)  # key needs to be a tuple
    try:
        est = mean_tab2.at[key, 'AdjDemand']
    except KeyError:
Example #13
numtime = 1000


# load stimuli
trnstim = data.get_wordnet("train")
valstim = data.get_wordnet("val")[90:]

delays = [2, 3, 4]
deltrnstim = np.hstack([np.roll(trnstim, d, 0) for d in delays])
delvalstim = np.hstack([np.roll(valstim, d, 0) for d in delays])

sdeltrnstim = scipy.sparse.csr_matrix(deltrnstim)

ebamask = cortex.get_roi_mask("MLfs", "20121210ML_auto1", roi="EBA")["EBA"] > 0

trndata = data.get_train(masked=ebamask)
# use first block for noise covariance estimation
valdata_repeats = data.get_val(masked=ebamask, repeats=True)[:90]
# use second and third block for evaluation
valdata = data.get_val(masked=ebamask)[90:]

# zscore it?
valdata_repeats = ((valdata_repeats -
                   valdata_repeats.mean(0)[np.newaxis, ...]) /
                   valdata_repeats.std(0)[np.newaxis, ...])

valdata_noise = valdata_repeats - valdata_repeats.mean(-1)[..., np.newaxis]



# fit Independent Ridge Regression
Example #14
def main():
    data = get_train() + get_test()

    f = file('generated/extracted_text', 'w')

    for i, item in enumerate(data):
        # status update
        if (i % 500) == 0:
            print i, datetime.datetime.now().time()

        #  parse file
        data = {}
        soup = boil_soup(item['urlid'])

        # given boilerplate
        data['boilerplate'] = [item['title'], item['body']]

        # extract text
        extractor = Extractor(extractor='ArticleExtractor', html=unicode(soup))
        data['boilerpipe'] = [extractor.getText()]

        # remove non-text tags
        for tag in ['script', 'style']:
            for el in soup.find_all(tag):
                el.extract()

        # extract text for each tag
        for tag in TAGS:
            items = []
            for el in soup.find_all(tag):
                el.extract()

                if tag == 'img':
                    try:
                        items.append(el['alt'])
                    except KeyError:
                        pass
                    try:
                        items.append(el['title'])
                    except KeyError:
                        pass
                else:
                    items.append(el.text)

            data[tag] = items

        # extract meta tags
        meta = soup.find_all('meta')
        for el in meta:
            prop = el.get('property') if el.get('property') else el.get('name')
            if not prop:
                continue
            prop = prop.lower()
            try:
                s = unicode(el['content'])
            except:
                continue

            data['meta-' + prop] = s.split(u',') if prop == 'keywords' else [s]

        # preprocess string
        for item in data:
            data[item] = map(clean_string, data[item])
            data[item] = filter(None, data[item])

        print >> f, json.dumps(data)

    f.close()
Example #15
    outdir = "../data/clean/"
    index = data.index_wiki('../data/wiki-pages')
    for file in tqdm(index.keys()):
        wiki = data.get_wiki(file)
        lines = wiki["lines"].apply(
            lambda l: "<SPLIT>".join(data.clean_article(l)))
        wiki["text"] = lines
        wiki = wiki.drop("lines", axis=1).reset_index()
        new_file = outdir + file.split("/")[-1]
        wiki.to_json(new_file, orient="records", lines=True)
    ###########################################################################
    #                                  Setup                                  #
    ###########################################################################

    # Load the data
    train = data.get_train("../data/train.jsonl")
    train = train.explode("evidence").reset_index()
    train, test = train_test_split(train)

    # Load the model
    embedder = ret.SentEmbed("distilroberta-base-msmarco-v2")

    # Build the dataset objects and loaders
    train_dataset = data.SentenceDataset(train, embedder, "../data/wiki.db", 4)
    test_dataset = data.SentenceDataset(test, embedder, "../data/wiki.db", 4)

    train_loader = DataLoader(
        train_dataset,
        batch_size=64,
        shuffle=True,
        collate_fn=train_dataset.collate,
Example #16
sdeltrnstim = deltrnstim = np.nan_to_num(zs(deltrnstim))
sdelvalstim = delvalstim = np.nan_to_num(zs(delvalstim))

# Select some voxels
cort_mask = cortex.get_cortical_mask("MLfs", "20121210ML_auto1", "thick")
#rois = ["V1", "V2", "V3"]
rois = ["V1"]
masks = [cortex.get_roi_mask("MLfs",
                             "20121210ML_auto1",
                             roi=roi)[roi] > 0 for roi in rois]
# combine the individual ROI masks (elementwise OR for boolean masks);
# note that in Python 3 `reduce` must be imported from functools
roimask = reduce(lambda x, y: (x + y), masks)
# cortical voxels outside the selected ROIs
wardmask = cort_mask - roimask

# Load training, test fMRI data
trndata_roi = np.nan_to_num(data.get_train(masked=roimask)[:numtime])
trndata_ward = np.nan_to_num(data.get_train(masked=wardmask)[:numtime])

connectivity = image.grid_to_graph(n_x=wardmask.shape[0],
                                   n_y=wardmask.shape[1],
                                   n_z=wardmask.shape[2],
                                   mask=wardmask)
ward = WardAgglomeration(n_clusters=numclusters, connectivity=connectivity,
                         memory='nilearn_cache')
ward.fit(trndata_ward)
labels = ward.labels_
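# The lines below average the time series of the voxels in each Ward cluster
# and append those cluster means as extra columns next to the ROI voxel data.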
trndata_collapsed = np.array([trndata_ward[:, labels == i].mean(1)
                              for i in range(numclusters)])
trndata = np.hstack((trndata_roi, trndata_collapsed.T))
valdata = data.get_val(masked=roimask)