Example #1
def train_and_test(args, print_config):

    assert args.conv_layer_n == len(args.filter_widths) == len(args.nkerns) == (len(args.L2_regs) - 2) == len(args.fold_flags) == len(args.ks)

    # \mod{dim, 2^{\sum fold_flags}} == 0
    assert args.embed_dm % (2 ** sum(args.fold_flags)) == 0
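    # e.g. with two folding layers (sum(fold_flags) == 2) embed_dm must be a
    # multiple of 4, since each folding layer halves the embedding dimension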
    
    ###################
    # get the data    #
    ###################
    datasets = load_data(args.corpus_path)
    
    train_set_x, train_set_y = datasets[0]
    dev_set_x, dev_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]
    word2index = datasets[3]
    index2word = datasets[4]
    pretrained_embeddings = datasets[5]

    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / args.batch_size
    n_dev_batches = dev_set_x.get_value(borrow=True).shape[0] / args.dev_test_batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / args.dev_test_batch_size
    
    train_sent_len = train_set_x.get_value(borrow=True).shape[1]
    possible_labels =  set(train_set_y.get_value().tolist())
    
    if args.use_pretrained_embedding:
        args.embed_dm = pretrained_embeddings.get_value().shape[1]
        
    ###################################
    # Symbolic variable definition    #
    ###################################
    x = T.imatrix('x') # the word indices matrix
    y = T.ivector('y') # the sentiment labels

    batch_index = T.iscalar('batch_index')
    
    rng = np.random.RandomState(1234)
    
    ###############################
    # Construction of the network #
    ###############################
    # Layer 1, the embedding layer
    layer1 = WordEmbeddingLayer(rng, 
                                input = x, 
                                vocab_size = len(word2index),
                                embed_dm = args.embed_dm, 
                                embeddings = (
                                    pretrained_embeddings 
                                    if args.use_pretrained_embedding else None
                                )
    )
    
    dropout_layers = [layer1]
    layers = [layer1]
    
    for i in xrange(args.conv_layer_n):
        fold_flag = args.fold_flags[i]
        
        # for the dropout layer
        dpl = DropoutLayer(
            input = dropout_layers[-1].output,
            rng = rng, 
            dropout_rate = args.dropout_rates[i]  # this layer's dropout rate
        ) 
        next_layer_dropout_input = dpl.output
        next_layer_input = layers[-1].output
        
        # for the conv layer
        filter_shape = (
            args.nkerns[i],
            (1 if i == 0 else args.nkerns[i-1]), 
            1, 
            args.filter_widths[i]
        )
        
        k = args.ks[i]
        
        print "For conv layer(%s) %d, filter shape = %r, k = %d, dropout_rate = %f and normalized weight init: %r and fold: %d" %(
            args.conv_activation_unit, 
            i+2, 
            filter_shape, 
            k, 
            args.dropout_rates[i], 
            args.norm_w, 
            fold_flag
        )
        
        # we add two layers feeding two paths respectively:
        # one for training (with dropout),
        # the other for prediction (the averaged model)

        dropout_conv_layer = ConvFoldingPoolLayer(rng, 
                                                  input = next_layer_dropout_input,
                                                  filter_shape = filter_shape, 
                                                  k = k, 
                                                  norm_w = args.norm_w,
                                                  fold = fold_flag,
                                                  activation = args.conv_activation_unit)
    
        # for prediction,
        # sharing weights with the dropout layer
        conv_layer = ConvFoldingPoolLayer(rng, 
                                          input = next_layer_input,
                                          filter_shape = filter_shape,
                                          k = k,
                                          activation = args.conv_activation_unit,
                                          fold = fold_flag,
                                          W = dropout_conv_layer.W * (1 - args.dropout_rates[i]), # model averaging
                                          b = dropout_conv_layer.b
        )

        dropout_layers.append(dropout_conv_layer)
        layers.append(conv_layer)
    
    # finally, the output layer,
    # both with and without dropout
    if sum(args.fold_flags) > 0:
        n_in = args.nkerns[-1] * args.ks[-1] * args.embed_dm / (2**sum(args.fold_flags))
    else:
        n_in = args.nkerns[-1] * args.ks[-1] * args.embed_dm
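    # for illustration: with nkerns[-1] = 14, ks[-1] = 4, embed_dm = 48 and
    # one folding layer, n_in = 14 * 4 * 24 = 1344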
        
    print "For output layer, n_in = %d, dropout_rate = %f" %(n_in, args.dropout_rates[-1])
    
    dropout_output_layer = LogisticRegression(
        rng,
        input = dropout_layers[-1].output.flatten(2), 
        n_in = n_in, # already divided by 2 for each folding layer
        n_out = len(possible_labels) # number of sentiment levels
    )

    output_layer = LogisticRegression(
        rng,
        input = layers[-1].output.flatten(2), 
        n_in = n_in,
        n_out = len(possible_labels),
        W = dropout_output_layer.W * (1 - args.dropout_rates[-1]), # sharing the parameters, don't forget
        b = dropout_output_layer.b
    )
    
    dropout_layers.append(dropout_output_layer)
    layers.append(output_layer)

    ###############################
    # Error and cost              #
    ###############################
    # the cost and the error come from different models (dropout vs. averaged)!
    dropout_cost = dropout_output_layer.nnl(y)
    errors = output_layer.errors(y)
    
    def prepare_L2_sqr(param_layers, L2_regs):
        assert len(L2_regs) == len(param_layers)
        return T.sum([
            L2_reg / 2 * ((layer.W if hasattr(layer, "W") else layer.embeddings) ** 2).sum()
            for L2_reg, layer in zip(L2_regs, param_layers)
        ])
    L2_sqr = prepare_L2_sqr(dropout_layers, args.L2_regs)
    L2_sqr_no_ebd = prepare_L2_sqr(dropout_layers[1:], args.L2_regs[1:])
    
    if args.use_L2_reg:
        cost = dropout_cost + L2_sqr
        cost_no_ebd = dropout_cost + L2_sqr_no_ebd
    else:
        cost = dropout_cost
        cost_no_ebd = dropout_cost
    
    ###############################
    # Parameters to be used       #
    ###############################
    print "Delay embedding learning by %d epochs" %(args.embedding_learning_delay_epochs)
        
    print "param_layers: %r" %dropout_layers
    param_layers = dropout_layers
    
    ##############################
    # Parameter Update           #
    ##############################
    print "Using AdaDelta with rho = %f and epsilon = %f" %(args.rho, args.epsilon)
    
    params = [param for layer in param_layers for param in layer.params]
    param_shapes = [shape for layer in param_layers for shape in layer.param_shapes]
    
    param_grads = [T.grad(cost, param) for param in params]
        
    # AdaDelta parameter update
    # E[g^2]
    # initialized to zero
    egs = [
        theano.shared(
            value = np.zeros(param_shape,
                             dtype = theano.config.floatX
                         ),
            borrow = True,        
            name = "Eg:" + param.name
        )
        for param_shape, param in zip(param_shapes, params)
    ]
    
    # E[\delta x^2], initialized to zero
    exs = [
        theano.shared(
            value = np.zeros(param_shape,
                             dtype = theano.config.floatX
                         ),
            borrow = True,        
            name = "Ex:" + param.name
        )
        for param_shape, param in zip(param_shapes, params)
    ]        
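    # AdaDelta recurrences implemented below:
    #   E[g^2]_t  = rho * E[g^2]_{t-1} + (1 - rho) * g_t^2
    #   delta_x_t = - sqrt(E[dx^2]_{t-1} + eps) / sqrt(E[g^2]_t + eps) * g_t
    #   E[dx^2]_t = rho * E[dx^2]_{t-1} + (1 - rho) * delta_x_t^2
    #   x_{t+1}   = x_t + delta_x_t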
    
    new_egs = [
        args.rho * eg + (1 - args.rho) * g ** 2
        for eg, g in zip(egs, param_grads)
    ]
        
    delta_x = [
        -(T.sqrt(ex + args.epsilon) / T.sqrt(new_eg + args.epsilon)) * g
        for new_eg, ex, g in zip(new_egs, exs, param_grads)
    ]    
    
    new_exs = [
        args.rho * ex + (1 - args.rho) * (dx ** 2)
        for ex, dx in zip(exs, delta_x)
    ]    
    
    egs_updates = zip(egs, new_egs)
    exs_updates = zip(exs, new_exs)
    param_updates = [
        (p, p + dx)
        for dx, g, p in zip(delta_x, param_grads, params)
    ]

    updates = egs_updates + exs_updates + param_updates
    
    # updates WITHOUT embedding
    # exclude the embedding parameter
    egs_updates_no_ebd = zip(egs[1:], new_egs[1:])
    exs_updates_no_ebd = zip(exs[1:], new_exs[1:])
    param_updates_no_ebd = [
        (p, p + dx)
        for dx, g, p in zip(delta_x, param_grads, params)[1:]
    ]
    updates_no_emb = egs_updates_no_ebd + exs_updates_no_ebd + param_updates_no_ebd
    
    def make_train_func(cost, updates):
        return theano.function(inputs = [batch_index],
                               outputs = [cost], 
                               updates = updates,
                               givens = {
                                   x: train_set_x[batch_index * args.batch_size: (batch_index + 1) * args.batch_size],
                                   y: train_set_y[batch_index * args.batch_size: (batch_index + 1) * args.batch_size]
                               }
        )        

    train_model_no_ebd = make_train_func(cost_no_ebd, updates_no_emb)
    train_model = make_train_func(cost, updates)

    def make_error_func(x_val, y_val):
        return theano.function(inputs = [],
                               outputs = errors, 
                               givens = {
                                   x: x_val,
                                   y: y_val
                               }, 
                           )
        
    dev_error = make_error_func(dev_set_x, dev_set_y)

    test_error = make_error_func(test_set_x, test_set_y)
    

    #############################
    # Debugging purpose code    #
    #############################
    # PARAMETER TUNING NOTE:
    # some demonstration of the gradient vanishing problem
    
    train_data_at_index = {
        x: train_set_x[batch_index * args.batch_size: (batch_index + 1) * args.batch_size],
    }

    train_data_at_index_with_y = {
        x: train_set_x[batch_index * args.batch_size: (batch_index + 1) * args.batch_size],
        y: train_set_y[batch_index * args.batch_size: (batch_index + 1) * args.batch_size]
    }

    if print_config["nnl"]:
        get_nnl = theano.function(
            inputs = [batch_index],
            outputs = dropout_cost,
            givens = {
                x: train_set_x[batch_index * args.batch_size: (batch_index + 1) * args.batch_size],
                y: train_set_y[batch_index * args.batch_size: (batch_index + 1) * args.batch_size]
            }
        )
        
    if print_config["L2_sqr"]:
        get_L2_sqr = theano.function(
            inputs = [],
            outputs = L2_sqr
        )

        get_L2_sqr_no_ebd = theano.function(
            inputs = [],
            outputs = L2_sqr_no_ebd
        )
        
    if print_config["grad_abs_mean"]:
        print_grads = theano.function(
            inputs = [], 
            outputs = [theano.printing.Print(param.name)(
                T.mean(T.abs_(param_grad))
            )
                       for param, param_grad in zip(params, param_grads)
                   ], 
            givens = {
                x: train_set_x,
                y: train_set_y
            }
        )

    activations = [
        l.output
        for l in dropout_layers[1:-1]
    ]
    weight_grads = [
        T.grad(cost, l.W)
        for l in dropout_layers[1:-1]
    ]

    if print_config["activation_hist"]:
        # turn into 1D array
        get_activations = theano.function(
            inputs = [batch_index], 
            outputs = [
                val.flatten(1)
                for val in activations
            ], 
            givens = train_data_at_index
        )

    if print_config["weight_grad_hist"]:
        # turn into 1D array
        get_weight_grads = theano.function(
            inputs = [batch_index], 
            outputs = [
                val.flatten(1)
                for val in weight_grads
            ], 
            givens = train_data_at_index_with_y
        )
        
    if print_config["activation_tracking"]:
        # get the mean and standard deviation of activations for each conv layer
        
        get_activation_mean = theano.function(
            inputs = [batch_index], 
            outputs = [
                T.mean(val)
                for val in activations
            ], 
            givens = train_data_at_index
        )

        get_activation_std = theano.function(
            inputs = [batch_index], 
            outputs = [
                T.std(val)
                for val in activations
            ], 
            givens = train_data_at_index
        )


    if print_config["weight_grad_tracking"]:
        # get the mean and standard deviation of the weight gradients for each conv layer
        get_weight_grad_mean = theano.function(
            inputs = [batch_index], 
            outputs = [
                T.mean(g)
                for g in weight_grads
            ], 
            givens = train_data_at_index_with_y
        )

        get_weight_grad_std = theano.function(
            inputs = [batch_index], 
            outputs = [
                T.std(g)
                for g in weight_grads
            ], 
            givens = train_data_at_index_with_y
        )        
    
    # the training loop
    patience = args.patience  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant

    validation_frequency = min(n_train_batches, patience / 2)

    best_validation_loss = np.inf
    best_iter = 0

    start_time = time.clock()
    done_looping = False
    epoch = 0
    
    nnls = []
    L2_sqrs = []
    
    activation_means = [[] for i in xrange(args.conv_layer_n)]
    activation_stds = [[] for i in xrange(args.conv_layer_n)]
    weight_grad_means = [[] for i in xrange(args.conv_layer_n)]
    weight_grad_stds = [[] for i in xrange(args.conv_layer_n)]
    activation_hist_data = [[] for i in xrange(args.conv_layer_n)]
    weight_grad_hist_data = [[] for i in xrange(args.conv_layer_n)]

    train_errors = []
    dev_errors = []
    try:
        print "validation_frequency = %d" %validation_frequency
        while (epoch < args.n_epochs):
            epoch += 1
            print "At epoch {0}".format(epoch)

            if epoch == (args.embedding_learning_delay_epochs + 1):
                print "########################"
                print "Start training embedding"
                print "########################"

            # shuffle the training data        
            train_set_x_data = train_set_x.get_value(borrow = True)
            train_set_y_data = train_set_y.get_value(borrow = True)        
            
            permutation = np.random.permutation(train_set_x.get_value(borrow=True).shape[0])

            train_set_x.set_value(train_set_x_data[permutation])
            train_set_y.set_value(train_set_y_data[permutation])
            for minibatch_index in xrange(n_train_batches):
                if epoch >= (args.embedding_learning_delay_epochs + 1):
                    train_cost = train_model(minibatch_index)
                else:
                    train_cost = train_model_no_ebd(minibatch_index)


                iter = (epoch - 1) * n_train_batches + minibatch_index
                
                if (iter + 1) % validation_frequency == 0:

                    # train_error_val = np.mean([train_error(i)
                    #                            for i in xrange(n_train_batches)])
                    dev_error_val = dev_error()
                    
                    # print "At epoch %d and minibatch %d. \nTrain error %.2f%%\nDev error %.2f%%\n" %(
                    #     epoch, 
                    #     minibatch_index,
                    #     train_error_val * 100, 
                    #     dev_error_val * 100
                    # )

                    print "At epoch %d and minibatch %d. \nDev error %.2f%%\n" %(
                        epoch, 
                        minibatch_index,
                        dev_error_val * 100
                    )
                    
                    # train_errors.append(train_error_val)
                    dev_errors.append(dev_error_val)
                    
                    if dev_error_val < best_validation_loss:
                        best_iter = iter
                        #improve patience if loss improvement is good enough
                        if dev_error_val < best_validation_loss *  \
                           improvement_threshold:
                            patience = max(patience, iter * patience_increase)

                        best_validation_loss = dev_error_val

                        test_error_val = test_error()

                        print(
                           (
                               '     epoch %i, minibatch %i/%i, test error of'
                                ' best dev error %f %%'
                            ) %
                            (
                                epoch,
                                minibatch_index + 1,
                                n_train_batches,
                                test_error_val * 100.
                            )
                        )

                        print "Dumping model to %s" %(args.model_path)
                        dump_params(params, args.model_path)

                if (minibatch_index+1) % 50 == 0 or minibatch_index == n_train_batches - 1:
                    print "%d / %d minibatches completed" %(minibatch_index + 1, n_train_batches)                
                    if print_config["nnl"]:
                        print "`nnl` for the past 50 minibatches is %f" %(np.mean(np.array(nnls)))
                        nnls = []
                    if print_config["L2_sqr"]:
                        print "`L2_sqr`` for the past 50 minibatches is %f" %(np.mean(np.array(L2_sqrs)))
                        L2_sqrs = []                                                                            
                    
                ##################
                # Plotting stuff #
                ##################
                if print_config["nnl"]:
                    nnl = get_nnl(minibatch_index)
                    # print "nll for batch %d: %f" %(minibatch_index, nnl)
                    nnls.append(nnl)
                    
                if print_config["L2_sqr"]:
                    if epoch >= (args.embedding_learning_delay_epochs + 1):
                        L2_sqrs.append(get_L2_sqr())
                    else:
                        L2_sqrs.append(get_L2_sqr_no_ebd())
                    
                if print_config["activation_tracking"]:
                    layer_means = get_activation_mean(minibatch_index)
                    layer_stds = get_activation_std(minibatch_index)
                    for layer_ms, layer_ss, layer_m, layer_s in zip(activation_means, activation_stds, layer_means, layer_stds):
                        layer_ms.append(layer_m)
                        layer_ss.append(layer_s)

                if print_config["weight_grad_tracking"]:
                    layer_means = get_weight_grad_mean(minibatch_index)
                    layer_stds = get_weight_grad_std(minibatch_index)
                    
                    for layer_ms, layer_ss, layer_m, layer_s in zip(weight_grad_means, weight_grad_stds, layer_means, layer_stds):
                        layer_ms.append(layer_m)
                        layer_ss.append(layer_s)

                if print_config["activation_hist"]:
                    for layer_hist, layer_data in zip(activation_hist_data , get_activations(minibatch_index)):
                        layer_hist += layer_data.tolist()

                if print_config["weight_grad_hist"]:
                    for layer_hist, layer_data in zip(weight_grad_hist_data , get_weight_grads(minibatch_index)):
                        layer_hist += layer_data.tolist()
                                    
    except:
        import traceback
        traceback.print_exc(file = sys.stdout)
    finally:
        from plot_util import (plot_hist, 
                               plot_track, 
                               plot_error_vs_epoch, 
                               plt)

        if print_config["activation_tracking"]:
            plot_track(activation_means, 
                          activation_stds, 
                          "activation_tracking")

        if print_config["weight_grad_tracking"]:
            plot_track(weight_grad_means, 
                          weight_grad_stds,
                          "weight_grad_tracking")
            
        if print_config["activation_hist"]:        
            plot_hist(activation_hist_data, "activation_hist")

        if print_config["weight_grad_hist"]:
            plot_hist(weight_grad_hist_data, "weight_grad_hist")

        if print_config["error_vs_epoch"]:
            train_errors = [0] * len(dev_errors)
            ax = plot_error_vs_epoch(train_errors, dev_errors, 
                                     title = ('Best dev score: %f %% '
                                              ' at iter %i with test error %f %%') %(
                                                  best_validation_loss * 100., best_iter + 1, test_error_val * 100.
                                              )
            )
        if not args.task_signature:
            plt.show()
        else:
            plt.savefig("plots/" + args.task_signature + ".png")
    
    end_time = time.clock()
    
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_error_val * 100.))
    
    # save the result
    with open(args.output, "a") as f:
        f.write("%s\t%f\t%f\n" %(args.task_signature, best_validation_loss, test_error_val))
        
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
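A minimal NumPy sketch (not part of the example above) of the dropout weight-scaling trick the paired layers rely on: the training path masks its inputs at random, while the prediction path keeps all inputs and scales the shared weights by the keep probability, mirroring W = dropout_conv_layer.W * (1 - dropout_rate) above.

import numpy as np

rng = np.random.RandomState(0)
W = rng.randn(4, 3)     # a trained weight matrix
x = rng.randn(10, 4)    # a batch of inputs
dropout_rate = 0.5

# training path: randomly zero out inputs
mask = rng.binomial(1, 1 - dropout_rate, size=x.shape)
train_out = (x * mask).dot(W)

# prediction path: no mask, weights scaled by the keep probability
predict_out = x.dot(W * (1 - dropout_rate))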
Example #2
def main(train_path, validation_path, save_path):
    """Problem 2: Logistic regression for imbalanced labels.

    Run under the following conditions:
        1. naive logistic regression
        2. upsampling minority class

    Args:
        train_path: Path to CSV file containing training set.
        validation_path: Path to CSV file containing validation set.
        save_path: Path to save predictions.
    """
    output_path_naive = save_path.replace(WILDCARD, 'naive')
    output_path_upsampling = save_path.replace(WILDCARD, 'upsampling')

    # *** START CODE HERE ***
    # Part (b): Vanilla logistic regression
    # Make sure to save predicted probabilities to output_path_naive using np.savetxt()
    print("Vanilla Logistic Regression:")
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)

    x_val, y_val = util.load_dataset(validation_path, add_intercept=True)

    clf = LogisticRegression()
    clf.fit(x_train, y_train)
    y_predict = clf.predict(x_val)

    np.savetxt(output_path_naive, y_predict)
    y_predict = y_predict >= 0.5
    util.plot(x_val, y_predict, clf.theta, output_path_naive[:-4])

    accuracy = np.mean(y_predict == y_val)
    A_0 = np.sum((y_predict == 0) * (y_val == 0)) / np.sum(y_val == 0)
    A_1 = np.sum((y_predict == 1) * (y_val == 1)) / np.sum(y_val == 1)
    balanced_accuracy = 0.5 * (A_0 + A_1)
    print("Accuracy: {},\nAccuracy for class 0: {},\nAccuracy for class 1: {},"
          "\nBalanced Accuracy: {}".format(accuracy, A_0, A_1,
                                           balanced_accuracy))

    # plot the true labels from the validation set:
    util.plot(x_val, y_val, clf.theta, output_path_naive[:-4] + "validation")
    # Part (d): Upsampling minority class
    # Make sure to save predicted probabilities to output_path_upsampling using np.savetxt()
    # Repeat minority examples 1 / kappa times
    num_add = int(1 / kappa) - 1
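    # appending num_add extra copies means each positive example appears
    # num_add + 1 = 1 / kappa times in total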

    x_train = np.concatenate(
        (x_train, np.repeat(x_train[y_train == 1, :], num_add, axis=0)),
        axis=0)
    y_train = np.concatenate(
        (y_train, np.repeat(y_train[y_train == 1], num_add, axis=0)), axis=0)

    x_val, y_val = util.load_dataset(validation_path, add_intercept=True)

    clf = LogisticRegression()
    clf.fit(x_train, y_train)
    y_predict = clf.predict(x_val)

    np.savetxt(output_path_upsampling, y_predict)
    y_predict = y_predict >= 0.5
    util.plot(x_val, y_predict, clf.theta, output_path_upsampling[:-4])

    accuracy = np.mean(y_predict == y_val)
    A_0 = np.sum((y_predict == 0) * (y_val == 0)) / np.sum(y_val == 0)
    A_1 = np.sum((y_predict == 1) * (y_val == 1)) / np.sum(y_val == 1)
    balanced_accuracy = 0.5 * (A_0 + A_1)
    print("Accuracy: {},\nAccuracy for class 0: {},\nAccuracy for class 1: {},"
          "\nBalanced Accuracy: {}".format(accuracy, A_0, A_1,
                                           balanced_accuracy))
    # plot the true labels from the validation set:
    util.plot(x_val, y_val, clf.theta,
              output_path_upsampling[:-4] + "validation")
Example #3
        plt.plot(costs_train[i],
                 "--",
                 color=color,
                 label="Train, lambda = {:g}".format(lmbda))
        plt.plot(costs_test[i],
                 color=color,
                 label="Test, lambda = {:g}".format(lmbda))

    plt.legend(loc="upper right")
    plt.savefig("results/cost_lmbda.pdf")
    plt.show()

if mode == "logreg":
    batch_size = 100
    n_batches = int(Xtrain.shape[0] / batch_size)
    logReg = LogisticRegression(n_batches=n_batches, allow_early_stop=False)

    etas = [1, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5]
    acc_list = []

    accuracys_train = []
    costs_train = []
    accuracys_test = []
    costs_test = []

    for eta in etas:
        a, b, c, d = logReg.fit(Xtrain,
                                ytrain,
                                eta=eta,
                                n_epochs=2000,
                                Xtest=Xtest,
def evaluatePerformance(numTrials=1000):
    '''
    Evaluate the performance of decision trees and logistic regression,
    average over 1,000 trials of 10-fold cross validation

    Return:
      a matrix giving the performance that will contain the following entries:
      stats[0,0] = mean accuracy of decision tree
      stats[0,1] = std deviation of decision tree accuracy
      stats[1,0] = mean accuracy of logistic regression
      stats[1,1] = std deviation of logistic regression accuracy

    ** Note that your implementation must follow this API**
    '''

    # Load Data
    filename = 'data/SPECTF.dat'
    data = np.loadtxt(filename, delimiter=',')
    X = data[:, 1:]
    y = np.array([data[:, 0]]).T
    n,d = X.shape

    # Standardize the data
    mean = X.mean(axis=0)
    std = X.std(axis=0)
    X = (X - mean) / std

    #1000 trials
    num_folds = 10
    percent_incs = 10
    tree_accuracy = np.zeros(shape=[numTrials*num_folds,percent_incs])
    log_accuracy = np.zeros(shape=[numTrials*num_folds,percent_incs])

    #split the data
    k_fold = sklearn.cross_validation.KFold(len(y), n_folds=num_folds)

    for i in xrange(numTrials):
        #for each trial, shuffle the data
        #print 'Iteration: ', i+1
        idx = np.arange(n)
        np.random.seed(13)
        np.random.shuffle(idx)
        X = X[idx]
        y = y[idx]

        j = 0
        for train_index, test_index in k_fold:
            for k in xrange(percent_incs):
                #get the data splits for the current fold
                Xtrain, Xtest = X[train_index[0:(n/percent_incs)*(k+1)]], X[test_index]
                ytrain, ytest = y[train_index[0:(n/percent_incs)*(k+1)]], y[test_index]

                # train the decision tree
                clf = tree.DecisionTreeClassifier()
                clf = clf.fit(Xtrain, ytrain)

                # output tree predictions on the remaining data and check them
                tree_pred = clf.predict(Xtest)
                tree_accuracy[i*num_folds + j,k] = accuracy_score(ytest, tree_pred)

                # train logistic regression
                logregModel = LogisticRegression(alpha = 0.1, epsilon = 0.005)
                logregModel.fit(Xtrain, ytrain)

                #output logreg predictions on the remaining data and check them
                log_pred = logregModel.predict(Xtest)
                log_accuracy[i*num_folds + j,k] = accuracy_score(ytest, log_pred)

            j += 1

    # compute the mean test accuracy using the full training set
    meanDecisionTreeAccuracy = np.mean(tree_accuracy[:,percent_incs-1])

    # TODO: update these statistics based on the results of your experiment
    stddevDecisionTreeAccuracy = np.std(tree_accuracy[:,percent_incs-1])
    meanLogisticRegressionAccuracy = np.mean(log_accuracy[:,percent_incs-1])
    stddevLogisticRegressionAccuracy = np.std(log_accuracy[:,percent_incs-1])

    # plot the learning curve
    tree_array = np.zeros(percent_incs)
    tree_array_std = np.zeros(percent_incs)
    log_array = np.zeros(percent_incs)
    log_array_std = np.zeros(percent_incs)
    for i in xrange(percent_incs):
        tree_array[i] = np.mean(tree_accuracy[:,i])
        tree_array_std[i] = np.std(tree_accuracy[:,i])
        log_array[i] = np.mean(log_accuracy[:,i])
        log_array_std[i] = np.std(log_accuracy[:,i])

    x_axis = (np.arange(percent_incs) + 1) * 10
    tree_plot = plt.errorbar(x=x_axis, y=tree_array, yerr=tree_array_std)
    log_plot = plt.errorbar(x=x_axis, y=log_array, yerr=log_array_std)
    plt.xlabel('Training Data Used (percentage)')
    plt.ylabel('Accuracy (mean)')
    plt.title('Learning Curve')
    plt.axis([10, 100, 0.0, 1.0])
    plt.grid(True)
    plt.legend([tree_plot, log_plot], ["Decision Tree", "Logistic Regression"], loc=4)

    plt.savefig('learningcurve.pdf')
    #plt.show()

    # make certain that the return value matches the API specification
    stats = np.zeros((2,2))
    stats[0,0] = meanDecisionTreeAccuracy
    stats[0,1] = stddevDecisionTreeAccuracy
    stats[1,0] = meanLogisticRegressionAccuracy
    stats[1,1] = stddevLogisticRegressionAccuracy
    return stats
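A minimal usage sketch for the function above (numTrials is lowered here only for a quick run; the 1,000-trial setting from the docstring is the intended experiment):

stats = evaluatePerformance(numTrials=10)
print "Decision tree accuracy:       %.3f +/- %.3f" % (stats[0, 0], stats[0, 1])
print "Logistic regression accuracy: %.3f +/- %.3f" % (stats[1, 0], stats[1, 1])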
Example #5
    def __init__(self, x, y, batch_size, videos, kernels, pools, n_input, n_output, hidden_input, params=None):
        learning_rate = 0.1
        rng = numpy.random.RandomState(1234)

        print '... building the model'
        sys.stdout.flush()

        if not params:
            # Construct the first convolutional pooling layer:
            # filtering reduces the image size to (28-5+1,28-5+1)=(24,24)
            # maxpooling reduces this further to (24/2,24/2) = (12,12)
            # 4D output tensor is thus of shape (batch_size,nkerns[0],12,12)
            layer0 = ConvLayer(x, n_input[0], n_output[0], kernels[0], videos[0], pools[0],
                               batch_size, 'L0', rng)

            layer1 = ConvLayer(layer0.output, n_input[1], n_output[1], kernels[1], videos[1], pools[1],
                               batch_size, 'L1', rng)

            layer2_input = layer1.output.flatten(2)

            # construct a fully-connected sigmoidal layer
            layer2 = HiddenLayer(rng, input=layer2_input, n_in=hidden_input,
                                 n_out=batch_size, activation=T.tanh)

            # classify the values of the fully-connected sigmoidal layer
            layer3 = LogisticRegression(input=layer2.output, n_in=batch_size, n_out=2)
        else:

            layer0 = ConvLayer(x, n_input[0], n_output[0], kernels[0], videos[0], pools[0],
                               batch_size, 'L0', rng, True, params[6], params[7])

            layer1 = ConvLayer(layer0.output, n_input[1], n_output[1], kernels[1], videos[1], pools[1],
                               batch_size, 'L1', rng, True, params[4], params[5])

            layer2_input = layer1.output.flatten(2)

            # construct a fully-connected sigmoidal layer
            layer2 = HiddenLayer(rng, input=layer2_input, n_in=hidden_input,
                                 n_out=batch_size, activation=T.tanh, W=params[2], b=params[3])

            # classify the values of the fully-connected sigmoidal layer
            layer3 = LogisticRegression(input=layer2.output, n_in=batch_size, n_out=2, W=params[0], b=params[1])

        # the cost we minimize during training is the NLL of the model
        cost = layer3.negative_log_likelihood(y)

        # create a list of all model parameters to be fit by gradient descent
        self.params = layer3.params + layer2.params + layer1.params + layer0.params

        # create a list of gradients for all model parameters
        grads = T.grad(cost, self.params)

        # train_model is a function that updates the model parameters by
        # SGD Since this model has many parameters, it would be tedious to
        # manually create an update rule for each model parameter. We thus
        # create the updates list by automatically looping over all
        # (params[i],grads[i]) pairs.
        updates = []
        for param_i, grad_i in zip(self.params, grads):
            updates.append((param_i, param_i - learning_rate * grad_i))

        self.train_model = theano.function([x, y], cost, updates=updates)
        self.validate_model = theano.function(inputs=[x, y], outputs=layer3.errors(y))
        self.predict = theano.function(inputs=[x], outputs=layer3.y_pred)

        print '... building done'
        sys.stdout.flush()
if __name__ == "__main__":
    # Load Data
    filename = 'data/data1.dat'
    data = loadtxt(filename, delimiter=',')
    X = data[:, 0:2]
    y = np.array([data[:, 2]]).T
    n, d = X.shape

    # Standardize the data
    mean = X.mean(axis=0)
    std = X.std(axis=0)
    X = (X - mean) / std

    # train logistic regression
    logregModel = LogisticRegression(regLambda=0.0001)
    logregModel.fit(X, y)

    # Plot the decision boundary
    h = .02  # step size in the mesh
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = logregModel.predict(np.c_[xx.ravel(), yy.ravel()])
    print Z

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure(1, figsize=(4, 3))
    plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)
def main():

    print "############# Load Datasets ##############"

    import stanfordSentimentTreebank as sst

    skip_unknown_words = bool(args.get("--skip"))
    shuffle_flag = bool(args.get("--shuffle"))
    datatype = args.get("--datatype")
    if datatype == 5:
        # Fine-grained 5-class
        n_class = 5
    elif datatype == 2:
        # Binary 2-class
        n_class = 2

    # print "skip_unknown_words",skip_unknown_words
    vocab, index2word, datasets, datasets_all_sentences, funcs = sst.load_stanfordSentimentTreebank_dataset(
        normalize=True, skip_unknown_words=skip_unknown_words, datatype=datatype
    )
    train_set, test_set, dev_set = datasets
    train_set_sentences, test_set_sentences, dev_set_sentences = datasets_all_sentences
    get, sentence2ids, ids2sentence = funcs  # load the helper functions
    scores, sentences = zip(*train_set_sentences)
    sentences = [[word for word in sentence.lower().split()] for sentence in sentences]
    vocab_size = len(vocab)

    dev_unknown_count = sum([unknown_word_count for score, (ids, unknown_word_count) in dev_set])
    test_unknown_count = sum([unknown_word_count for score, (ids, unknown_word_count) in test_set])

    train_set = [(score, ids) for score, (ids, unknown_word_count) in train_set]
    test_set = [(score, ids) for score, (ids, unknown_word_count) in test_set]
    dev_set = [(score, ids) for score, (ids, unknown_word_count) in dev_set]

    print "train_size : ", len(train_set)
    print "dev_size   : ", len(dev_set)
    print "test_size  : ", len(test_set)
    print "-" * 30
    print "vocab_size: ", len(vocab)
    print "dev_unknown_words  : ", dev_unknown_count
    print "test_unknown_words : ", test_unknown_count

    print args

    # EMB_DIM = 50
    EMB_DIM = args.get("--emb_size")
    vocab_size = len(vocab)

    feat_map_n_1 = args.get("--feat_map_n_1")
    feat_map_n_final = args.get("--feat_map_n_final")

    height = 1
    width1 = args.get("--width1")
    width2 = args.get("--width2")
    k_top = args.get("--k_top")
    n_class = n_class
    alpha = args.get("--alpha")
    n_epoch = args.get("--n_epoch")
    dropout_rate0 = args.get("--dropout_rate0")
    dropout_rate1 = args.get("--dropout_rate1")
    dropout_rate2 = args.get("--dropout_rate2")
    activation = args.get("--activation")
    learn = args.get("--learn")
    number_of_convolutinal_layer = 2

    pretrain = args.get("--pretrain")
    if pretrain == "word2vec":
        print "*Using word2vec"
        embeddings_W, model = pretrained_embedding.use_word2vec(
            sentences=sentences, index2word=index2word, emb_dim=EMB_DIM
        )
        # initialized in the range -0.5 to 0.5
    elif pretrain == "glove":
        print "*Using glove"
        embeddings_W = pretrained_embedding.use_glove(
            sentences=sentences,
            index2word=index2word,
            emb_dim=EMB_DIM,
            model_file="glove_model/glove_50_iter2900.model",
        )
    else:
        embeddings_W = np.asarray(rng.normal(0, 0.05, size=(vocab_size, EMB_DIM)), dtype=theano.config.floatX)
        embeddings_W[0, :] = 0

    print np.amax(embeddings_W)
    print np.amin(embeddings_W)
    # print "*embeddings"
    print embeddings_W
    # print bool(embeddings)

    # input_x = [1, 3, 4, 5, 0, 22, 4, 5]

    print "############# Model Setting ##############"
    x = T.imatrix("x")
    length_x = T.iscalar("length_x")
    y = T.ivector("y")  # the sentence sentiment label
    embeddings = WordEmbeddingLayer(rng=rng, input=x, vocab_size=vocab_size, embed_dm=EMB_DIM, embeddings=embeddings_W)

    def dropout(X, p=0.5):
        if p > 0:
            retain_prob = 1 - p
            X *= srng.binomial(X.shape, p=retain_prob, dtype=theano.config.floatX)
            # X /= retain_prob
        return X
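    # note: the inverted-dropout rescaling (X /= retain_prob) is commented out
    # above; the *_no_dropout layers below compensate by scaling W with
    # (1 - dropout_rate) at prediction time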

    # number_of_convolutinal_layer = theano.shared(number_of_convolutinal_layer)
    # dynamic_func = theano.function(inputs=[length_x], outputs=number_of_convolutinal_layer * length_x)

    # dynamic_func_test = theano.function(
    #     inputs = [length_x],
    #     outputs = dynamic_func(length_x),
    #     )
    # print dynamic_func(len([1,2,3]))

    l1 = DynamicConvFoldingPoolLayer(
        rng,
        input=dropout(embeddings.output, p=dropout_rate0),
        filter_shape=(feat_map_n_1, 1, height, width1),  # two feature map, height: 1, width: 2,
        k_top=k_top,
        number_of_convolutinal_layer=number_of_convolutinal_layer,
        index_of_convolitonal_layer=1,
        length_x=length_x,
        activation=activation,
    )
    l1_no_dropout = DynamicConvFoldingPoolLayer(
        rng,
        input=embeddings.output,
        W=l1.W * (1 - dropout_rate0),
        b=l1.b,
        filter_shape=(feat_map_n_1, 1, height, width1),  # two feature map, height: 1, width: 2,
        k_top=k_top,
        number_of_convolutinal_layer=number_of_convolutinal_layer,
        index_of_convolitonal_layer=1,
        length_x=length_x,
        activation=activation,
    )

    l2 = DynamicConvFoldingPoolLayer(
        rng,
        input=dropout(l1.output, p=dropout_rate1),
        filter_shape=(feat_map_n_final, feat_map_n_1, height, width2),
        # two feature map, height: 1, width: 2,
        k_top=k_top,
        number_of_convolutinal_layer=number_of_convolutinal_layer,
        index_of_convolitonal_layer=2,
        length_x=length_x,
        activation=activation,
    )
    l2_no_dropout = DynamicConvFoldingPoolLayer(
        rng,
        input=l1_no_dropout.output,
        W=l2.W * (1 - dropout_rate1),
        b=l2.b,
        filter_shape=(feat_map_n_final, feat_map_n_1, height, width2),
        # two feature map, height: 1, width: 2,
        k_top=k_top,
        number_of_convolutinal_layer=number_of_convolutinal_layer,
        index_of_convolitonal_layer=2,
        length_x=length_x,
        activation=activation,
    )

    # l2_output = theano.function(
    #     inputs = [x,length_x],
    #     outputs = l2.output,
    #     # on_unused_input='ignore'
    # )

    # TODO:
    # check the dimension
    # input: 1 x 1 x 6 x 4
    # out = l2_output(
    #     np.array([input_x], dtype = np.int32),
    #     len(input_x),
    # )

    # test = theano.function(
    #     inputs = [x],
    #     outputs = embeddings.output,
    # )

    # print "--input--"
    # print np.array([input_x], dtype = np.int32).shape
    # print "--input embeddings--"
    # a = np.array([input_x], dtype = np.int32)
    # print test(a).shape
    # print "-- output --"
    # print out
    # print out.shape

    # x = T.dscalar("x")
    # b = T.dscalar("b")
    # a = 1
    # f = theano.function(inputs=[x,b], outputs=b * x + a)
    # print f(2,2)

    # expected = (1, feat_map_n, EMB_DIM / 2, k)
    # assert out.shape == expected, "%r != %r" %(out.shape, expected)

    ##### Test Part Three ###############
    # LogisticRegressionLayer
    #################################

    # print "############# LogisticRegressionLayer ##############"

    l_final = LogisticRegression(
        rng,
        input=dropout(l2.output.flatten(2), p=dropout_rate2),
        n_in=feat_map_n_final * k_top * EMB_DIM,
        # n_in = feat_map_n * k * EMB_DIM / 2, # we fold once, so divide by 2
        n_out=n_class,  # five sentiment level
    )

    l_final_no_dropout = LogisticRegression(
        rng,
        input=l2_no_dropout.output.flatten(2),
        W=l_final.W * (1 - dropout_rate2),
        b=l_final.b,
        n_in=feat_map_n_final * k_top * EMB_DIM,
        # n_in = feat_map_n * k * EMB_DIM / 2, # we fold once, so divide by 2
        n_out=n_class,  # five sentiment level
    )

    print "n_in : ", feat_map_n_final * k_top * EMB_DIM
    # print "n_in = %d" %(2 * 2 * math.ceil(EMB_DIM / 2.))

    # p_y_given_x = theano.function(
    #     inputs = [x, length_x],
    #     outputs = l_final.p_y_given_x,
    #     allow_input_downcast=True,
    #     # mode = "DebugMode"
    # )

    # print "p_y_given_x = "
    # print p_y_given_x(
    #     np.array([input_x], dtype=np.int32),
    #     len(input_x)
    # )

    cost = theano.function(
        inputs=[x, length_x, y],
        outputs=l_final.nnl(y),
        allow_input_downcast=True,
        # mode = "DebugMode"
    )

    # print "cost:\n", cost(
    #     np.array([input_x], dtype = np.int32),
    #     len(input_x),
    #     np.array([1], dtype = np.int32)
    # )

    print "############# Learning ##############"
    layers = []
    layers.append(embeddings)
    layers.append(l1)
    layers.append(l2)
    layers.append(l_final)

    cost = l_final.nnl(y)

    params = [p for layer in layers for p in layer.params]
    param_shapes = [l.param_shapes for l in layers]
    param_grads = [T.grad(cost, param) for param in params]

    def sgd(cost, params, lr=0.05):
        grads = [T.grad(cost, param) for param in params]
        updates = []
        for p, g in zip(params, grads):
            updates.append([p, p - g * lr])
        return updates

    from sgd import rmsprop, adagrad, adadelta, adam

    # updates = sgd(cost, l_final.params)

    # print param_grads
    if learn == "sgd":
        updates = sgd(cost, params, lr=0.05)
    elif learn == "adam":
        updates = adam(loss_or_grads=cost, params=params, learning_rate=alpha)
    elif learn == "adagrad":
        updates = adagrad(loss_or_grads=cost, params=params, learning_rate=alpha)
    elif learn == "adadelta":
        updates = adadelta(loss_or_grads=cost, params=params)
    elif learn == "rmsprop":
        updates = rmsprop(loss_or_grads=cost, params=params, learning_rate=alpha)

    train = theano.function(inputs=[x, length_x, y], outputs=cost, updates=updates, allow_input_downcast=True)
    # predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)
    predict = theano.function(
        inputs=[x, length_x],
        outputs=T.argmax(l_final_no_dropout.p_y_given_x, axis=1),
        allow_input_downcast=True,
        # mode = "DebugMode"
    )

    def b(x_data):
        return np.array(x_data, dtype=np.int32)

    def test(test_set):
        # print "############# TEST ##############"
        y_pred = []
        test_set_y = []
        # for train_x, train_y in zip(X_data, Y_data):
        # print test_set
        # Accuracy_count = 0
        for test_y, test_x in test_set:
            test_x = b([test_x])
            p = predict(test_x, len(test_x))[0]
            y_pred.append(p)
            test_set_y.append(test_y)

            # if test_y == p:
            #     Accuracy_count += 1

            # print "*predict :",predict(train_x, len(train_x)), train_y
        # Accuracy = float(Accuracy_count) / len(test_set)
        # print "  accuracy : %f" % Accuracy,
        return accuracy_score(test_set_y, y_pred)
        # print classification_report(test_set_y, y_pred)

    # train_set_rand = np.ndarray(train_set)
    train_set_rand = train_set[:]
    train_cost_sum = 0.0
    for epoch in xrange(n_epoch):
        print "== epoch : %d ==" % epoch
        if shuffle_flag:
            np.random.shuffle(train_set_rand)
            # train_set_rand = np.random.permutation(train_set)
        for i, x_y_set in enumerate(train_set_rand):
            train_y, train_x = x_y_set
            train_x = b([train_x])
            train_y = b([train_y])

            train_cost = train(train_x, len(train_x), train_y)
            train_cost_sum += train_cost
            if i % 1000 == 0 or i == len(train_set) - 1:
                print "i : (%d/%d)" % (i, len(train_set)),
                print " (cost : %f )" % train_cost

        print "  cost :", train_cost_sum
        print "  train_set : %f" % test(train_set)
        print "  dev_set   : %f" % test(dev_set)
        print "  test_set  : %f" % test(test_set)

    """
Example #8
def main(train_path, valid_path, test_path, save_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Run under the following conditions:
        1. on t-labels,
        2. on y-labels,
        3. on y-labels with correction factor alpha.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        save_path: Path to save predictions.
    """
    output_path_true = save_path.replace(WILDCARD, 'true')
    output_path_naive = save_path.replace(WILDCARD, 'naive')
    output_path_adjusted = save_path.replace(WILDCARD, 'adjusted')

    # *** START CODE HERE ***
    # Part (a): Train and test on true labels
    x_train, y_train = util.load_dataset(train_path,
                                         add_intercept=True,
                                         label_col='t')
    x_valid, y_valid = util.load_dataset(valid_path,
                                         add_intercept=True,
                                         label_col='t')
    from logreg import LogisticRegression
    clf = LogisticRegression()
    clf.fit(x_train, y_train)
    print(clf.theta)

    fig, ax = plt.subplots(1, 1, figsize=(12, 8))

    ax.scatter(x_valid[:, 1], x_valid[:, 2], c=y_valid.astype(np.int))
    ax.set_ylim(x_valid[:, 2].min(), x_valid[:, 2].max())
    plot_decision_line(clf.theta, x_valid, ax)
    plt.savefig("posonly_all_observed.png")
    plt.show()

    # Make sure to save predicted probabilities to output_path_true using np.savetxt()
    # Part (b): Train on y-labels and test on true labels
    x_train, y_train = util.load_dataset(train_path,
                                         add_intercept=True,
                                         label_col='y')
    x_valid, y_valid = util.load_dataset(valid_path,
                                         add_intercept=True,
                                         label_col='y')
    from logreg import LogisticRegression
    clf = LogisticRegression()
    clf.fit(x_train, y_train)
    print(clf.theta)

    fig, ax = plt.subplots(1, 1, figsize=(12, 8))

    ax.scatter(x_valid[:, 1], x_valid[:, 2], c=y_valid.astype(np.int))
    ax.set_ylim(x_valid[:, 2].min(), x_valid[:, 2].max())
    plot_decision_line(clf.theta, x_valid, ax)
    plt.savefig("naive_training_partial.png")
    plt.show()
    # Make sure to save predicted probabilities to output_path_naive using np.savetxt()
    # Part (f): Apply correction factor using validation set and test on true labels
    clf = LogisticRegression()
    clf.fit(x_train, y_train)

    # predictions on the validation set
    y_pred = clf.predict(x_valid)
    print(y_pred)

    fig, ax = plt.subplots(1, 1, figsize=(12, 8))

    ax.scatter(x_valid[:, 1], x_valid[:, 2], c=y_valid.astype(np.int))
    ax.set_ylim(x_valid[:, 2].min(), x_valid[:, 2].max())
    plt.show()
Example #9
#coding:utf-8

import sys
from sklearn.externals import joblib
from question71 import makeStoplist
from question72 import extractFeaturesFromString
from logreg import LogisticRegression

if __name__ == "__main__":

    vectorizer = joblib.load("tfidf.vec")
    clf = LogisticRegression("logreg")
    stoplist = makeStoplist()
    while True:
        test = input()
        test = extractFeaturesFromString(test, stoplist)
        print(["-1",
               "+1"][clf.predict(vectorizer.transform([" ".join(test)]))[0]])
        sys.stdout.flush()
Example #10
index = 27
plt.imshow(train_set_x_orig[index])
plt.show()
print ("y = " + str(train_set_y[:, index]) + ", it's a '" + classes[np.squeeze(train_set_y[:, index])].decode("utf-8") +  "' picture.")
'''

# Flatten the images
train_set_x_flatten = train_set_x_orig.reshape(train_set_x_orig.shape[0], -1).T
test_set_x_flatten = test_set_x_orig.reshape(test_set_x_orig.shape[0], -1).T
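# after reshape(m, -1).T each column is one flattened example,
# i.e. the arrays have shape (num_features, num_examples)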

# Normalise image values
train_set_x = train_set_x_flatten / 255.
test_set_x = test_set_x_flatten / 255.

# Create model instance
model = LogisticRegression()

# Fit model to the data
model.fit(train_set_x, train_set_y)

# Train the model
model.train(2400, verbose=True)

# Predict values
predictions = model.predict(test_set_x)

# Check accuracy
model.print_accuracy(predictions, test_set_y)

# Plot training loss
model.plot_cost()
def main():

    print "############# Load Datasets ##############"

    import stanfordSentimentTreebank as sst

    skip_unknown_words = bool(args.get("--skip"))
    shuffle_flag = bool(args.get("--shuffle"))
    datatype = args.get("--datatype")
    if datatype == 5:
        # Fine-grained 5-class
        n_class = 5
    elif datatype == 2:
        # Binary 2-class
        n_class = 2

    # print "skip_unknown_words",skip_unknown_words
    vocab, index2word, datasets, datasets_all_sentences, funcs = sst.load_stanfordSentimentTreebank_dataset(normalize=True, skip_unknown_words=skip_unknown_words, datatype=datatype)
    train_set, test_set, dev_set  = datasets
    train_set_sentences, test_set_sentences, dev_set_sentences = datasets_all_sentences
    get, sentence2ids, ids2sentence = funcs  # load the helper functions
    scores, sentences = zip(*train_set_sentences)
    sentences = [[word for word in sentence.lower().split()] for sentence in sentences]
    vocab_size = len(vocab)

 
    dev_unknown_count  = sum([unknown_word_count for score,(ids,unknown_word_count) in dev_set])
    test_unknown_count = sum([unknown_word_count for score,(ids,unknown_word_count) in test_set])

    train_set = [(score, ids) for score,(ids,unknown_word_count) in train_set]
    test_set  = [(score, ids) for score,(ids,unknown_word_count) in test_set]
    dev_set   = [(score, ids) for score,(ids,unknown_word_count) in dev_set]

    print "train_size : ", len(train_set)
    print "dev_size   : ", len(dev_set)
    print "test_size  : ", len(test_set)
    print "-"*30
    print "vocab_size: ", len(vocab)
    print "dev_unknown_words  : ", dev_unknown_count
    print "test_unknown_words : ", test_unknown_count



    
    print args

    # EMB_DIM = 50
    EMB_DIM = args.get("--emb_size")
    vocab_size = len(vocab)


    feat_map_n_1 = args.get("--feat_map_n_1")
    feat_map_n_final = args.get("--feat_map_n_final")

    height = 1
    width1 = args.get("--width1")
    width2 = args.get("--width2")
    k_top  = args.get("--k_top")
    n_class = n_class
    alpha   = args.get("--alpha")
    n_epoch = args.get("--n_epoch")
    dropout_rate0 = args.get("--dropout_rate0")
    dropout_rate1 = args.get("--dropout_rate1")
    dropout_rate2 = args.get("--dropout_rate2")
    activation = args.get("--activation")
    learn      = args.get("--learn")
    number_of_convolutinal_layer = 2
    use_regular = bool(args.get("--use_regular"))
    regular_c   = args.get("--regular_c")

    pretrain = args.get('--pretrain')
    if pretrain == 'word2vec':
        print "*Using word2vec"
        embeddings_W, model = pretrained_embedding.use_word2vec(sentences=sentences, index2word=index2word, emb_dim=EMB_DIM)
        # initialized in the range -0.5 to 0.5
    elif pretrain == 'glove':
        print "*Using glove"
        embeddings_W = pretrained_embedding.use_glove(sentences=sentences, index2word=index2word, emb_dim=EMB_DIM, model_file='glove_model/glove_50_iter2900.model')
    else:
        embeddings_W = np.asarray(
            rng.normal(0, 0.05, size = (vocab_size, EMB_DIM)), 
            dtype = theano.config.floatX
        )
        embeddings_W[0,:] = 0

    print np.amax(embeddings_W)
    print np.amin(embeddings_W)
    # print "*embeddings"
    print embeddings_W
    # print bool(embeddings)

    # input_x = [1, 3, 4, 5, 0, 22, 4, 5]

    print "############# Model Setting ##############"    
    x = T.imatrix('x')
    length_x = T.iscalar('length_x')
    y = T.ivector('y') # the sentence sentiment label
    embeddings = WordEmbeddingLayer(rng=rng, 
                            input=x,
                            vocab_size=vocab_size, embed_dm=EMB_DIM, embeddings=embeddings_W)


    def dropout(X, p=0.5):
        if p > 0:
            retain_prob = 1 - p
            X *= srng.binomial(X.shape, p=retain_prob, dtype=theano.config.floatX)
            # X /= retain_prob
        return X
    # number_of_convolutinal_layer = theano.shared(number_of_convolutinal_layer)
    # dynamic_func = theano.function(inputs=[length_x], outputs=number_of_convolutinal_layer * length_x)

    # dynamic_func_test = theano.function(
    #     inputs = [length_x],
    #     outputs = dynamic_func(length_x),
    #     )
    # print dynamic_func(len([1,2,3]))

    l1 = DynamicConvFoldingPoolLayer(rng, 
                              input = dropout(embeddings.output, p=dropout_rate0), 
                              filter_shape = (feat_map_n_1, 1, height, width1),  # two feature map, height: 1, width: 2, 
                              k_top = k_top,
                              number_of_convolutinal_layer=number_of_convolutinal_layer,
                              index_of_convolitonal_layer=1,
                              length_x=length_x,
                              activation = activation
    )
    l1_no_dropout = DynamicConvFoldingPoolLayer(rng, 
                              input = embeddings.output,
                              W=l1.W * (1 - dropout_rate0),
                              b=l1.b,
                              filter_shape = (feat_map_n_1, 1, height, width1),  # two feature map, height: 1, width: 2, 
                              k_top = k_top,
                              number_of_convolutinal_layer=number_of_convolutinal_layer,
                              index_of_convolitonal_layer=1,
                              length_x=length_x,
                              activation = activation
    )


    l2 = DynamicConvFoldingPoolLayer(rng, 
                              input = dropout(l1.output, p=dropout_rate1), 
                              filter_shape = (feat_map_n_final, feat_map_n_1, height, width2),
                              # two feature map, height: 1, width: 2, 
                              k_top = k_top,
                              number_of_convolutinal_layer=number_of_convolutinal_layer,
                              index_of_convolitonal_layer=2,
                              length_x=length_x,
                              activation = activation
    )
    l2_no_dropout = DynamicConvFoldingPoolLayer(rng, 
                              input = l1_no_dropout.output,
                              W=l2.W * (1 - dropout_rate1),
                              b=l2.b,
                              filter_shape = (feat_map_n_final, feat_map_n_1, height, width2),
                              # (n feature maps, n input maps, filter height, filter width)
                              k_top = k_top,
                              number_of_convolutinal_layer=number_of_convolutinal_layer,
                              index_of_convolitonal_layer=2,
                              length_x=length_x,
                              activation = activation
    )


    # l2_output = theano.function(
    #     inputs = [x,length_x],
    #     outputs = l2.output,
    #     # on_unused_input='ignore'
    # ) 

    # TODO:
    # check the dimension
    # input: 1 x 1 x 6 x 4
    # out = l2_output(
    #     np.array([input_x], dtype = np.int32),
    #     len(input_x),
    # )


    # test = theano.function(
    #     inputs = [x],
    #     outputs = embeddings.output,
    # ) 


    # print "--input--"
    # print np.array([input_x], dtype = np.int32).shape
    # print "--input embeddings--"
    # a = np.array([input_x], dtype = np.int32)
    # print test(a).shape
    # print "-- output --"
    # print out
    # print out.shape



    # x = T.dscalar("x")
    # b = T.dscalar("b")
    # a = 1
    # f = theano.function(inputs=[x,b], outputs=b * x + a)
    # print f(2,2)


    # expected = (1, feat_map_n, EMB_DIM / 2, k)
    # assert out.shape == expected, "%r != %r" %(out.shape, expected)

    ##### Test Part Three ###############
    # LogisticRegressionLayer
    #################################

    # print "############# LogisticRegressionLayer ##############"

    l_final = LogisticRegression(
        rng, 
        input = dropout(l2.output.flatten(2), p=dropout_rate2),
        n_in = feat_map_n_final * k_top * EMB_DIM,
        # n_in = feat_map_n * k * EMB_DIM / 2, # we fold once, so divide by 2
        n_out = n_class, # five sentiment levels
    )
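    # n_in above: feat_map_n_final feature maps, each EMB_DIM rows wide and
    # k_top columns after the final k-max pooling (this assumes l2 does not
    # fold the embedding dimension).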

    l_final_no_dropout = LogisticRegression(
        rng, 
        input = l2_no_dropout.output.flatten(2),
        W = l_final.W * (1 - dropout_rate2),
        b = l_final.b,
        n_in = feat_map_n_final * k_top * EMB_DIM,
        # n_in = feat_map_n * k * EMB_DIM / 2, # we fold once, so divide by 2
        n_out = n_class, # five sentiment levels
    )


    print "n_in : ", feat_map_n_final * k_top * EMB_DIM
    # print "n_in = %d" %(2 * 2 * math.ceil(EMB_DIM / 2.))


    # p_y_given_x = theano.function(
    #     inputs = [x, length_x],
    #     outputs = l_final.p_y_given_x,
    #     allow_input_downcast=True,
    #     # mode = "DebugMode"
    # )

    # print "p_y_given_x = "
    # print p_y_given_x(
    #     np.array([input_x], dtype=np.int32),
    #     len(input_x)
    # )

    cost = theano.function(
        inputs = [x, length_x, y],
        outputs = l_final.nnl(y),
        allow_input_downcast=True,
        # mode = "DebugMode"
    )

    # print "cost:\n", cost(
    #     np.array([input_x], dtype = np.int32), 
    #     len(input_x),
    #     np.array([1], dtype = np.int32)
    # )

    
    print "############# Learning ##############"

    from sgd import sgd, rmsprop, adagrad, adadelta, adam
    from regularizer import regularize_l2

    layers = []
    layers.append(embeddings)
    layers.append(l1)
    layers.append(l2)
    layers.append(l_final)


    cost = l_final.nnl(y)
    params = [p for layer in layers for p in layer.params]
    param_shapes = [l.param_shapes for l in layers]
    param_grads = [T.grad(cost, param) for param in params]

    # regularizer setting
    regularizers = {}
    regularizers['c'] = regular_c # 2.0, 4.0, 15.0
    regularizers['func'] = [None for _ in range(len(params))]
    if use_regular:
        regularizers_func = []
        regularizers_func.append([regularize_l2(l=0.0001)]) # [embeddings]
        regularizers_func.append([regularize_l2(l=0.00003), None]) # [W, b]
        regularizers_func.append([regularize_l2(l=0.000003), None]) # [W, b]
        regularizers_func.append([regularize_l2(l=0.0001), None]) # [logreg_W, logreg_b]
        regularizers_func = [r_func for r in regularizers_func for r_func in r]
        regularizers['func'] = regularizers_func
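        # regularizers['func'] is aligned with `params`: one entry per
        # parameter, where None means that parameter is not regularized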

    # if third conv layer: 1e-5
    
    print embeddings.params
    print l1.params
    print l2.params
    print l_final.params




    # updates = sgd(cost, l_final.params)
    # RegE = 1e-4
    # print param_grads
    if learn == "sgd":
        updates = sgd(cost, params, lr=0.05)
    elif learn == "adam":
        updates = adam(loss_or_grads=cost, params=params, learning_rate=alpha, regularizers=regularizers)
    elif learn == "adagrad":
        updates = adagrad(loss_or_grads=cost, params=params, learning_rate=alpha, regularizers=regularizers)
    elif learn == "adadelta":
        updates = adadelta(loss_or_grads=cost, params=params, regularizers=regularizers)
    elif learn == "rmsprop":
        updates = rmsprop(loss_or_grads=cost, params=params, learning_rate=alpha, regularizers=regularizers)


    train = theano.function(inputs=[x, length_x, y], outputs=cost, updates=updates, allow_input_downcast=True)
    # predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)
    predict = theano.function(
        inputs = [x, length_x],
        outputs = T.argmax(l_final_no_dropout.p_y_given_x, axis=1),
        allow_input_downcast=True,
        # mode = "DebugMode"
    )
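    # prediction runs through the *_no_dropout layers, whose weights were
    # scaled by the keep probabilities above (the standard dropout test-time
    # approximation)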




    def b(x_data):
        return np.array(x_data, dtype=np.int32)


    def test(test_set):
        # print "############# TEST ##############"
        y_pred = []
        test_set_y = []
        # for train_x, train_y in zip(X_data, Y_data):
        # print test_set
        # Accuracy_count = 0
        for test_y,test_x in test_set:
            test_x = b([test_x])
            # test_x is now a (1, sentence_length) matrix, so pass the actual
            # sentence length rather than len(test_x), which would always be 1
            p = predict(test_x, test_x.shape[1])[0]
            y_pred.append(p)
            test_set_y.append(test_y)

            # if test_y == p:
            #     Accuracy_count += 1

            # print "*predict :",predict(train_x, len(train_x)), train_y 
        # Accuracy = float(Accuracy_count) / len(test_set)
        # print "  accuracy : %f" % Accuracy, 
        return accuracy_score(test_set_y, y_pred)
        # print classification_report(test_set_y, y_pred)

    # train_set_rand = np.ndarray(train_set)
    train_set_rand = train_set[:]
    for epoch in xrange(n_epoch):
        print "== epoch : %d =="  % epoch
        train_cost_sum = 0.0  # reset the accumulated training cost each epoch
        if shuffle_flag:
            np.random.shuffle(train_set_rand)
            # train_set_rand = np.random.permutation(train_set)
        for i,x_y_set in enumerate(train_set_rand):
            train_y, train_x = x_y_set
            train_x = b([train_x])
            train_y = b([train_y])

            # train_x is a (1, sentence_length) matrix; pass the sentence length
            train_cost = train(train_x, train_x.shape[1], train_y)
            train_cost_sum += train_cost
            if i % 1000 == 0 or i == len(train_set)-1:
                print "i : (%d/%d)" % (i, len(train_set)) , 
                print " (cost : %f )" % train_cost
        
        print '  cost :', train_cost_sum
        print '  train_set : %f' % test(train_set)
        print '  dev_set   : %f' % test(dev_set)
        print '  test_set  : %f' % test(test_set)





Beispiel #12
0
#coding:utf-8

import numpy as np
from scipy import io
from sklearn.externals import joblib
from sklearn.metrics import precision_score, recall_score
from logreg import LogisticRegression
import matplotlib.pyplot as plt

if __name__ == "__main__":
    X_train = io.loadmat("X_train")["X_train"]
    X_train = X_train.tocsr()  # convert the sparse matrix format (to match what TfidfVectorizer outputs)
    y_train = np.load("y_train.npy")
    clf = LogisticRegression("logreg")

    # plot how precision and recall change as the threshold varies
    threshold_list = [i * 0.05 for i in range(20)]
    precision_list = []
    recall_list = []
    for threshold in threshold_list:
        y_predict = clf.predict(X_train, threshold)
        precision_list.append(precision_score(y_train, y_predict))
        recall_list.append(recall_score(y_train, y_predict))

    plt.plot(threshold_list, precision_list, label="precision", color="red")
    plt.plot(threshold_list, recall_list, label="recall", color="blue")

    plt.xlabel("threshold")
    plt.ylabel("rate")
    plt.xlim(0.0, 1.0)
    plt.ylim(0, 1)
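    # assumed finishing steps for this example: draw the legend and show the plot
    plt.legend(loc="best")
    plt.show()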
Beispiel #13
0
import time

import numpy as np
from scipy import io
from sklearn.externals import joblib
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from logreg import LogisticRegression

if __name__ == "__main__":
    X_train = io.loadmat("X_train")["X_train"]
    X_train = X_train.tocsr()  # convert the sparse matrix format (to match what TfidfVectorizer outputs)
    y_train = np.load("y_train.npy")
    kf = KFold(n_splits=5)

    start = time.time()
    for (i, (train, test)) in enumerate(kf.split(X_train), start=1):
        clf = LogisticRegression()
        clf.fit(X_train[train], y_train[train])
        y_predict = clf.predict(X_train[test])
        y_test = y_train[test]
        print("Fold %d" % i)
        print("正解率: %f" % accuracy_score(y_test, y_predict))
        print("適合率: %f" % precision_score(y_test, y_predict))
        print("再現率: %f" % recall_score(y_test, y_predict))
        print("F1スコア: %f" % f1_score(y_test, y_predict))
        print("")
    elapsed_time = time.time() - start
    print(str(elapsed_time) + "[sec]")
Beispiel #14
0
 filename = 'data/data2.dat'
 data = loadtxt(filename, delimiter=',')
 X = data[:, 0:2]
 y = np.array([data[:, 2]]).T
 n,d = X.shape
 
 # Standardize the data
 mean = X.mean(axis=0)
 std = X.std(axis=0)
 X = (X - mean) / std
 
 # map features into a higher dimensional feature space
 X = mapFeature(X[:,0],X[:,1])
 
 # train logistic regression
 logregModel = LogisticRegression()
 logregModel.fit(X,y)
 
 
 # reload the data for 2D plotting purposes
 data = loadtxt(filename, delimiter=',')
 PX = data[:, 0:2]
 y = data[:, 2]
 
 # Standardize the data
 mean = PX.mean(axis=0)
 std = PX.std(axis=0)
 PX = (PX - mean) / std
 
 # Plot the decision boundary
 h = .02  # step size in the mesh
Beispiel #15
0
    def __init__(self, rng, input, n_in, n_hidden, n_out):
        """Initialize the parameters for the multilayer perceptron

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type input: theano.tensor.TensorType
        :param input: symbolic variable that describes the input of the
        architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
        which the datapoints lie

        :type n_hidden: int
        :param n_hidden: number of hidden units

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
        which the labels lie

        """

        # Since we are dealing with a one hidden layer MLP, this will translate
        # into a HiddenLayer with a tanh activation function connected to the
        # LogisticRegression layer; the activation function can be replaced by
        # sigmoid or any other nonlinear function
        self.hiddenLayer = HiddenLayer(rng=rng,
                                       input=input,
                                       n_in=n_in,
                                       n_out=n_hidden,
                                       activation=T.tanh)

        # The logistic regression layer gets as input the hidden units
        # of the hidden layer
        self.logRegressionLayer = LogisticRegression(
            input=self.hiddenLayer.output, n_in=n_hidden, n_out=n_out)
        # end-snippet-2 start-snippet-3
        # L1 norm ; one regularization option is to enforce L1 norm to
        # be small
        self.L1 = (abs(self.hiddenLayer.W).sum() +
                   abs(self.logRegressionLayer.W).sum())

        # square of L2 norm ; one regularization option is to enforce
        # square of L2 norm to be small
        self.L2_sqr = ((self.hiddenLayer.W**2).sum() +
                       (self.logRegressionLayer.W**2).sum())

        # negative log likelihood of the MLP is given by the negative
        # log likelihood of the output of the model, computed in the
        # logistic regression layer
        self.negative_log_likelihood = (
            self.logRegressionLayer.negative_log_likelihood)
        # same holds for the function computing the number of errors
        self.errors = self.logRegressionLayer.errors

        # the parameters of the model are the parameters of the two layers it is
        # made out of
        self.params = self.hiddenLayer.params + self.logRegressionLayer.params
        # end-snippet-3

        # keep track of model input
        self.input = input
Beispiel #16
0
def evaluate_lenet5(learning_rate=0.1, n_epochs=200,
                    dataset='mnist.pkl.gz',
                    nkerns=[20, 50], batch_size=500):
    """ Demonstrates lenet on MNIST dataset

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path to the dataset used for training /testing (MNIST here)

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer
    """

    rng = numpy.random.RandomState(23455)

    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # start-snippet-1
    x = T.matrix('x')   # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
                        # [int] labels

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size, 28 * 28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    # (28, 28) is the size of MNIST images.
    layer0_input = x.reshape((batch_size, 1, 28, 28))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24)
    # maxpooling reduces this further to (24/2, 24/2) = (12, 12)
    # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12)
    layer0 = LeNetConvPoolLayer(
        rng,
        input=layer0_input,
        image_shape=(batch_size, 1, 28, 28),
        filter_shape=(nkerns[0], 1, 5, 5),
        poolsize=(2, 2)
    )

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8)
    # maxpooling reduces this further to (8/2, 8/2) = (4, 4)
    # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4)
    layer1 = LeNetConvPoolLayer(
        rng,
        input=layer0.output,
        image_shape=(batch_size, nkerns[0], 12, 12),
        filter_shape=(nkerns[1], nkerns[0], 5, 5),
        poolsize=(2, 2)
    )

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4),
    # or (500, 50 * 4 * 4) = (500, 800) with the default values.
    layer2_input = layer1.output.flatten(2)

    # construct a fully-connected sigmoidal layer
    layer2 = HiddenLayer(
        rng,
        input=layer2_input,
        n_in=nkerns[1] * 4 * 4,
        n_out=500,
        activation=T.tanh
    )

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: test_set_x[index * batch_size: (index + 1) * batch_size],
            y: test_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size: (index + 1) * batch_size],
            y: valid_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + layer0.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by
    # SGD. Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i], grads[i]) pairs.
    updates = [
        (param_i, param_i - learning_rate * grad_i)
        for param_i, grad_i in zip(params, grads)
    ]

    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )
    # end-snippet-1

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 10000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 100 == 0:
                print 'training @ iter = ', iter
            cost_ij = train_model(minibatch_index)

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [validate_model(i) for i
                                     in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [
                        test_model(i)
                        for i in xrange(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Beispiel #17
0
def main(train_path, valid_path, test_path, save_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Run under the following conditions:
        1. on t-labels,
        2. on y-labels,
        3. on y-labels with correction factor alpha.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        save_path: Path to save predictions.
    """
    output_path_true = save_path.replace(WILDCARD, 'true')
    output_path_naive = save_path.replace(WILDCARD, 'naive')
    output_path_adjusted = save_path.replace(WILDCARD, 'adjusted')

    # *** START CODE HERE ***

    def image_path(path):
        return path[:-3] + "png"

    # Part (a): Train and test on true labels
    # Make sure to save predicted probabilities to output_path_true using np.savetxt()
    x_train, t_train = util.load_dataset(train_path,
                                         label_col="t",
                                         add_intercept=True)
    x_test, t_test = util.load_dataset(test_path,
                                       label_col="t",
                                       add_intercept=True)
    model = LogisticRegression()
    model.fit(x_train, t_train)

    prob_test = model.predict(x_test)
    np.savetxt(output_path_true, prob_test)
    util.plot(x_test,
              t_test,
              model.theta,
              save_path=image_path(output_path_true))

    # Part (b): Train on y-labels and test on true labels
    # Make sure to save predicted probabilities to output_path_naive using np.savetxt()
    x_train, y_train = util.load_dataset(train_path,
                                         label_col="y",
                                         add_intercept=True)
    x_test, y_test = util.load_dataset(test_path,
                                       label_col="y",
                                       add_intercept=True)

    model = LogisticRegression()
    model.fit(x_train, y_train)

    prob_test = model.predict(x_test)
    np.savetxt(output_path_naive, prob_test)

    util.plot(x_test,
              t_test,
              model.theta,
              save_path=image_path(output_path_naive))
    # Part (f): Apply correction factor using validation set and test on true labels
    # Plot and use np.savetxt to save outputs to output_path_adjusted
    # Estimate alpha
    x_val, y_val = util.load_dataset(valid_path,
                                     label_col="y",
                                     add_intercept=True)
    model = LogisticRegression()
    model.fit(x_train, y_train)
    h_val = model.predict(x_val)
    alpha = np.mean(h_val[y_val == 1])  # Mean over positive y samples.
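    # Under the positive-only label assumption, alpha ~= p(y=1 | t=1), so
    # dividing the predicted p(y=1 | x) by alpha estimates p(t=1 | x).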
    # Adjustment
    py_test = model.predict(x_test)
    pt_test = py_test / alpha
    np.savetxt(output_path_adjusted, pt_test)
    # Plot
    util.plot(x_test,
              t_test,
              model.theta,
              save_path=image_path(output_path_adjusted),
              correction=alpha)
Beispiel #18
0
x = np.random.rand(3, 10)
y = np.asarray(np.random.randint(5, size=3), dtype=np.int32)

np_l = LogisticRegression(W, b)

#########################
# THEANO PART
#########################

x_symbol = theano.tensor.dmatrix('x')
y_symbol = theano.tensor.ivector('y')

th_l = TheanoLogisticRegression(rng=np.random.RandomState(1234),
                                input=x_symbol,
                                n_in=10,
                                n_out=5,
                                W=theano.shared(value=W, name="W"),
                                b=theano.shared(value=b, name="b"))

f1 = theano.function(inputs=[x_symbol, y_symbol], outputs=th_l.nnl(y_symbol))

actual = np_l.nnl(x, y)
expected = f1(x, y)

assert_matrix_eq(actual, expected, "nnl")

f2 = theano.function(inputs=[x_symbol, y_symbol],
                     outputs=th_l.errors(y_symbol))

actual = np_l.errors(x, y)
expected = f2(x, y)
Beispiel #19
0
def main(train_path, valid_path, test_path, save_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Run under the following conditions:
        1. on t-labels,
        2. on y-labels,
        3. on y-labels with correction factor alpha.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        save_path: Path to save predictions.
    """
    output_path_true = save_path.replace(WILDCARD, 'true')
    output_path_naive = save_path.replace(WILDCARD, 'naive')
    output_path_adjusted = save_path.replace(WILDCARD, 'adjusted')

    # *** START CODE HERE ***
    # Part (a): Train and test on true labels
    x_train, t_train = util.load_dataset(train_path,
                                         label_col='t',
                                         add_intercept=True)
    model = LogisticRegression()
    model.fit(x_train, t_train)
    x_test, t_test = util.load_dataset(test_path,
                                       label_col='t',
                                       add_intercept=True)
    t_pred = model.predict(x_test)
    util.plot(x_test, t_test, model.theta, '{}.png'.format(output_path_true))
    np.savetxt(output_path_true, t_pred)
    # Make sure to save predicted probabilities to output_path_true using np.savetxt()
    # Part (b): Train on y-labels and test on true labels
    x_train, y_train = util.load_dataset(train_path,
                                         label_col='y',
                                         add_intercept=True)
    model = LogisticRegression()
    model.fit(x_train, y_train)
    x_test, t_test = util.load_dataset(test_path,
                                       label_col='t',
                                       add_intercept=True)
    t_pred = model.predict(x_test)
    util.plot(x_test, t_test, model.theta, '{}.png'.format(output_path_naive))
    np.savetxt(output_path_naive, t_pred)
    # Make sure to save predicted probabilities to output_path_naive using np.savetxt()
    # Part (f): Apply correction factor using validation set and test on true labels
    x_val, y_val = util.load_dataset(valid_path,
                                     label_col='y',
                                     add_intercept=True)
    h_val = model.predict(x_val)
    alpha = np.mean(h_val[y_val == 1])
    py_test = model.predict(x_test)
    pt_test = py_test / alpha
    util.plot(x_test,
              t_test,
              model.theta,
              '{}.png'.format(output_path_adjusted),
              correction=alpha)
    np.savetxt(output_path_adjusted, pt_test)
Beispiel #20
0
    filename = 'data/data2.dat'
    data = loadtxt(filename, delimiter=',')
    X = data[:, 0:2]
    y = np.array([data[:, 2]]).T
    n, d = X.shape

    # Standardize the data
    mean = X.mean(axis=0)
    std = X.std(axis=0)
    X = (X - mean) / std

    # map features into a higher dimensional feature space
    X = mapFeature(X[:, 0], X[:, 1])

    # train logistic regression
    logregModel = LogisticRegression(regLambda=10)
    logregModel.fit(X, y)

    # reload the data for 2D plotting purposes
    data = loadtxt(filename, delimiter=',')
    PX = data[:, 0:2]
    y = data[:, 2]

    # Standardize the data
    mean = PX.mean(axis=0)
    std = PX.std(axis=0)
    PX = (PX - mean) / std

    # Plot the decision boundary
    h = .02  # step size in the mesh
    x_min, x_max = PX[:, 0].min() - .5, PX[:, 0].max() + .5
Beispiel #21
0

np_l = LogisticRegression(W, b)

#########################
# THEANO PART
#########################

x_symbol = theano.tensor.dmatrix('x')
y_symbol = theano.tensor.ivector('y')

th_l = TheanoLogisticRegression(rng = np.random.RandomState(1234), 
                                input = x_symbol, 
                                n_in = 10, 
                                n_out = 5,
                                W = theano.shared(value = W, 
                                                  name = "W"), 
                                b = theano.shared(value = b, 
                                                  name = "b")
)

f1 = theano.function(inputs = [x_symbol, y_symbol], 
                     outputs = th_l.nnl(y_symbol)
                 )

actual = np_l.nnl(x, y)
expected = f1(x, y)


assert_matrix_eq(actual, expected, "nnl")
Beispiel #22
0
if __name__ == "__main__":
    # Load Data
    filename = 'data/data1.dat'
    data = loadtxt(filename, delimiter=',')
    X = data[:, 0:2]
    y = np.array([data[:, 2]]).T
    n,d = X.shape

    # Standardize the data
    mean = X.mean(axis=0)
    std = X.std(axis=0)
    X = (X - mean) / std

    # train logistic regression
    logregModel = LogisticRegression(regLambda = 0.00000001)
    logregModel.fit(X,y)

    # Plot the decision boundary
    h = .02  # step size in the mesh
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = logregModel.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure(1, figsize=(4, 3))
    plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)

    # Plot the training points
Beispiel #23
0
#coding:utf-8

import numpy as np
from scipy import io
from sklearn.externals import joblib
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from logreg import LogisticRegression

if __name__ == "__main__":
    X_train = io.loadmat("X_train")["X_train"]
    X_train = X_train.tocsr()  # convert the sparse matrix format (to match what TfidfVectorizer outputs)
    y_train = np.load("y_train.npy")
    clf = LogisticRegression("logreg")
    y_predict = clf.predict(X_train)
    print("正解率: %f" % accuracy_score(y_train, y_predict))
    print("適合率: %f" % precision_score(y_train, y_predict))
    print("再現率: %f" % recall_score(y_train, y_predict))
    print("F1スコア: %f" % f1_score(y_train, y_predict))
Beispiel #24
0
print out
print out.shape

expected = (1, feat_map_n, EMB_DIM / 2, k)
assert out.shape == expected, "%r != %r" % (out.shape, expected)

##### Test Part Three ###############
# LogisticRegressionLayer
#################################

print "############# LogisticRegressionLayer ##############"

l3 = LogisticRegression(
    rng,
    input=l2.output.flatten(2),
    n_in=feat_map_n * k * EMB_DIM / 2,  # we fold once, so divide by 2
    n_out=5  # five sentiment level
)

print "n_in = %d" % (2 * 2 * math.ceil(EMB_DIM / 2.))

y = T.ivector('y')  # the sentence sentiment label

p_y_given_x = theano.function(inputs=[x],
                              outputs=l3.p_y_given_x,
                              mode="DebugMode")

print "p_y_given_x = "
print p_y_given_x(np.array([[1, 3, 4, 5], [0, 1, 4, 7]], dtype=np.int32))

cost = theano.function(inputs=[x, y], outputs=l3.nnl(y), mode="DebugMode")
Beispiel #25
0
def main(train_path, valid_path, test_path, save_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Run under the following conditions:
        1. on t-labels,
        2. on y-labels,
        3. on y-labels with correction factor alpha.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        save_path: Path to save predictions.
    """
    output_path_true = save_path.replace(WILDCARD, 'true')
    output_path_naive = save_path.replace(WILDCARD, 'naive')
    output_path_adjusted = save_path.replace(WILDCARD, 'adjusted')

    # *** START CODE HERE ***
    # Part (a): Train and test on true labels
    x_train, y_train = util.load_dataset(train_path,
                                         label_col='t',
                                         add_intercept=True)

    model_true = LogisticRegression()
    model_true.fit(x_train, y_train)

    x_test, y_test = util.load_dataset(test_path,
                                       label_col='t',
                                       add_intercept=True)

    util.plot(x_test, y_test, model_true.theta, 'plot_5a.png')

    # Make sure to save predicted probabilities to output_path_true using np.savetxt()
    np.savetxt(output_path_true, model_true.predict(x_test))

    # Part (b): Train on y-labels and test on true labels
    x_train, y_train = util.load_dataset(train_path,
                                         label_col='y',
                                         add_intercept=True)

    model_naive = LogisticRegression()
    model_naive.fit(x_train, y_train)

    x_test, y_test = util.load_dataset(test_path,
                                       label_col='y',
                                       add_intercept=True)
    util.plot(x_test, y_test, model_naive.theta, 'plot_5b.png')

    # Make sure to save predicted probabilities to output_path_naive using np.savetxt()
    np.savetxt(output_path_naive, model_naive.predict(x_test))

    # Part (f): Apply correction factor using validation set and test on true labels
    x_valid, y_valid = util.load_dataset(valid_path,
                                         label_col='t',
                                         add_intercept=True)

    x_index = np.where(y_valid == 1)

    # mean of the naive model's predictions over the positive examples
    alpha = np.mean(model_naive.predict(x_valid[x_index]))

    x_test, y_test = util.load_dataset(test_path,
                                       label_col='y',
                                       add_intercept=True)

    util.plot(x_test,
              y_test,
              model_naive.theta,
              'plot_5f.png',
              correction=alpha)

    np.savetxt(output_path_adjusted, model_naive.predict(x_test) * alpha)
Beispiel #26
0
    def __init__(self, x, y, vocab_size, embed_dim, label_n):
        """
        x: theano.tensor.imatrix, (minibatch size, 3)
            the tree matrix of the minibatch
            for each row, (node id, left child id, right child id)

        y: theano.tensor.ivector, (minibatch size,)
            the labels

        vocab_size: int
            vocabulary size, including both the words and phrases
        
        embed_dim: int
            the embedding dimension

        """
        assert x.ndim == 2
        assert y.ndim == 1

        parent_ids = x[:, 0]
        children_ids = x[:, 1:]

        rng = np.random.RandomState(1234)

        self.embedding = theano.shared(
            value=rng.normal(0, 0.05, (vocab_size, embed_dim)),
            name='embedding',
            borrow=True,
        )

        self.rntn_layer = RNTNLayer(rng, embed_dim)

        # Update the embedding by
        # forwarding the embedding from bottom to up
        # and getting the vector for each node in each tree

        def update_embedding(child_indices, my_index, embedding):

            assert child_indices.ndim == 1
            assert my_index.ndim == 0

            return T.switch(
                T.eq(
                    child_indices[0], -1
                ),  # NOTE: not using all() because it's non-differentiable
                embedding,  # if no child, return the word embedding
                T.set_subtensor(
                    embedding[
                        my_index],  # otherwise, compute the embedding of RNTN layer
                    self.rntn_layer.output(embedding[child_indices[0]],
                                           embedding[child_indices[1]])))

        final_embedding, updates = theano.scan(
            fn=update_embedding,
            sequences=[children_ids, parent_ids],
            outputs_info=self.
            embedding,  # we should pass the whole matrix and fill in the positions if necessary
        )

        self.update_embedding = theano.function(
            inputs=[x],
            updates=[(self.embedding,
                      T.set_subtensor(self.embedding[parent_ids],
                                      final_embedding[-1][parent_ids]))])
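        # calling self.update_embedding(x) writes the freshly computed parent
        # node vectors back into the shared embedding matrix (presumably so
        # that phrase nodes keep their vectors for later use)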

        # the logistic regression layer that predicts the label
        self.logreg_layer = LogisticRegression(
            rng,
            input=final_embedding[-1][parent_ids],
            n_in=embed_dim,
            n_out=label_n)

        cost = self.logreg_layer.nnl(y)

        params = self.logreg_layer.params + self.rntn_layer.params + [
            self.embedding
        ]
        self.params = params

        param_shapes = self.logreg_layer.param_shapes + self.rntn_layer.param_shapes + [
            (vocab_size, embed_dim)
        ]

        grads = [T.grad(cost=cost, wrt=p) for p in params]

        updates = build_adadelta_updates(params,
                                         param_shapes,
                                         grads,
                                         epsilon=0.1)

        # TODO: in this step, forward propagation is done again besides the one in `update_embedding`
        #       this extra computation should be avoided
        self.train = theano.function(inputs=[x, y], updates=updates)
Beispiel #27
0
    def __init__(self, x, y, vocab_size, embed_dim, label_n):
        """
        x: theano.tensor.imatrix, (minibatch size, 3)
            the tree matrix of the minibatch
            for each row, (node id, left child id, right child id)

        y: theano.tensor.ivector, (minibatch size,)
            the labels

        vocab_size: int
            vocabulary size, including both the words and phrases
        
        embed_dim: int
            the embedding dimension

        """
        assert x.ndim == 2
        assert y.ndim == 1
        
        parent_ids = x[:,0]
        children_ids = x[:,1:]
        
        rng = np.random.RandomState(1234)     

        self.embedding = theano.shared(
            value = rng.normal(0, 0.05, (vocab_size, embed_dim)),
            name = 'embedding',
            borrow = True,
        )        
        
        self.rntn_layer = RNTNLayer(rng, embed_dim)

        # Update the embedding by
        # forwarding the embedding from bottom to up
        # and getting the vector for each node in each tree
        
        def update_embedding(child_indices, my_index, embedding):

            assert child_indices.ndim == 1
            assert my_index.ndim == 0

            return T.switch(T.eq(child_indices[0], -1), # NOTE: not using all() because it's non-differentiable
                            embedding, # if no child, return the word embedding
                            T.set_subtensor(embedding[my_index], # otherwise, compute the embedding of RNTN layer
                                            self.rntn_layer.output(embedding[child_indices[0]], 
                                                                   embedding[child_indices[1]])
                                        )
            )
            
        final_embedding, updates = theano.scan(
            fn = update_embedding, 
            sequences = [children_ids, parent_ids],
            outputs_info = self.embedding, # we should pass the whole matrix and fill in the positions if necessary
        )
                

        self.update_embedding = theano.function(inputs = [x], 
                                                updates = [(self.embedding, 
                                                            T.set_subtensor(self.embedding[parent_ids], final_embedding[-1][parent_ids]))])

        # the logistic regression layer that predicts the label
        self.logreg_layer = LogisticRegression(rng, 
                                          input = final_embedding[-1][parent_ids], 
                                          n_in = embed_dim,
                                          n_out = label_n
        )
        
        cost = self.logreg_layer.nnl(y)

        params = self.logreg_layer.params + self.rntn_layer.params + [self.embedding]
        self.params = params

        param_shapes = self.logreg_layer.param_shapes + self.rntn_layer.param_shapes + [(vocab_size, embed_dim)]
        
        grads = [T.grad(cost = cost, wrt=p) for p in params]
        
        updates = build_adadelta_updates(params, param_shapes, grads, epsilon = 0.1)
        
        # TODO: in this step, forward propagation is done again besides the one in `update_embedding`
        #       this extra computation should be avoided
        self.train = theano.function(inputs = [x, y], 
                                     updates = updates)
Beispiel #28
0
                              fold = 1,
                              W = theano.shared(value = W, name = "W"),
                              b = theano.shared(value = b, name = "b")
)

n_in = filter_shape[0] * k * embed_dm / 2
n_out = 5
W_logreg = np.asarray(np.random.rand(n_in, n_out), 
                      dtype = theano.config.floatX)
b_logreg = np.asarray(np.random.rand(n_out),
                      dtype = theano.config.floatX)

layer3 = LogisticRegression(rng = rng, 
                            input = layer2.output.flatten(2), 
                            n_in = n_in, 
                            n_out = n_out,
                            W = theano.shared(value = W_logreg, name = "W_logreg"),
                            b = theano.shared(value = b_logreg, name = "b_logreg")
)

f1 = theano.function(inputs = [x_symbol, y_symbol], 
                     outputs = layer3.nnl(y_symbol)
)

f2 = theano.function(inputs = [x_symbol, y_symbol], 
                     outputs = layer3.errors(y_symbol)
)

f3 = theano.function(inputs = [x_symbol], 
                     outputs = layer3.p_y_given_x
)
Beispiel #29
0
#coding:utf-8

from sklearn.externals import joblib
from logreg import LogisticRegression

ENCODING = "cp1252"

if __name__ == "__main__":

    vectorizer = joblib.load("tfidf.vec")
    clf = LogisticRegression("logreg")
    terms = vectorizer.get_feature_names()
    index_list = list(range(len(terms)))
    index_list.sort(key=lambda i: clf.coef_[i])

    print("top 10")
    for i in index_list[:-11:-1]:
        print(terms[i], clf.coef_[i])

    print("")

    print("worst 10")
    for i in index_list[:10]:
        print(terms[i], clf.coef_[i])
Beispiel #30
0
    # 1) Copy the `create_dataset` function from dep_parser_fix.py into your dep_parser.py file
    # 2) Replace parse_dataset.py with the given new version
    #

    # Create parser
    p = Parser()

    # Create training dataset
    ds = p.create_dataset("en-ud-train-projective.conllu", train=True)

    # Train LR model
    if os.path.exists('model.pkl'):
        # if model exists, load from file
        print("Loading existing model...")
        lr = pickle.load(open('model.pkl', 'rb'))
    else:
        # train model using minibatch GD
        lr = LogisticRegression()
        lr.fit(*ds.to_arrays())
        pickle.dump(lr, open('model.pkl', 'wb'))

    # Create test dataset
    test_ds = p.create_dataset("en-ud-dev.conllu")
    # Copy feature maps to ensure that test datapoints are encoded in the same way
    test_ds.copy_feature_maps(ds)
    # Compute move-level accuracy
    lr.classify_datapoints(*test_ds.to_arrays())

    # Compute UAS and sentence-level accuracy
    t = TreeConstructor(p)
    t.evaluate(lr, 'en-ud-dev.conllu', ds)
Beispiel #31
0
layer2 = ConvFoldingPoolLayer(rng=rng,
                              input=layer1.output,
                              filter_shape=filter_shape,
                              k=k,
                              fold=1,
                              W=theano.shared(value=W, name="W"),
                              b=theano.shared(value=b, name="b"))

n_in = filter_shape[0] * k * embed_dm / 2
n_out = 5
W_logreg = np.asarray(np.random.rand(n_in, n_out), dtype=theano.config.floatX)
b_logreg = np.asarray(np.random.rand(n_out), dtype=theano.config.floatX)

layer3 = LogisticRegression(rng=rng,
                            input=layer2.output.flatten(2),
                            n_in=n_in,
                            n_out=n_out,
                            W=theano.shared(value=W_logreg, name="W_logreg"),
                            b=theano.shared(value=b_logreg, name="b_logreg"))

f1 = theano.function(inputs=[x_symbol, y_symbol], outputs=layer3.nnl(y_symbol))

f2 = theano.function(inputs=[x_symbol, y_symbol],
                     outputs=layer3.errors(y_symbol))

f3 = theano.function(inputs=[x_symbol], outputs=layer3.p_y_given_x)

f_el = theano.function(inputs=[x_symbol], outputs=layer1.output)

f_cl = theano.function(inputs=[x_symbol], outputs=layer2.output)

#########################
Beispiel #32
0
def main(train_path, valid_path, test_path, save_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Run under the following conditions:
        1. on t-labels,
        2. on y-labels,
        3. on y-labels with correction factor alpha.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        save_path: Path to save predictions.
    """
    output_path_true = save_path.replace(WILDCARD, 'true')
    output_path_naive = save_path.replace(WILDCARD, 'naive')
    output_path_adjusted = save_path.replace(WILDCARD, 'adjusted')

    # Part (a):
    x_train, t_train = util.load_dataset(train_path, 't', add_intercept=True)
    x_test, t_test = util.load_dataset(test_path, 't', add_intercept=True)
    clf = LogisticRegression()
    clf.fit(x_train, t_train)
    util.plot(x_test, t_test, clf.theta, 'posonly-true.jpg')
    np.savetxt(output_path_true, clf.predict(x_test))

    # Part (b):
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)
    x_test, y_test = util.load_dataset(test_path, add_intercept=True)
    x_valid, y_valid = util.load_dataset(valid_path, add_intercept=True)
    clf = LogisticRegression()
    clf.fit(x_train, y_train)
    util.plot(x_test, t_test, clf.theta, 'posonly-naive.jpg')
    np.savetxt(output_path_naive, clf.predict(x_test))

    # Part (f):
    alpha = np.mean(clf.predict(x_valid[y_valid == 1]))
    np.savetxt(output_path_adjusted, clf.predict(x_test) / alpha)
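    # shift the intercept so the plotted decision boundary corresponds to
    # p(t=1 | x) = p(y=1 | x) / alpha = 0.5 rather than p(y=1 | x) = 0.5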
    clf.theta[0] += np.log(2 / alpha - 1)
    util.plot(x_test, t_test, clf.theta, 'posonly_adjusted.jpg')
Beispiel #33
0
def evaluatePerformance(numTrials = 1000):
    '''
    Evaluate the performance of decision trees and logistic regression,
    average over 1,000 trials of 10-fold cross validation
    
    Return:
      a matrix giving the performance that will contain the following entries:
      stats[0,0] = mean accuracy of decision tree
      stats[0,1] = std deviation of decision tree accuracy
      stats[1,0] = mean accuracy of logistic regression
      stats[1,1] = std deviation of logistic regression accuracy
      
    ** Note that your implementation must follow this API**
    '''
    
       # Xtrain = X[1:101,:]  # train on first 100 instances
       #  Xtest = X[101:,:]
       #  ytrain = y[1:101,:]  # test on remaining instances
       #  ytest = y[101:,:]

    # Load Data
    filename = 'data/SPECTF.dat'
    data = np.loadtxt(filename, delimiter=',')
    X = data[:, 1:]
    y = np.array([data[:, 0]]).T
    n,d = X.shape

    # shuffle the data
    idx = np.arange(n)
    np.random.seed(13)
    # number of folds
    k = 10 
    # build the start/end offsets of each fold; for this dataset (n = 267, k = 10)
    # this gives [0, 26, 52, ..., 234, 267]
    fold_index = n/k
    index_arrayX = [i*fold_index for i in range(k)]
    index_arrayX = np.append(index_arrayX, n)
    index_arrayY = [i*fold_index for i in range(k)]
    index_arrayY = np.append(index_arrayY, n)

    stddevLogisticRegressionAccuracy = 0
    meanDecisionTreeAccuracy = 0
    meanLogisticRegressionAccuracy = 0 
    stddevDecisionTreeAccuracy = 0
    # matrices to store the learning accuracies: rows = k*numTrials cross-validation runs,
    # columns = the training-set percentages (10%, 20%, ..., 90%)
    log_learning = np.matrix(np.zeros((numTrials*k,9)))
    tree_learning = np.matrix(np.zeros((numTrials*k,9)))
    #index for learning 
    ll =0 
    #accuracy vars 
    log_a = 0
    tree_a =0

    # making decision tree object and a logistic regression object 

    clf = tree.DecisionTreeClassifier()
    lr = LogisticRegression(alpha = 0.0000001, regLambda=0.001, epsilon=0.0001, maxNumIters = 10000)

    #test_instance = 1
    #start_time = time.time()
    # ~~~~~~~~~~~main loop ~~~~~~~~~~~~~~~~~
    for i in xrange (numTrials): 
        #shuffle data after each cross validation 
        np.random.shuffle(idx)
        X = X[idx]
        y = y[idx]
        for j in xrange(k): 
          # separate test data from train data; the test fold advances to the next block on each iteration
          #print (time.time() - start_time)
          end = j+1
          Xtest = X[index_arrayX[j]:index_arrayX[end],:]
          ytest = y[index_arrayY[j]:index_arrayX[end],:]
          Xtrain = X[0:index_arrayX[j],:]
          ytrain = y[0:index_arrayY[j],:]
          Xtrain = np.append(Xtrain, X[index_arrayX[j+1]:n,:],axis =0)
          ytrain = np.append(ytrain, y[index_arrayY[j+1]:n,:],axis =0)
          size_n,size_d = Xtrain.shape
          #size of 10% blocks 
          train_percentage = size_n/10
          for l in xrange(1,10):
            # train and measure accuracy on 10%, then 20%, etc. of the training data
            clf = clf.fit(Xtrain[0:train_percentage*l,:],ytrain[0:train_percentage*l,:])
            treey_pred = clf.predict(Xtest[0:train_percentage*l,:])
            lr.fit(Xtrain[0:train_percentage*l,:], ytrain[0:train_percentage*l,:])
            logy_pred = lr.predict(Xtest[0:train_percentage*l,:]) 
            # fill in accuracies into accuracy matrix  
            log_a =  accuracy_score(ytest[0:train_percentage*l,:],logy_pred) + log_a
            tree_a = accuracy_score(ytest[0:train_percentage*l,:],treey_pred) + tree_a
            log_learning[ll,(l-1)] = log_a
            tree_learning[ll,(l-1)] = tree_a
            ll += 1
    tree_acc = 0
    log_acc = 0 
    for o in xrange(9):
      # sum the accuracies for each percentage, then divide by folds * trials * percentages
      meanDecisionTreeAccuracy = (np.sum(tree_learning[:,o])/(9*k*numTrials)) + meanDecisionTreeAccuracy
      meanLogisticRegressionAccuracy = (np.sum(log_learning[:,o])/(9*k*numTrials)) + meanLogisticRegressionAccuracy 

    #finding total mean accuracy over all percentages as well as standard deviations over (k*numTrial) trials
    meanDecisionTreeAccuracy = meanDecisionTreeAccuracy/(9)
    meanLogisticRegressionAccuracy = meanLogisticRegressionAccuracy /(9)
    stddevDecisionTreeAccuracy = np.std(tree_learning)/(k*numTrials)
    stddevLogisticRegressionAccuracy = np.std(log_learning)/(k*numTrials)


    # make certain that the return value matches the API specification
    stats = np.zeros((2,2))
    stats[0,0] = meanDecisionTreeAccuracy
    stats[0,1] = stddevDecisionTreeAccuracy
    stats[1,0] = meanLogisticRegressionAccuracy
    stats[1,1] = stddevLogisticRegressionAccuracy
    #end_time = time.time() 
    plot_log= np.array(np.zeros((9,1)))
    plot_tree =np.array(np.zeros((9,1)))
    # put the mean accuracy for each percentage block into an array
    for q in xrange(9):
      plot_log[q] = np.sum(log_learning[:,q])/(9*k*numTrials)
      plot_tree[q] = np.sum(tree_learning[:,q])/(9*k*numTrials)
    percent_array = [10,20,30,40,50,60,70,80,90]

    plt.figure(1)
    plt.clf()
    plt.title("Learning Curve")
    plt.xlabel("Percentage")
    plt.ylabel("Accuracy")
    plt.axis([0,100, .6,.8])
    plt.plot(percent_array,plot_log, 'rx', label='Logistic Regression')
    plt.hold 
    plt.plot(percent_array,plot_tree, 'bx',label ='Decision Tree')
    plt.legend(loc='lower right')
    plt.savefig('learningcurve.png')
    #plt.show()
    

    return stats
Beispiel #34
0
print out
print out.shape

expected = (1, feat_map_n, EMB_DIM / 2, k)
assert out.shape == expected, "%r != %r" % (out.shape, expected)

##### Test Part Three ###############
# LogisticRegressionLayer
#################################

print "############# LogisticRegressionLayer ##############"

l3 = LogisticRegression(
    rng,
    input=l2.output.flatten(2),
    n_in=feat_map_n * k * EMB_DIM / 2,  # we fold once, so divide by 2
    n_out=5,  # five sentiment level
)

print "n_in = %d" % (2 * 2 * math.ceil(EMB_DIM / 2.0))

y = T.ivector("y")  # the sentence sentiment label

p_y_given_x = theano.function(inputs=[x], outputs=l3.p_y_given_x, mode="DebugMode")

print "p_y_given_x = "
print p_y_given_x(np.array([[1, 3, 4, 5], [0, 1, 4, 7]], dtype=np.int32))

cost = theano.function(inputs=[x, y], outputs=l3.nnl(y), mode="DebugMode")

print "cost:\n", cost(np.array([[1, 3, 4, 5], [0, 1, 4, 7]], dtype=np.int32), np.array([1, 2], dtype=np.int32))
Beispiel #35
0
def train_lenet5(train_set_x, train_set_y, params, batch_size, 
                 learning_rate=0.01, nkerns=[20,50], test=False):
    """ 
    Trains LeNet-5 on MNIST dataset, and returns trained parameters on
    completion.

    :type train_set: list of floats
    :param train_set: training samples (x- and y-values) for training

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient) 

    :type params: list of tuples of floats
    :param params: list of tuple of parameters from Supervisor. Takes the form
                   [(W_layer0, b_layer0), (W_layer1, b_layer1), 
                    (W_layer2, b_layer2), (W_layer3, b_layer3)]

    :type batch_size: int
    :param batch_size: size of training batch

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer

    Output: tuple of LeNet-5 parameters by layer, in this format:

    ( (W_layer0, b_layer0), ..., (W_layer3, b_layer3) )
    
    """

    rng = numpy.random.RandomState(23455)

    # compute number of minibatches for training, validation and testing
    n_train_batches = 100
    #n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()    # index to a [mini]batch 
    x     = T.matrix('x')  # the data is presented as rasterized images
    y     = T.ivector('y') # the labels are presented as 1D vector of 
                           # [int] labels
    ishape = (28,28)       # this is the size of MNIST images

    print '    ... building the model'

    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    layer0_input = x.reshape((batch_size,1,28,28))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-5+1,28-5+1)=(24,24)
    # maxpooling reduces this further to (24/2,24/2) = (12,12)
    # 4D output tensor is thus of shape (batch_size,nkerns[0],12,12)
    layer0 = LeNetConvPoolLayer(rng, input=layer0_input,
                                image_shape=(batch_size,1,28,28), 
                                W_values = params[0][0],
                                b_values = params[0][1],
                                filter_shape=(nkerns[0],1,5,5), poolsize=(2,2))

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1,12-5+1)=(8,8)
    # maxpooling reduces this further to (8/2,8/2) = (4,4)
    # 4D output tensor is thus of shape (nkerns[0],nkerns[1],4,4)
    layer1 = LeNetConvPoolLayer(rng, input=layer0.output,
                                image_shape=(batch_size,nkerns[0],12,12), 
                                W_values = params[1][0],
                                b_values = params[1][1],
                                filter_shape=(nkerns[1],nkerns[0],5,5), 
                                poolsize=(2,2))

    # the TanhLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size,num_pixels) (i.e matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1]*4*4)
    layer2_input = layer1.output.flatten(2)

    # construct a fully-connected sigmoidal layer
    layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[1]*4*4, 
                         n_out=120, activation = T.tanh,
                         W_values = params[2][0],
                         b_values = params[2][1])

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer2.output, n_in=120, n_out=10,
                                W_values=params[3][0],
                                b_values=params[3][1])

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + layer0.params
    
    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by SGD
    # Since this model has many parameters, it would be tedious to manually
    # create an update rule for each model parameter. We thus create the updates
    # dictionary by automatically looping over all (params[i],grads[i])  pairs.
    updates = {}
    for param_i, grad_i in zip(params, grads):
        updates[param_i] = param_i - learning_rate * grad_i
    
    train_model = theano.function([index], cost, updates=updates,
                                  givens = {x: train_set_x, y: train_set_y},
                                  mode='FAST_RUN')

    print "    training lenet-5..."

    start_time = time.clock()
    epoch = 0 
    done_looping = False

    while (epoch < batch_size/100) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):
            iter = epoch * n_train_batches + minibatch_index
            cost_ij = train_model(minibatch_index)

    end_time = time.clock()
    print "    worker training complete."
    print "    %i samples analyzed in %.2fm" % (batch_size, 
                                                (end_time-start_time)/60.)

    return ((layer0.params[0].get_value(), layer0.params[1].get_value()),
            (layer1.params[0].get_value(), layer1.params[1].get_value()),
            (layer2.params[0].get_value(), layer2.params[1].get_value()),
            (layer3.params[0].get_value(), layer3.params[1].get_value()))