Example #1

def main(train_path, valid_path, test_path, save_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Run under the following conditions:
        1. on t-labels,
        2. on y-labels,
        3. on y-labels with correction factor alpha.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        save_path: Path to save predictions.
    """
    output_path_true = save_path.replace(WILDCARD, 'true')
    output_path_naive = save_path.replace(WILDCARD, 'naive')
    output_path_adjusted = save_path.replace(WILDCARD, 'adjusted')

    # *** START CODE HERE ***
    # Part (a): Train and test on true labels
    x_train, t_train = util.load_dataset(train_path,
                                         label_col='t',
                                         add_intercept=True)
    model = LogisticRegression()
    model.fit(x_train, t_train)
    x_test, t_test = util.load_dataset(test_path,
                                       label_col='t',
                                       add_intercept=True)
    t_pred = model.predict(x_test)
    util.plot(x_test, t_test, model.theta, '{}.png'.format(output_path_true))
    np.savetxt(output_path_true, t_pred)
    # Make sure to save predicted probabilities to output_path_true using np.savetxt()
    # Part (b): Train on y-labels and test on true labels
    x_train, y_train = util.load_dataset(train_path,
                                         label_col='y',
                                         add_intercept=True)
    model = LogisticRegression()
    model.fit(x_train, y_train)
    x_test, t_test = util.load_dataset(test_path,
                                       label_col='t',
                                       add_intercept=True)
    t_pred = model.predict(x_test)
    util.plot(x_test, t_test, model.theta, '{}.png'.format(output_path_naive))
    np.savetxt(output_path_naive, t_pred)
    # Make sure to save predicted probabilities to output_path_naive using np.savetxt()
    # Part (f): Apply correction factor using validation set and test on true labels
    x_val, y_val = util.load_dataset(valid_path,
                                     label_col='y',
                                     add_intercept=True)
    h_val = model.predict(x_val)
    alpha = np.mean(h_val[y_val == 1])
    py_test = model.predict(x_test)
    pt_test = py_test / alpha
    util.plot(x_test,
              t_test,
              model.theta,
              '{}.png'.format(output_path_adjusted),
              correction=alpha)
    np.savetxt(output_path_adjusted, pt_test)
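
For reference, the correction applied in part (f) uses the identity P(t=1 | x) = P(y=1 | x) / alpha, where alpha = E[h(x) | y=1] is estimated as the mean predicted probability over the validation examples with y = 1. A minimal self-contained sketch of that estimate and rescaling, using made-up probability arrays in place of the model.predict() outputs from the assignment code:

import numpy as np

# Hypothetical stand-ins for the model's predicted probabilities; in the
# assignment these come from model.predict() on the validation and test sets.
h_val = np.array([0.9, 0.8, 0.2, 0.7, 0.1])   # P(y=1 | x) on validation examples
y_val = np.array([1, 1, 0, 1, 0])             # observed positive-only labels

# alpha = E[h(x) | y = 1], estimated over the labeled positives
alpha = np.mean(h_val[y_val == 1])

# Corrected probability of the true label: P(t=1 | x) = P(y=1 | x) / alpha
h_test = np.array([0.45, 0.6, 0.05])
t_prob = np.clip(h_test / alpha, 0.0, 1.0)    # clip guards against values slightly above 1
print(alpha, t_prob)
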
Example #2

def main(train_path, valid_path, test_path, save_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Run under the following conditions:
        1. on t-labels,
        2. on y-labels,
        3. on y-labels with correction factor alpha.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        save_path: Path to save predictions.
    """
    output_path_true = save_path.replace(WILDCARD, 'true')
    output_path_naive = save_path.replace(WILDCARD, 'naive')
    output_path_adjusted = save_path.replace(WILDCARD, 'adjusted')

    # *** START CODE HERE ***
    # Part (a): Train and test on true labels
    x_train, y_train_t = util.load_dataset(train_path,
                                           label_col='t',
                                           add_intercept=True)
    model = LogisticRegression()

    # Fit model on true labels
    model.fit(x_train, y_train_t)

    x_val, y_val_t = util.load_dataset(valid_path,
                                       label_col='t',
                                       add_intercept=True)

    # Make sure to save predicted probabilities to output_path_true using np.savetxt()
    np.savetxt(output_path_true, model.predict(x_val))
    util.plot(x_val, y_val_t, model.theta, output_path_true[:-4])

    # Part (b): Train on y-labels and test on true labels
    _, y_train_y = util.load_dataset(train_path,
                                     label_col='y',
                                     add_intercept=True)

    model = LogisticRegression()

    # Train model on y-labels
    model.fit(x_train, y_train_y)

    # Make sure to save predicted probabilities to output_path_naive using np.savetxt()
    np.savetxt(output_path_naive, model.predict(x_val))
    util.plot(x_val, y_val_t, model.theta, output_path_naive[:-4])
Example #3

def logregA_varying_regularization(lam, regul1):
    """Average train/test accuracy over the 5 pre-built folds for a given
    regularization strength (lam) and regularization norm (regul1).

    Assumes folds_X_complete, folds_y_complete, X_test and y_complete have
    been prepared at module level.
    """
    pa_list = []
    ta_list = []

    for i in range(5):

        Log_ob = LogisticRegression(regLambda=lam, regNorm=regul1)

        Log_ob.fit(folds_X_complete[i], folds_y_complete[i])

        # predictive accuracy on the held-out part of fold i
        y_test_pred = Log_ob.predict(X_test[i])
        pa_score = accuracy_score(y_complete[i], y_test_pred)
        pa_list.append(pa_score)

        # training accuracy on the part of fold i used for fitting
        y_train_pred = Log_ob.predict(folds_X_complete[i])
        ta_score = accuracy_score(folds_y_complete[i], y_train_pred)
        ta_list.append(ta_score)

    pa = sum(pa_list) / 5
    ta = sum(ta_list) / 5

    return pa, ta, pa_list, ta_list
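
Note that logregA_varying_regularization relies on folds_X_complete, folds_y_complete, X_test and y_complete being prepared elsewhere in the script. A hedged sketch of how equivalent fold arrays could be built with scikit-learn's KFold (X and y below are placeholder data, not the original dataset):

import numpy as np
from sklearn.model_selection import KFold

# Placeholder dataset; in the original script X and y come from its own preprocessing.
rng = np.random.default_rng(0)
X = rng.normal(size=(100, 5))
y = (rng.random(100) > 0.5).astype(int)

folds_X_complete, folds_y_complete, X_test, y_complete = [], [], [], []
for train_idx, test_idx in KFold(n_splits=5, shuffle=True, random_state=0).split(X):
    folds_X_complete.append(X[train_idx])   # training portion of the fold
    folds_y_complete.append(y[train_idx])
    X_test.append(X[test_idx])              # held-out portion of the fold
    y_complete.append(y[test_idx])
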
Example #4

    def __init__(self, rng, input, n_in, n_hidden, n_hidden_2, n_out):
        
        self.hiddenLayer = HiddenLayer(
            rng=rng,
            input=input,
            n_in=n_in,
            n_out=n_hidden,
            activation=T.tanh
        )

        self.hiddenLayer2 = HiddenLayer(
            rng=rng,
            input=self.hiddenLayer.output,
            n_in=n_hidden,
            n_out=n_hidden_2,
            activation=T.tanh
        )

        self.logRegressionLayer = LogisticRegression(
            input=self.hiddenLayer2.output,
            n_in=n_hidden_2,
            n_out=n_out
        )

        # L1 norm
        self.L1 = (
            abs(self.hiddenLayer.W).sum() +
            abs(self.hiddenLayer2.W).sum() +
            abs(self.logRegressionLayer.W).sum()
        )
        print 'self.L1={}'.format(self.L1)

        # square of L2 norm
        self.L2_sqr = (
            (self.hiddenLayer.W ** 2).sum() +
            (self.hiddenLayer2.W ** 2).sum() +
            (self.logRegressionLayer.W ** 2).sum()
        )
        print 'self.L2_sqr={}'.format(self.L2_sqr)

        # Negative log likelihood
        self.negative_log_likelihood = (
            self.logRegressionLayer.negative_log_likelihood
        )
        print 'self.negative_log_likelihood={}'.format(self.negative_log_likelihood)

        self.errors = self.logRegressionLayer.errors

        self.params = (
            self.hiddenLayer.params +
            self.hiddenLayer2.params +
            self.logRegressionLayer.params
        )
        print 'self.params={}'.format(self.params)

        self.input = input
Example #5

def main(train_path, valid_path, test_path, save_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Run under the following conditions:
        1. on t-labels,
        2. on y-labels,
        3. on y-labels with correction factor alpha.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        save_path: Path to save predictions.
    """
    output_path_true = save_path.replace(WILDCARD, 'true')
    output_path_naive = save_path.replace(WILDCARD, 'naive')
    output_path_adjusted = save_path.replace(WILDCARD, 'adjusted')

    # Part (a):
    x_train, t_train = util.load_dataset(train_path, 't', add_intercept=True)
    x_test, t_test = util.load_dataset(test_path, 't', add_intercept=True)
    clf = LogisticRegression()
    clf.fit(x_train, t_train)
    util.plot(x_test, t_test, clf.theta, 'posonly-true.jpg')
    np.savetxt(output_path_true, clf.predict(x_test))

    # Part (b):
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)
    x_test, y_test = util.load_dataset(test_path, add_intercept=True)
    x_valid, y_valid = util.load_dataset(valid_path, add_intercept=True)
    clf = LogisticRegression()
    clf.fit(x_train, y_train)
    util.plot(x_test, t_test, clf.theta, 'posonly-naive.jpg')
    np.savetxt(output_path_naive, clf.predict(x_test))

    # Part (f):
    alpha = np.mean(clf.predict(x_valid[y_valid == 1]))
    np.savetxt(output_path_adjusted, clf.predict(x_test) / alpha)
    clf.theta[0] += np.log(2 / alpha - 1)
    util.plot(x_test, t_test, clf.theta, 'posonly_adjusted.jpg')
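
The intercept shift clf.theta[0] += np.log(2 / alpha - 1) in part (f) follows from solving h(x) / alpha = 1/2 for the corrected decision boundary: it sits where h(x) = alpha / 2, i.e. where theta^T x = -log(2 / alpha - 1), so adding log(2 / alpha - 1) to the intercept lets util.plot draw the corrected boundary with its usual 0.5 threshold. A small numerical check of that identity (the alpha value is arbitrary):

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

alpha = 0.4                                  # example value of the correction factor
z = -np.log(2.0 / alpha - 1.0)               # the point where sigmoid(z) = alpha / 2
print(np.isclose(sigmoid(z), alpha / 2))                          # True
print(np.isclose(sigmoid(z + np.log(2.0 / alpha - 1.0)), 0.5))    # shifted boundary sits at 0.5
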
Example #6

def learn():

    stoplist = makeStoplist()
    features = extractFeaturesFromFile(stoplist=stoplist)
    vectorizer = TfidfVectorizer(encoding=ENCODING)
    X_train = vectorizer.fit_transform(
        [" ".join(feature[1:]) for feature in features])
    y_train = np.zeros(len(features))
    for i in range(len(features)):
        if features[i][0] == "+1":
            y_train[i] = 1
    clf = LogisticRegression()
    clf.fit(X_train, y_train)

    io.savemat("X_train", {"X_train": X_train})
    np.save("y_train", y_train)
    joblib.dump(vectorizer, "tfidf.vec")
    clf.save("logreg")
Example #7

        plt.plot(costs_train[i],
                 "--",
                 color=color,
                 label="Train, lambda = {:g}".format(lmbda))
        plt.plot(costs_test[i],
                 color=color,
                 label="Test, lambda = {:g}".format(lmbda))

    plt.legend(loc="upper right")
    plt.savefig("results/cost_lmbda.pdf")
    plt.show()

if mode == "logreg":
    batch_size = 100
    n_batches = int(Xtrain.shape[0] / batch_size)
    logReg = LogisticRegression(n_batches=n_batches, allow_early_stop=False)

    etas = [1, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5]
    acc_list = []

    accuracys_train = []
    costs_train = []
    accuracys_test = []
    costs_test = []

    for eta in etas:
        a, b, c, d = logReg.fit(Xtrain,
                                ytrain,
                                eta=eta,
                                n_epochs=2000,
                                Xtest=Xtest,

Example #8

if __name__ == "__main__":
    # Load Data
    filename = 'data/data1.dat'
    data = loadtxt(filename, delimiter=',')
    X = data[:, 0:2]
    y = np.array([data[:, 2]]).T
    n, d = X.shape

    # Standardize the data
    mean = X.mean(axis=0)
    std = X.std(axis=0)
    X = (X - mean) / std

    # train logistic regression
    logregModel = LogisticRegression(regLambda=0.0001)
    logregModel.fit(X, y)

    # Plot the decision boundary
    h = .02  # step size in the mesh
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = logregModel.predict(np.c_[xx.ravel(), yy.ravel()])
    print(Z)

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure(1, figsize=(4, 3))
    plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)
Example #9

import time

import numpy as np
from scipy import io
from sklearn.externals import joblib
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from logreg import LogisticRegression

if __name__ == "__main__":
    X_train = io.loadmat("X_train")["X_train"]
    X_train = X_train.tocsr()  # convert the sparse matrix format (to match what TfidfVectorizer outputs)
    y_train = np.load("y_train.npy")
    kf = KFold(n_splits=5)

    start = time.time()
    for (i, (train, test)) in enumerate(kf.split(X_train), start=1):
        clf = LogisticRegression()
        clf.fit(X_train[train], y_train[train])
        y_predict = clf.predict(X_train[test])
        y_test = y_train[test]
        print("Fold %d" % i)
        print("正解率: %f" % accuracy_score(y_test, y_predict))
        print("適合率: %f" % precision_score(y_test, y_predict))
        print("再現率: %f" % recall_score(y_test, y_predict))
        print("F1スコア: %f" % f1_score(y_test, y_predict))
        print("")
    elapsed_time = time.time() - start
    print(str(elapsed_time) + "[sec]")
Example #10

if __name__ == '__main__':

    # Create parser
    p = Parser()

    # Create training dataset
    ds = p.create_dataset("en-ud-train-projective.conllu", train=True)
    model_file = 'model.pkl'
    # model_file = 'model_t800.pkl'
    # Train LR model

    if os.path.exists(model_file):
        # if the model exists, load it from file
        print("Loading existing model...")
        with open(model_file, 'rb') as f:
            lr = pickle.load(f)
    else:
        # train model using minibatch GD
        lr = LogisticRegression()
        lr.fit(*ds.to_arrays())
        with open(model_file, 'wb') as f:
            pickle.dump(lr, f)
    
    # Create test dataset
    test_ds = p.create_dataset("en-ud-dev.conllu")
    # Copy feature maps to ensure that test datapoints are encoded in the same way
    test_ds.copy_feature_maps(ds)
    # Compute move-level accuracy
    lr.classify_datapoints(*test_ds.to_arrays())
    
    # Compute UAS and sentence-level accuracy
    t = TreeConstructor(p)
    t.evaluate(lr, 'en-ud-dev.conllu', ds)
Example #11

def main(train_path, valid_path, test_path, save_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Run under the following conditions:
        1. on t-labels,
        2. on y-labels,
        3. on y-labels with correction factor alpha.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        save_path: Path to save predictions.
    """
    output_path_true = save_path.replace(WILDCARD, 'true')
    output_path_naive = save_path.replace(WILDCARD, 'naive')
    output_path_adjusted = save_path.replace(WILDCARD, 'adjusted')

    # *** START CODE HERE ***

    def image_path(path):
        return path[:-3] + "png"

    # Part (a): Train and test on true labels
    # Make sure to save predicted probabilities to output_path_true using np.savetxt()
    x_train, t_train = util.load_dataset(train_path,
                                         label_col="t",
                                         add_intercept=True)
    x_test, t_test = util.load_dataset(test_path,
                                       label_col="t",
                                       add_intercept=True)
    model = LogisticRegression()
    model.fit(x_train, t_train)

    prob_test = model.predict(x_test)
    np.savetxt(output_path_true, prob_test)
    util.plot(x_test,
              t_test,
              model.theta,
              save_path=image_path(output_path_true))

    # Part (b): Train on y-labels and test on true labels
    # Make sure to save predicted probabilities to output_path_naive using np.savetxt()
    x_train, y_train = util.load_dataset(train_path,
                                         label_col="y",
                                         add_intercept=True)
    x_test, y_test = util.load_dataset(test_path,
                                       label_col="y",
                                       add_intercept=True)

    model = LogisticRegression()
    model.fit(x_train, y_train)

    prob_test = model.predict(x_test)
    np.savetxt(output_path_naive, prob_test)

    util.plot(x_test,
              t_test,
              model.theta,
              save_path=image_path(output_path_naive))
    # Part (f): Apply correction factor using validation set and test on true labels
    # Plot and use np.savetxt to save outputs to output_path_adjusted
    # Estimate alpha
    x_val, y_val = util.load_dataset(valid_path,
                                     label_col="y",
                                     add_intercept=True)
    model = LogisticRegression()
    model.fit(x_train, y_train)
    h_val = model.predict(x_val)
    alpha = np.mean(h_val[y_val == 1])  # Mean over positive y samples.
    # Adjustment
    py_test = model.predict(x_test)
    pt_test = py_test / alpha
    np.savetxt(output_path_adjusted, pt_test)
    # Plot
    util.plot(x_test,
              t_test,
              model.theta,
              save_path=image_path(output_path_adjusted),
              correction=alpha)
Example #12

    def __init__(self, rng, input, n_in, n_hidden, n_out):
        """Initialize the parameters for the multilayer perceptron

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type input: theano.tensor.TensorType
        :param input: symbolic variable that describes the input of the
        architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
        which the datapoints lie

        :type n_hidden: int
        :param n_hidden: number of hidden units

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
        which the labels lie

        """

        # Since we are dealing with a one hidden layer MLP, this will translate
        # into a HiddenLayer with a tanh activation function connected to the
        # LogisticRegression layer; the activation function can be replaced by
        # sigmoid or any other nonlinear function
        self.hiddenLayer = HiddenLayer(rng=rng,
                                       input=input,
                                       n_in=n_in,
                                       n_out=n_hidden,
                                       activation=T.tanh)

        # The logistic regression layer gets as input the hidden units
        # of the hidden layer
        self.logRegressionLayer = LogisticRegression(
            input=self.hiddenLayer.output, n_in=n_hidden, n_out=n_out)
        # end-snippet-2 start-snippet-3
        # L1 norm ; one regularization option is to enforce L1 norm to
        # be small
        self.L1 = (abs(self.hiddenLayer.W).sum() +
                   abs(self.logRegressionLayer.W).sum())

        # square of L2 norm ; one regularization option is to enforce
        # square of L2 norm to be small
        self.L2_sqr = ((self.hiddenLayer.W**2).sum() +
                       (self.logRegressionLayer.W**2).sum())

        # negative log likelihood of the MLP is given by the negative
        # log likelihood of the output of the model, computed in the
        # logistic regression layer
        self.negative_log_likelihood = (
            self.logRegressionLayer.negative_log_likelihood)
        # same holds for the function computing the number of errors
        self.errors = self.logRegressionLayer.errors

        # the parameters of the model are the parameters of the two layer it is
        # made out of
        self.params = self.hiddenLayer.params + self.logRegressionLayer.params
        # end-snippet-3

        # keep track of model input
        self.input = input
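
The L1 and L2_sqr terms defined above are normally added to the negative log likelihood with small coefficients when the training cost is assembled. A minimal plain-numpy sketch of such a combined cost for binary logistic regression (not the Theano softmax layer used here; weights, data and coefficients are made up), just to show how the three terms combine:

import numpy as np

def regularized_cost(W, b, X, y, l1_reg=0.00, l2_reg=0.0001):
    """Negative log likelihood plus L1 and squared-L2 penalties on the weights."""
    p = 1.0 / (1.0 + np.exp(-(X @ W + b)))                   # predicted P(y=1 | x)
    nll = -np.mean(y * np.log(p) + (1 - y) * np.log(1 - p))  # negative log likelihood
    return nll + l1_reg * np.abs(W).sum() + l2_reg * (W ** 2).sum()

# Toy data, purely to show the call
rng = np.random.default_rng(0)
X = rng.normal(size=(20, 3))
y = (rng.random(20) > 0.5).astype(float)
print(regularized_cost(rng.normal(size=3), 0.0, X, y))
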
Example #13

index = 27
plt.imshow(train_set_x_orig[index])
plt.show()
print ("y = " + str(train_set_y[:, index]) + ", it's a '" + classes[np.squeeze(train_set_y[:, index])].decode("utf-8") +  "' picture.")
'''

# Flatten the images
train_set_x_flatten = train_set_x_orig.reshape(train_set_x_orig.shape[0], -1).T
test_set_x_flatten = test_set_x_orig.reshape(test_set_x_orig.shape[0], -1).T

# Normalise image values
train_set_x = train_set_x_flatten / 255.
test_set_x = test_set_x_flatten / 255.

# Create model instance
model = LogisticRegression()

# Fit model to the data
model.fit(train_set_x, train_set_y)

# Train the model
model.train(2400, verbose=True)

# Predict values
predictions = model.predict(test_set_x)

# Check accuracy
model.print_accuracy(predictions, test_set_y)

# Plot training loss
model.plot_cost()

Example #14

def main():

    print "############# Load Datasets ##############"

    import stanfordSentimentTreebank as sst

    skip_unknown_words = bool(args.get("--skip"))
    shuffle_flag = bool(args.get("--shuffle"))
    datatype = args.get("--datatype")
    if datatype == 5:
        # Fine-grained 5-class
        n_class = 5
    elif datatype == 2:
        # Binary 2-class
        n_class = 2

    # print "skip_unknown_words",skip_unknown_words
    vocab, index2word, datasets, datasets_all_sentences, funcs = sst.load_stanfordSentimentTreebank_dataset(normalize=True, skip_unknown_words=skip_unknown_words, datatype=datatype)
    train_set, test_set, dev_set  = datasets
    train_set_sentences, test_set_sentences, dev_set_sentences = datasets_all_sentences
    get, sentence2ids, ids2sentence = funcs  # load the helper functions
    scores, sentences = zip(*train_set_sentences)
    sentences = [[word for word in sentence.lower().split()] for sentence in sentences]
    vocab_size = len(vocab)

 
    dev_unknown_count  = sum([unknown_word_count for score,(ids,unknown_word_count) in dev_set])
    test_unknown_count = sum([unknown_word_count for score,(ids,unknown_word_count) in test_set])

    train_set = [(score, ids) for score,(ids,unknown_word_count) in train_set]
    test_set  = [(score, ids) for score,(ids,unknown_word_count) in test_set]
    dev_set   = [(score, ids) for score,(ids,unknown_word_count) in dev_set]

    print "train_size : ", len(train_set)
    print "dev_size   : ", len(dev_set)
    print "test_size  : ", len(test_set)
    print "-"*30
    print "vocab_size: ", len(vocab)
    print "dev_unknown_words  : ", dev_unknown_count
    print "test_unknown_words : ", test_unknown_count



    
    print args

    # EMB_DIM = 50
    EMB_DIM = args.get("--emb_size")
    vocab_size = len(vocab)


    feat_map_n_1 = args.get("--feat_map_n_1")
    feat_map_n_final = args.get("--feat_map_n_final")

    height = 1
    width1 = args.get("--width1")
    width2 = args.get("--width2")
    k_top  = args.get("--k_top")
    n_class = n_class
    alpha   = args.get("--alpha")
    n_epoch = args.get("--n_epoch")
    dropout_rate0 = args.get("--dropout_rate0")
    dropout_rate1 = args.get("--dropout_rate1")
    dropout_rate2 = args.get("--dropout_rate2")
    activation = args.get("--activation")
    learn      = args.get("--learn")
    number_of_convolutinal_layer = 2
    use_regular = bool(args.get("--use_regular"))
    regular_c   = args.get("--regular_c")

    pretrain = args.get('--pretrain')
    if pretrain == 'word2vec':
        print "*Using word2vec"
        embeddings_W, model = pretrained_embedding.use_word2vec(sentences=sentences, index2word=index2word, emb_dim=EMB_DIM)
        # initialized in the range -0.5 to 0.5
    elif pretrain == 'glove':
        print "*Using glove"
        embeddings_W = pretrained_embedding.use_glove(sentences=sentences, index2word=index2word, emb_dim=EMB_DIM, model_file='glove_model/glove_50_iter2900.model')
    else:
        embeddings_W = np.asarray(
            rng.normal(0, 0.05, size = (vocab_size, EMB_DIM)), 
            dtype = theano.config.floatX
        )
        embeddings_W[0,:] = 0

    print np.amax(embeddings_W)
    print np.amin(embeddings_W)
    # print "*embeddings"
    print embeddings_W
    # print bool(embeddings)

    # input_x = [1, 3, 4, 5, 0, 22, 4, 5]

    print "############# Model Setting ##############"    
    x = T.imatrix('x')
    length_x = T.iscalar('length_x')
    y = T.ivector('y') # the sentence sentiment label
    embeddings = WordEmbeddingLayer(rng=rng, 
                            input=x,
                            vocab_size=vocab_size, embed_dm=EMB_DIM, embeddings=embeddings_W)


    def dropout(X, p=0.5):
        if p > 0:
            retain_prob = 1 - p
            X *= srng.binomial(X.shape, p=retain_prob, dtype=theano.config.floatX)
            # X /= retain_prob
        return X
    # number_of_convolutinal_layer = theano.shared(number_of_convolutinal_layer)
    # dynamic_func = theano.function(inputs=[length_x], outputs=number_of_convolutinal_layer * length_x)

    # dynamic_func_test = theano.function(
    #     inputs = [length_x],
    #     outputs = dynamic_func(length_x),
    #     )
    # print dynamic_func(len([1,2,3]))

    l1 = DynamicConvFoldingPoolLayer(rng, 
                              input = dropout(embeddings.output, p=dropout_rate0), 
                              filter_shape = (feat_map_n_1, 1, height, width1),  # two feature map, height: 1, width: 2, 
                              k_top = k_top,
                              number_of_convolutinal_layer=number_of_convolutinal_layer,
                              index_of_convolitonal_layer=1,
                              length_x=length_x,
                              activation = activation
    )
    l1_no_dropout = DynamicConvFoldingPoolLayer(rng, 
                              input = embeddings.output,
                              W=l1.W * (1 - dropout_rate0),
                              b=l1.b,
                              filter_shape = (feat_map_n_1, 1, height, width1),  # two feature map, height: 1, width: 2, 
                              k_top = k_top,
                              number_of_convolutinal_layer=number_of_convolutinal_layer,
                              index_of_convolitonal_layer=1,
                              length_x=length_x,
                              activation = activation
    )


    l2 = DynamicConvFoldingPoolLayer(rng, 
                              input = dropout(l1.output, p=dropout_rate1), 
                              filter_shape = (feat_map_n_final, feat_map_n_1, height, width2),
                              # two feature map, height: 1, width: 2, 
                              k_top = k_top,
                              number_of_convolutinal_layer=number_of_convolutinal_layer,
                              index_of_convolitonal_layer=2,
                              length_x=length_x,
                              activation = activation
    )
    l2_no_dropout = DynamicConvFoldingPoolLayer(rng, 
                              input = l1_no_dropout.output,
                              W=l2.W * (1 - dropout_rate1),
                              b=l2.b,
                              filter_shape = (feat_map_n_final, feat_map_n_1, height, width2),
                              # two feature map, height: 1, width: 2, 
                              k_top = k_top,
                              number_of_convolutinal_layer=number_of_convolutinal_layer,
                              index_of_convolitonal_layer=2,
                              length_x=length_x,
                              activation = activation
    )


    # l2_output = theano.function(
    #     inputs = [x,length_x],
    #     outputs = l2.output,
    #     # on_unused_input='ignore'
    # ) 

    # TODO:
    # check the dimension
    # input: 1 x 1 x 6 x 4
    # out = l2_output(
    #     np.array([input_x], dtype = np.int32),
    #     len(input_x),
    # )


    # test = theano.function(
    #     inputs = [x],
    #     outputs = embeddings.output,
    # ) 


    # print "--input--"
    # print np.array([input_x], dtype = np.int32).shape
    # print "--input embeddings--"
    # a = np.array([input_x], dtype = np.int32)
    # print test(a).shape
    # print "-- output --"
    # print out
    # print out.shape



    # x = T.dscalar("x")
    # b = T.dscalar("b")
    # a = 1
    # f = theano.function(inputs=[x,b], outputs=b * x + a)
    # print f(2,2)


    # expected = (1, feat_map_n, EMB_DIM / 2, k)
    # assert out.shape == expected, "%r != %r" %(out.shape, expected)

    ##### Test Part Three ###############
    # LogisticRegressionLayer
    #################################

    # print "############# LogisticRegressionLayer ##############"

    l_final = LogisticRegression(
        rng, 
        input = dropout(l2.output.flatten(2), p=dropout_rate2),
        n_in = feat_map_n_final * k_top * EMB_DIM,
        # n_in = feat_map_n * k * EMB_DIM / 2, # we fold once, so divide by 2
        n_out = n_class, # five sentiment level
    )

    l_final_no_dropout = LogisticRegression(
        rng, 
        input = l2_no_dropout.output.flatten(2),
        W = l_final.W * (1 - dropout_rate2),
        b = l_final.b,
        n_in = feat_map_n_final * k_top * EMB_DIM,
        # n_in = feat_map_n * k * EMB_DIM / 2, # we fold once, so divide by 2
        n_out = n_class, # five sentiment level
    )


    print "n_in : ", feat_map_n_final * k_top * EMB_DIM
    # print "n_in = %d" %(2 * 2 * math.ceil(EMB_DIM / 2.))


    # p_y_given_x = theano.function(
    #     inputs = [x, length_x],
    #     outputs = l_final.p_y_given_x,
    #     allow_input_downcast=True,
    #     # mode = "DebugMode"
    # )

    # print "p_y_given_x = "
    # print p_y_given_x(
    #     np.array([input_x], dtype=np.int32),
    #     len(input_x)
    # )

    cost = theano.function(
        inputs = [x, length_x, y],
        outputs = l_final.nnl(y),
        allow_input_downcast=True,
        # mode = "DebugMode"
    )

    # print "cost:\n", cost(
    #     np.array([input_x], dtype = np.int32), 
    #     len(input_x),
    #     np.array([1], dtype = np.int32)
    # )

    
    print "############# Learning ##############"

    from sgd import sgd, rmsprop, adagrad, adadelta, adam
    from regularizer import regularize_l2

    layers = []
    layers.append(embeddings)
    layers.append(l1)
    layers.append(l2)
    layers.append(l_final)


    cost = l_final.nnl(y)
    params = [p for layer in layers for p in layer.params]
    param_shapes = [l.param_shapes for l in layers]
    param_grads = [T.grad(cost, param) for param in params]

    # regularizer setting
    regularizers = {}
    regularizers['c'] = regular_c # 2.0, 4.0, 15.0
    regularizers['func'] = [None for _ in range(len(params))]
    if use_regular:
        regularizers_func = []
        regularizers_func.append([regularize_l2(l=0.0001)]) # [embeddings]
        regularizers_func.append([regularize_l2(l=0.00003), None]) # [W, b]
        regularizers_func.append([regularize_l2(l=0.000003), None]) # [W, b]
        regularizers_func.append([regularize_l2(l=0.0001), None]) # [logreg_W, logreg_b]
        regularizers_func = [r_func for r in regularizers_func for r_func in r]
        regularizers['func'] = regularizers_func

    # if third conv layer: 1e-5
    
    print embeddings.params
    print l1.params
    print l2.params
    print l_final.params




    # updates = sgd(cost, l_final.params)
    # RegE = 1e-4
    # print param_grads
    if learn == "sgd":
        updates = sgd(cost, params, lr=0.05)
    elif learn == "adam":
        updates = adam(loss_or_grads=cost, params=params, learning_rate=alpha, regularizers=regularizers)
    elif learn == "adagrad":
        updates = adagrad(loss_or_grads=cost, params=params, learning_rate=alpha, regularizers=regularizers)
    elif learn == "adadelta":
        updates = adadelta(loss_or_grads=cost, params=params, regularizers=regularizers)
    elif learn == "rmsprop":
        updates = rmsprop(loss_or_grads=cost, params=params, learning_rate=alpha, regularizers=regularizers)


    train = theano.function(inputs=[x, length_x, y], outputs=cost, updates=updates, allow_input_downcast=True)
    # predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)
    predict = theano.function(
        inputs = [x, length_x],
        outputs = T.argmax(l_final_no_dropout.p_y_given_x, axis=1),
        allow_input_downcast=True,
        # mode = "DebugMode"
    )




    def b(x_data):
        return np.array(x_data, dtype=np.int32)


    def test(test_set):
        # print "############# TEST ##############"
        y_pred = []
        test_set_y = []
        # for train_x, train_y in zip(X_data, Y_data):
        # print test_set
        # Accuracy_count = 0
        for test_y,test_x in test_set:
            test_x = b([test_x])
            p = predict(test_x, len(test_x))[0]
            y_pred.append(p)
            test_set_y.append(test_y)

            # if test_y == p:
            #     Accuracy_count += 1

            # print "*predict :",predict(train_x, len(train_x)), train_y 
        # Accuracy = float(Accuracy_count) / len(test_set)
        # print "  accuracy : %f" % Accuracy, 
        return accuracy_score(test_set_y, y_pred)
        # print classification_report(test_set_y, y_pred)

    # train_set_rand = np.ndarray(train_set)
    train_set_rand = train_set[:]
    train_cost_sum = 0.0
    for epoch in xrange(n_epoch):
        print "== epoch : %d =="  % epoch
        if shuffle_flag:
            np.random.shuffle(train_set_rand)
            # train_set_rand = np.random.permutation(train_set)
        for i,x_y_set in enumerate(train_set_rand):
            train_y, train_x = x_y_set
            train_x = b([train_x])
            train_y = b([train_y])

            train_cost = train(train_x, len(train_x) , train_y)
            train_cost_sum += train_cost
            if i % 1000 == 0 or i == len(train_set)-1:
                print "i : (%d/%d)" % (i, len(train_set)) , 
                print " (cost : %f )" % train_cost
        
        print '  cost :', train_cost_sum
        print '  train_set : %f' % test(train_set)
        print '  dev_set   : %f' % test(dev_set)
        print '  test_set  : %f' % test(test_set)





    '''
Example #15

print out
print out.shape

expected = (1, feat_map_n, EMB_DIM / 2, k)
assert out.shape == expected, "%r != %r" % (out.shape, expected)

##### Test Part Three ###############
# LogisticRegressionLayer
#################################

print "############# LogisticRegressionLayer ##############"

l3 = LogisticRegression(
    rng,
    input=l2.output.flatten(2),
    n_in=feat_map_n * k * EMB_DIM / 2,  # we fold once, so divide by 2
    n_out=5  # five sentiment level
)

print "n_in = %d" % (2 * 2 * math.ceil(EMB_DIM / 2.))

y = T.ivector('y')  # the sentence sentiment label

p_y_given_x = theano.function(inputs=[x],
                              outputs=l3.p_y_given_x,
                              mode="DebugMode")

print "p_y_given_x = "
print p_y_given_x(np.array([[1, 3, 4, 5], [0, 1, 4, 7]], dtype=np.int32))

cost = theano.function(inputs=[x, y], outputs=l3.nnl(y), mode="DebugMode")
Example #16

    def __init__(self, x, y, vocab_size, embed_dim, label_n):
        """
        x: theano.tensor.imatrix, (minibatch size, 3)
            the tree matrix of the minibatch
            for each row, (node id, left child id, right child id)

        y: theano.tensor.ivector, (minibatch size,)
            the labels

        vocab_size: int
            vocabulary size, including both the words and phrases
        
        embed_dim: int
            the embedding dimension

        """
        assert x.ndim == 2
        assert y.ndim == 1

        parent_ids = x[:, 0]
        children_ids = x[:, 1:]

        rng = np.random.RandomState(1234)

        self.embedding = theano.shared(
            value=rng.normal(0, 0.05, (vocab_size, embed_dim)),
            name='embedding',
            borrow=True,
        )

        self.rntn_layer = RNTNLayer(rng, embed_dim)

        # Update the embedding by
        # forwarding the embedding from bottom to up
        # and getting the vector for each node in each tree

        def update_embedding(child_indices, my_index, embedding):

            assert child_indices.ndim == 1
            assert my_index.ndim == 0

            return T.switch(
                T.eq(
                    child_indices[0], -1
                ),  # NOTE: not using all() because it's non-differentiable
                embedding,  # if no child, return the word embedding
                T.set_subtensor(
                    embedding[
                        my_index],  # otherwise, compute the embedding of RNTN layer
                    self.rntn_layer.output(embedding[child_indices[0]],
                                           embedding[child_indices[1]])))

        final_embedding, updates = theano.scan(
            fn=update_embedding,
            sequences=[children_ids, parent_ids],
            outputs_info=self.
            embedding,  # we should pass the whole matrix and fill in the positions if necessary
        )

        self.update_embedding = theano.function(
            inputs=[x],
            updates=[(self.embedding,
                      T.set_subtensor(self.embedding[parent_ids],
                                      final_embedding[-1][parent_ids]))])

        # the logistic regression layer that predicts the label
        self.logreg_layer = LogisticRegression(
            rng,
            input=final_embedding[-1][parent_ids],
            n_in=embed_dim,
            n_out=label_n)

        cost = self.logreg_layer.nnl(y)

        params = self.logreg_layer.params + self.rntn_layer.params + [
            self.embedding
        ]
        self.params = params

        param_shapes = self.logreg_layer.param_shapes + self.rntn_layer.param_shapes + [
            (vocab_size, embed_dim)
        ]

        grads = [T.grad(cost=cost, wrt=p) for p in params]

        updates = build_adadelta_updates(params,
                                         param_shapes,
                                         grads,
                                         epsilon=0.1)

        # TODO: in this step, forward propagation is done again besides the one in `update_embedding`
        #       this extra computation should be avoided
        self.train = theano.function(inputs=[x, y], updates=updates)

Example #17

    filename = 'data/data2.dat'
    data = loadtxt(filename, delimiter=',')
    X = data[:, 0:2]
    y = np.array([data[:, 2]]).T
    n, d = X.shape

    # Standardize the data
    mean = X.mean(axis=0)
    std = X.std(axis=0)
    X = (X - mean) / std

    # map features into a higher dimensional feature space
    X = mapFeature(X[:, 0], X[:, 1])

    # train logistic regression
    logregModel = LogisticRegression()
    logregModel.fit(X, y)

    # reload the data for 2D plotting purposes
    data = loadtxt(filename, delimiter=',')
    PX = data[:, 0:2]
    y = data[:, 2]

    # Standardize the data
    mean = PX.mean(axis=0)
    std = PX.std(axis=0)
    PX = (PX - mean) / std

    # Plot the decision boundary
    h = .02  # step size in the mesh
    x_min, x_max = PX[:, 0].min() - .5, PX[:, 0].max() + .5
Example #18

def main(train_path, validation_path, save_path):
    """Problem 2: Logistic regression for imbalanced labels.

    Run under the following conditions:
        1. naive logistic regression
        2. upsampling minority class

    Args:
        train_path: Path to CSV file containing training set.
        validation_path: Path to CSV file containing validation set.
        save_path: Path to save predictions.
    """
    output_path_naive = save_path.replace(WILDCARD, 'naive')
    output_path_upsampling = save_path.replace(WILDCARD, 'upsampling')

    # *** START CODE HERE ***
    # Part (b): Vanilla logistic regression
    # Make sure to save predicted probabilities to output_path_naive using np.savetxt()
    print("Vanilla Logistic Regression:")
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)

    x_val, y_val = util.load_dataset(validation_path, add_intercept=True)

    clf = LogisticRegression()
    clf.fit(x_train, y_train)
    y_predict = clf.predict(x_val)

    np.savetxt(output_path_naive, y_predict)
    y_predict = y_predict >= 0.5
    util.plot(x_val, y_predict, clf.theta, output_path_naive[:-4])

    accuracy = np.mean(y_predict == y_val)
    A_0 = np.sum((y_predict == 0) * (y_val == 0)) / np.sum(y_val == 0)
    A_1 = np.sum((y_predict == 1) * (y_val == 1)) / np.sum(y_val == 1)
    balanced_accuracy = 0.5 * (A_0 + A_1)
    print("Accuracy: {},\nAccuracy for class 0: {},\nAccuracy for class 1: {},"
          "\nBalanced Accuracy: {}".format(accuracy, A_0, A_1,
                                           balanced_accuracy))

    #plot the real expected outcome from the validation:
    util.plot(x_val, y_val, clf.theta, output_path_naive[:-4] + "validation")
    # Part (d): Upsampling minority class
    # Make sure to save predicted probabilities to output_path_upsampling using np.savetxt()
    # Repeat minority examples 1 / kappa times
    num_add = int(1 / kappa) - 1

    x_train = np.concatenate(
        (x_train, np.repeat(x_train[y_train == 1, :], num_add, axis=0)),
        axis=0)
    y_train = np.concatenate(
        (y_train, np.repeat(y_train[y_train == 1], num_add, axis=0)), axis=0)

    x_val, y_val = util.load_dataset(validation_path, add_intercept=True)

    clf = LogisticRegression()
    clf.fit(x_train, y_train)
    y_predict = clf.predict(x_val)

    np.savetxt(output_path_upsampling, y_predict)
    y_predict = y_predict >= 0.5
    util.plot(x_val, y_predict, clf.theta, output_path_upsampling[:-4])

    accuracy = np.mean(y_predict == y_val)
    A_0 = np.sum((y_predict == 0) * (y_val == 0)) / np.sum(y_val == 0)
    A_1 = np.sum((y_predict == 1) * (y_val == 1)) / np.sum(y_val == 1)
    balanced_accuracy = 0.5 * (A_0 + A_1)
    print("Accuracy: {},\nAccuracy for class 0: {},\nAccuracy for class 1: {},"
          "\nBalanced Accuracy: {}".format(accuracy, A_0, A_1,
                                           balanced_accuracy))
    #plot the real expected outcome from the validation:
    util.plot(x_val, y_val, clf.theta,
              output_path_upsampling[:-4] + "validation")
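
The balanced accuracy computed by hand above (the mean of the per-class accuracies A_0 and A_1) should agree with scikit-learn's balanced_accuracy_score; a quick check on made-up labels:

import numpy as np
from sklearn.metrics import balanced_accuracy_score

# Made-up labels and thresholded predictions, purely for the comparison
y_val = np.array([0, 0, 0, 1, 1, 0, 1, 0])
y_predict = np.array([0, 1, 0, 1, 0, 0, 1, 0])

A_0 = np.sum((y_predict == 0) & (y_val == 0)) / np.sum(y_val == 0)
A_1 = np.sum((y_predict == 1) & (y_val == 1)) / np.sum(y_val == 1)
print(0.5 * (A_0 + A_1), balanced_accuracy_score(y_val, y_predict))
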
Example #19

def main(train_path, valid_path, test_path, save_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Run under the following conditions:
        1. on t-labels,
        2. on y-labels,
        3. on y-labels with correction factor alpha.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        save_path: Path to save predictions.
    """
    output_path_true = save_path.replace(WILDCARD, 'true')
    output_path_naive = save_path.replace(WILDCARD, 'naive')
    output_path_adjusted = save_path.replace(WILDCARD, 'adjusted')

    # *** START CODE HERE ***
    # Part (a): Train and test on true labels
    x_train, y_train = util.load_dataset(train_path,
                                         label_col='t',
                                         add_intercept=True)

    model_true = LogisticRegression()
    model_true.fit(x_train, y_train)

    x_test, y_test = util.load_dataset(test_path,
                                       label_col='t',
                                       add_intercept=True)

    util.plot(x_test, y_test, model_true.theta, 'plot_5a.png')

    # Make sure to save predicted probabilities to output_path_true using np.savetxt()
    np.savetxt(output_path_true, model_true.predict(x_test))

    # Part (b): Train on y-labels and test on true labels
    x_train, y_train = util.load_dataset(train_path,
                                         label_col='y',
                                         add_intercept=True)

    model_naive = LogisticRegression()
    model_naive.fit(x_train, y_train)

    x_test, y_test = util.load_dataset(test_path,
                                       label_col='y',
                                       add_intercept=True)
    util.plot(x_test, y_test, model_naive.theta, 'plot_5b.png')

    # Make sure to save predicted probabilities to output_path_naive using np.savetxt()
    np.savetxt(output_path_naive, model_naive.predict(x_test))

    # Part (f): Apply correction factor using validation set and test on true labels
    x_valid, y_valid = util.load_dataset(valid_path,
                                         label_col='y',
                                         add_intercept=True)

    # alpha = E[h(x) | y = 1], estimated over the labeled (y = 1) validation examples
    alpha = np.mean(model_naive.predict(x_valid[y_valid == 1]))

    x_test, y_test = util.load_dataset(test_path,
                                       label_col='y',
                                       add_intercept=True)

    util.plot(x_test,
              y_test,
              model_naive.theta,
              'plot_5f.png',
              correction=alpha)

    np.savetxt(output_path_adjusted, model_naive.predict(x_test) / alpha)
Example #20

#coding:utf-8

from sklearn.externals import joblib
from logreg import LogisticRegression

ENCODING = "cp1252"

if __name__ == "__main__":

    vectorizer = joblib.load("tfidf.vec")
    clf = LogisticRegression("logreg")
    terms = vectorizer.get_feature_names()
    index_list = list(range(len(terms)))
    index_list.sort(key=lambda i: clf.coef_[i])

    print("top 10")
    for i in index_list[:-11:-1]:
        print(terms[i], clf.coef_[i])

    print("")

    print("worst 10")
    for i in index_list[:10]:
        print(terms[i], clf.coef_[i])
Example #21

layer2 = ConvFoldingPoolLayer(rng=rng,
                              input=layer1.output,
                              filter_shape=filter_shape,
                              k=k,
                              fold=1,
                              W=theano.shared(value=W, name="W"),
                              b=theano.shared(value=b, name="b"))

n_in = filter_shape[0] * k * embed_dm / 2
n_out = 5
W_logreg = np.asarray(np.random.rand(n_in, n_out), dtype=theano.config.floatX)
b_logreg = np.asarray(np.random.rand(n_out), dtype=theano.config.floatX)

layer3 = LogisticRegression(rng=rng,
                            input=layer2.output.flatten(2),
                            n_in=n_in,
                            n_out=n_out,
                            W=theano.shared(value=W_logreg, name="W_logreg"),
                            b=theano.shared(value=b_logreg, name="b_logreg"))

f1 = theano.function(inputs=[x_symbol, y_symbol], outputs=layer3.nnl(y_symbol))

f2 = theano.function(inputs=[x_symbol, y_symbol],
                     outputs=layer3.errors(y_symbol))

f3 = theano.function(inputs=[x_symbol], outputs=layer3.p_y_given_x)

f_el = theano.function(inputs=[x_symbol], outputs=layer1.output)

f_cl = theano.function(inputs=[x_symbol], outputs=layer2.output)

#########################
def train_and_test(args, print_config):

    assert args.conv_layer_n == len(args.filter_widths) == len(
        args.nkerns) == (len(args.L2_regs) - 2) == len(args.fold_flags) == len(
            args.ks)

    # \mod{dim, 2^{\sum fold_flags}} == 0
    assert args.embed_dm % (2**sum(args.fold_flags)) == 0

    ###################
    # get the data    #
    ###################
    datasets = load_data(args.corpus_path)

    train_set_x, train_set_y = datasets[0]
    dev_set_x, dev_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]
    word2index = datasets[3]
    index2word = datasets[4]
    pretrained_embeddings = datasets[5]

    n_train_batches = train_set_x.get_value(
        borrow=True).shape[0] / args.batch_size
    n_dev_batches = dev_set_x.get_value(
        borrow=True).shape[0] / args.dev_test_batch_size
    n_test_batches = test_set_x.get_value(
        borrow=True).shape[0] / args.dev_test_batch_size

    train_sent_len = train_set_x.get_value(borrow=True).shape[1]
    possible_labels = set(train_set_y.get_value().tolist())

    if args.use_pretrained_embedding:
        args.embed_dm = pretrained_embeddings.get_value().shape[1]

    ###################################
    # Symbolic variable definition    #
    ###################################
    x = T.imatrix('x')  # the word indices matrix
    y = T.ivector('y')  # the sentiment labels

    batch_index = T.iscalar('batch_index')

    rng = np.random.RandomState(1234)

    ###############################
    # Construction of the network #
    ###############################
    # Layer 1, the embedding layer
    layer1 = WordEmbeddingLayer(
        rng,
        input=x,
        vocab_size=len(word2index),
        embed_dm=args.embed_dm,
        embeddings=(pretrained_embeddings
                    if args.use_pretrained_embedding else None))

    dropout_layers = [layer1]
    layers = [layer1]

    for i in range(args.conv_layer_n):
        fold_flag = args.fold_flags[i]

        # for the dropout layer
        dpl = DropoutLayer(input=dropout_layers[-1].output,
                           rng=rng,
                           dropout_rate=args.dropout_rates[0])
        next_layer_dropout_input = dpl.output
        next_layer_input = layers[-1].output

        # for the conv layer
        filter_shape = (args.nkerns[i], (1 if i == 0 else args.nkerns[i - 1]),
                        1, args.filter_widths[i])

        k = args.ks[i]

        print "For conv layer(%s) %d, filter shape = %r, k = %d, dropout_rate = %f and normalized weight init: %r and fold: %d" % (
            args.conv_activation_unit, i + 2, filter_shape, k,
            args.dropout_rates[i], args.norm_w, fold_flag)

        # we have two layers adding to two paths repsectively,
        # one for training
        # the other for prediction(averaged model)

        dropout_conv_layer = ConvFoldingPoolLayer(
            rng,
            input=next_layer_dropout_input,
            filter_shape=filter_shape,
            k=k,
            norm_w=args.norm_w,
            fold=fold_flag,
            activation=args.conv_activation_unit)

        # for prediction
        # sharing weight with dropout layer
        conv_layer = ConvFoldingPoolLayer(
            rng,
            input=next_layer_input,
            filter_shape=filter_shape,
            k=k,
            activation=args.conv_activation_unit,
            fold=fold_flag,
            W=dropout_conv_layer.W *
            (1 - args.dropout_rates[i]),  # model averaging
            b=dropout_conv_layer.b)

        dropout_layers.append(dropout_conv_layer)
        layers.append(conv_layer)

    # last, the output layer
    # both dropout and without dropout
    if sum(args.fold_flags) > 0:
        n_in = args.nkerns[-1] * args.ks[-1] * args.embed_dm / (2**sum(
            args.fold_flags))
    else:
        n_in = args.nkerns[-1] * args.ks[-1] * args.embed_dm

    print "For output layer, n_in = %d, dropout_rate = %f" % (
        n_in, args.dropout_rates[-1])

    dropout_output_layer = LogisticRegression(
        rng,
        input=dropout_layers[-1].output.flatten(2),
        n_in=n_in,  # divided by 2x(how many times are folded)
        n_out=len(possible_labels)  # five sentiment level
    )

    output_layer = LogisticRegression(
        rng,
        input=layers[-1].output.flatten(2),
        n_in=n_in,
        n_out=len(possible_labels),
        W=dropout_output_layer.W *
        (1 - args.dropout_rates[-1]),  # sharing the parameters, don't forget
        b=dropout_output_layer.b)

    dropout_layers.append(dropout_output_layer)
    layers.append(output_layer)

    ###############################
    # Error and cost              #
    ###############################
    # cost and error come from different model!
    dropout_cost = dropout_output_layer.nnl(y)
    errors = output_layer.errors(y)

    def prepare_L2_sqr(param_layers, L2_regs):
        assert len(L2_regs) == len(param_layers)
        return T.sum([
            L2_reg / 2 *
            ((layer.W if hasattr(layer, "W") else layer.embeddings)**2).sum()
            for L2_reg, layer in zip(L2_regs, param_layers)
        ])

    L2_sqr = prepare_L2_sqr(dropout_layers, args.L2_regs)
    L2_sqr_no_ebd = prepare_L2_sqr(dropout_layers[1:], args.L2_regs[1:])

    if args.use_L2_reg:
        cost = dropout_cost + L2_sqr
        cost_no_ebd = dropout_cost + L2_sqr_no_ebd
    else:
        cost = dropout_cost
        cost_no_ebd = dropout_cost

    ###############################
    # Parameters to be used       #
    ###############################
    print "Delay embedding learning by %d epochs" % (
        args.embedding_learning_delay_epochs)

    print "param_layers: %r" % dropout_layers
    param_layers = dropout_layers

    ##############################
    # Parameter Update           #
    ##############################
    print "Using AdaDelta with rho = %f and epsilon = %f" % (args.rho,
                                                             args.epsilon)

    params = [param for layer in param_layers for param in layer.params]
    param_shapes = [
        param for layer in param_layers for param in layer.param_shapes
    ]

    param_grads = [T.grad(cost, param) for param in params]

    # AdaDelta parameter update
    # E[g^2]
    # initialized to zero
    egs = [
        theano.shared(value=np.zeros(param_shape, dtype=theano.config.floatX),
                      borrow=True,
                      name="Eg:" + param.name)
        for param_shape, param in zip(param_shapes, params)
    ]

    # E[\delta x^2], initialized to zero
    exs = [
        theano.shared(value=np.zeros(param_shape, dtype=theano.config.floatX),
                      borrow=True,
                      name="Ex:" + param.name)
        for param_shape, param in zip(param_shapes, params)
    ]

    new_egs = [
        args.rho * eg + (1 - args.rho) * g**2
        for eg, g in zip(egs, param_grads)
    ]

    delta_x = [
        -(T.sqrt(ex + args.epsilon) / T.sqrt(new_eg + args.epsilon)) * g
        for new_eg, ex, g in zip(new_egs, exs, param_grads)
    ]

    new_exs = [
        args.rho * ex + (1 - args.rho) * (dx**2)
        for ex, dx in zip(exs, delta_x)
    ]

    egs_updates = zip(egs, new_egs)
    exs_updates = zip(exs, new_exs)
    param_updates = [(p, p + dx)
                     for dx, g, p in zip(delta_x, param_grads, params)]

    updates = egs_updates + exs_updates + param_updates

    # updates WITHOUT embedding
    # exclude the embedding parameter
    egs_updates_no_ebd = zip(egs[1:], new_egs[1:])
    exs_updates_no_ebd = zip(exs[1:], new_exs[1:])
    param_updates_no_ebd = [
        (p, p + dx) for dx, g, p in zip(delta_x, param_grads, params)[1:]
    ]
    updates_no_emb = egs_updates_no_ebd + exs_updates_no_ebd + param_updates_no_ebd

    def make_train_func(cost, updates):
        return theano.function(
            inputs=[batch_index],
            outputs=[cost],
            updates=updates,
            givens={
                x:
                train_set_x[batch_index * args.batch_size:(batch_index + 1) *
                            args.batch_size],
                y:
                train_set_y[batch_index * args.batch_size:(batch_index + 1) *
                            args.batch_size]
            })

    train_model_no_ebd = make_train_func(cost_no_ebd, updates_no_emb)
    train_model = make_train_func(cost, updates)

    def make_error_func(x_val, y_val):
        return theano.function(
            inputs=[],
            outputs=errors,
            givens={
                x: x_val,
                y: y_val
            },
        )

    dev_error = make_error_func(dev_set_x, dev_set_y)

    test_error = make_error_func(test_set_x, test_set_y)

    #############################
    # Debugging purpose code    #
    #############################
    # : PARAMETER TUNING NOTE:
    # some demonstration of the gradient vanishing probelm

    train_data_at_index = {
        x:
        train_set_x[batch_index * args.batch_size:(batch_index + 1) *
                    args.batch_size],
    }

    train_data_at_index_with_y = {
        x:
        train_set_x[batch_index * args.batch_size:(batch_index + 1) *
                    args.batch_size],
        y:
        train_set_y[batch_index * args.batch_size:(batch_index + 1) *
                    args.batch_size]
    }

    if print_config["nnl"]:
        get_nnl = theano.function(
            inputs=[batch_index],
            outputs=dropout_cost,
            givens={
                x:
                train_set_x[batch_index * args.batch_size:(batch_index + 1) *
                            args.batch_size],
                y:
                train_set_y[batch_index * args.batch_size:(batch_index + 1) *
                            args.batch_size]
            })

    if print_config["L2_sqr"]:
        get_L2_sqr = theano.function(inputs=[], outputs=L2_sqr)

        get_L2_sqr_no_ebd = theano.function(inputs=[], outputs=L2_sqr_no_ebd)

    if print_config["grad_abs_mean"]:
        print_grads = theano.function(
            inputs=[],
            outputs=[
                theano.printing.Print(param.name)(T.mean(T.abs_(param_grad)))
                for param, param_grad in zip(params, param_grads)
            ],
            givens={
                x: train_set_x,
                y: train_set_y
            })

    activations = [l.output for l in dropout_layers[1:-1]]
    weight_grads = [T.grad(cost, l.W) for l in dropout_layers[1:-1]]

    if print_config["activation_hist"]:
        # turn into 1D array
        get_activations = theano.function(
            inputs=[batch_index],
            outputs=[val.flatten(1) for val in activations],
            givens=train_data_at_index)

    if print_config["weight_grad_hist"]:
        # turn into 1D array
        get_weight_grads = theano.function(
            inputs=[batch_index],
            outputs=[val.flatten(1) for val in weight_grads],
            givens=train_data_at_index_with_y)

    if print_config["activation_tracking"]:
        # get the mean and variance of activations for each conv layer

        get_activation_mean = theano.function(
            inputs=[batch_index],
            outputs=[T.mean(val) for val in activations],
            givens=train_data_at_index)

        get_activation_std = theano.function(
            inputs=[batch_index],
            outputs=[T.std(val) for val in activations],
            givens=train_data_at_index)

    if print_config["weight_grad_tracking"]:
        # get the mean and variance of activations for each conv layer
        get_weight_grad_mean = theano.function(
            inputs=[batch_index],
            outputs=[T.mean(g) for g in weight_grads],
            givens=train_data_at_index_with_y)

        get_weight_grad_std = theano.function(
            inputs=[batch_index],
            outputs=[T.std(g) for g in weight_grads],
            givens=train_data_at_index_with_y)

    # the training loop
    patience = args.patience  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much
                                   # is considered significant

    validation_frequency = min(n_train_batches, patience / 2)
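    # Early-stopping bookkeeping: the dev error is evaluated every
    # `validation_frequency` minibatches, and `patience` is stretched to
    # `iter * patience_increase` whenever the new dev error drops below
    # `improvement_threshold` times the previous best. Note that in this
    # excerpt `patience` and `done_looping` are maintained but never consulted
    # to break out of the loop, so training runs for the full `args.n_epochs`.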

    best_validation_loss = np.inf
    best_iter = 0

    start_time = time.clock()
    done_looping = False
    epoch = 0

    nnls = []
    L2_sqrs = []

    activation_means = [[] for i in range(args.conv_layer_n)]
    activation_stds = [[] for i in range(args.conv_layer_n)]
    weight_grad_means = [[] for i in range(args.conv_layer_n)]
    weight_grad_stds = [[] for i in range(args.conv_layer_n)]
    activation_hist_data = [[] for i in range(args.conv_layer_n)]
    weight_grad_hist_data = [[] for i in range(args.conv_layer_n)]
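    # Per-conv-layer statistics accumulated during training; they feed the
    # plotting calls in the `finally` block at the end of the loop.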

    train_errors = []
    dev_errors = []
    try:
        print "validation_frequency = %d" % validation_frequency
        while (epoch < args.n_epochs):
            epoch += 1
            print "At epoch {0}".format(epoch)

            if epoch == (args.embedding_learning_delay_epochs + 1):
                print "########################"
                print "Start training embedding"
                print "########################"

            # shuffle the training data
            train_set_x_data = train_set_x.get_value(borrow=True)
            train_set_y_data = train_set_y.get_value(borrow=True)

            permutation = np.random.permutation(
                train_set_x.get_value(borrow=True).shape[0])

            train_set_x.set_value(train_set_x_data[permutation])
            train_set_y.set_value(train_set_y_data[permutation])
            for minibatch_index in range(n_train_batches):
                if epoch >= (args.embedding_learning_delay_epochs + 1):
                    train_cost = train_model(minibatch_index)
                else:
                    train_cost = train_model_no_ebd(minibatch_index)

                iter = (epoch - 1) * n_train_batches + minibatch_index

                if (iter + 1) % validation_frequency == 0:

                    # train_error_val = np.mean([train_error(i)
                    #                            for i in range(n_train_batches)])
                    dev_error_val = dev_error()

                    # print "At epoch %d and minibatch %d. \nTrain error %.2f%%\nDev error %.2f%%\n" %(
                    #     epoch,
                    #     minibatch_index,
                    #     train_error_val * 100,
                    #     dev_error_val * 100
                    # )

                    print "At epoch %d and minibatch %d. \nDev error %.2f%%\n" % (
                        epoch, minibatch_index, dev_error_val * 100)

                    # train_errors.append(train_error_val)
                    dev_errors.append(dev_error_val)

                    if dev_error_val < best_validation_loss:
                        best_iter = iter
                        # improve patience if loss improvement is good enough
                        if dev_error_val < best_validation_loss *  \
                           improvement_threshold:
                            patience = max(patience, iter * patience_increase)

                        best_validation_loss = dev_error_val

                        test_error_val = test_error()

                        print(('     epoch %i, minibatch %i/%i, test error of'
                               ' best dev error %f %%') %
                              (epoch, minibatch_index + 1, n_train_batches,
                               test_error_val * 100.))

                        print "Dumping model to %s" % (args.model_path)
                        dump_params(params, args.model_path)

                if ((minibatch_index + 1) % 50 == 0
                        or minibatch_index == n_train_batches - 1):
                    print "%d / %d minibatches completed" % (
                        minibatch_index + 1, n_train_batches)
                    if print_config["nnl"]:
                        print "`nnl` for the past 50 minibatches is %f" % (
                            np.mean(np.array(nnls)))
                        nnls = []
                    if print_config["L2_sqr"]:
                        print "`L2_sqr`` for the past 50 minibatches is %f" % (
                            np.mean(np.array(L2_sqrs)))
                        L2_sqrs = []

                ##################
                # Plotting stuff #
                ##################
                if print_config["nnl"]:
                    nnl = get_nnl(minibatch_index)
                    # print "nll for batch %d: %f" %(minibatch_index, nnl)
                    nnls.append(nnl)

                if print_config["L2_sqr"]:
                    if epoch >= (args.embedding_learning_delay_epochs + 1):
                        L2_sqrs.append(get_L2_sqr())
                    else:
                        L2_sqrs.append(get_L2_sqr_no_ebd())

                if print_config["activation_tracking"]:
                    layer_means = get_activation_mean(minibatch_index)
                    layer_stds = get_activation_std(minibatch_index)
                    for layer_ms, layer_ss, layer_m, layer_s in zip(
                            activation_means, activation_stds, layer_means,
                            layer_stds):
                        layer_ms.append(layer_m)
                        layer_ss.append(layer_s)

                if print_config["weight_grad_tracking"]:
                    layer_means = get_weight_grad_mean(minibatch_index)
                    layer_stds = get_weight_grad_std(minibatch_index)

                    for layer_ms, layer_ss, layer_m, layer_s in zip(
                            weight_grad_means, weight_grad_stds, layer_means,
                            layer_stds):
                        layer_ms.append(layer_m)
                        layer_ss.append(layer_s)

                if print_config["activation_hist"]:
                    for layer_hist, layer_data in zip(
                            activation_hist_data,
                            get_activations(minibatch_index)):
                        layer_hist += layer_data.tolist()

                if print_config["weight_grad_hist"]:
                    for layer_hist, layer_data in zip(
                            weight_grad_hist_data,
                            get_weight_grads(minibatch_index)):
                        layer_hist += layer_data.tolist()

    except:
        import traceback
        traceback.print_exc(file=sys.stdout)
    finally:
        from plot_util import (plot_hist, plot_track, plot_error_vs_epoch, plt)

        if print_config["activation_tracking"]:
            plot_track(activation_means, activation_stds,
                       "activation_tracking")

        if print_config["weight_grad_tracking"]:
            plot_track(weight_grad_means, weight_grad_stds,
                       "weight_grad_tracking")

        if print_config["activation_hist"]:
            plot_hist(activation_hist_data, "activation_hist")

        if print_config["weight_grad_hist"]:
            plot_hist(weight_grad_hist_data, "weight_grad_hist")

        if print_config["error_vs_epoch"]:
            train_errors = [0] * len(dev_errors)
            ax = plot_error_vs_epoch(
                train_errors,
                dev_errors,
                title=('Best dev score: %f %% '
                       ' at iter %i with test error %f %%') %
                (best_validation_loss * 100., best_iter + 1,
                 test_error_val * 100.))
        if not args.task_signature:
            plt.show()
        else:
            plt.savefig("plots/" + args.task_signature + ".png")

    end_time = time.clock()

    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_error_val * 100.))

    # save the result
    with open(args.output, "a") as f:
        f.write("%s\t%f\t%f\n" %
                (args.task_signature, best_validation_loss, test_error_val))

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Beispiel #23
0
    filename = 'data/data2.dat'
    data = loadtxt(filename, delimiter=',')
    X = data[:, 0:2]
    y = np.array([data[:, 2]]).T
    n, d = X.shape

    # Standardize the data
    mean = X.mean(axis=0)
    std = X.std(axis=0)
    X = (X - mean) / std

    # map features into a higher dimensional feature space
    X = mapFeature(X[:, 0], X[:, 1])

    # train logistic regression
    logregModel = LogisticRegression(regLambda=10)
    logregModel.fit(X, y)

    # reload the data for 2D plotting purposes
    data = loadtxt(filename, delimiter=',')
    PX = data[:, 0:2]
    y = data[:, 2]

    # Standardize the data
    mean = PX.mean(axis=0)
    std = PX.std(axis=0)
    PX = (PX - mean) / std

    # Plot the decision boundary
    h = .02  # step size in the mesh
    x_min, x_max = PX[:, 0].min() - .5, PX[:, 0].max() + .5
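    # A minimal sketch of how the decision boundary could be drawn from the
    # quantities above. `mapFeature` is used exactly as earlier in this
    # snippet, but `logregModel.predict` is an assumption (only `fit` appears
    # in this excerpt), as is the matplotlib import.
    import matplotlib.pyplot as plt

    y_min, y_max = PX[:, 1].min() - .5, PX[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = logregModel.predict(mapFeature(xx.ravel(), yy.ravel()))
    Z = Z.reshape(xx.shape)

    plt.contourf(xx, yy, Z, alpha=0.3)
    plt.scatter(PX[:, 0], PX[:, 1], c=y)
    plt.show()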
Beispiel #24
0
    test_file = args.test
    if test_file:
        test = pd.read_csv(test_file)
    else:
        print("Splitting train to accommodate for the test set.")
        train, test = train_test_split(train, test_size=0.2)

    train_Y = train['labels'].values
    train_X = train.drop(['labels'], axis=1).values

    test_Y = test['labels'].values
    test_X = test.drop(['labels'], axis=1).values

    print(train_X.shape, train_Y.shape, test_X.shape, test_Y.shape)
    logreg = LogisticRegression(learning_rate=lr,
                                epochs=epochs,
                                initialiser=init,
                                verbose=verbose)
    logreg.fit(train_X, train_Y)
    predictions = logreg.predict(test_X)

    if args.output == ".":
        args.output = os.getcwd()
    with open(args.output + "/classification_report.txt", 'w') as f:
        f.write(str(classification_report(test_Y, predictions)))

    test['predictions'] = predictions
    test.to_csv(args.output + "/predictions.csv")
Beispiel #25
0
def main(train_path, valid_path, test_path, save_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Run under the following conditions:
        1. on t-labels,
        2. on y-labels,
        3. on y-labels with correction factor alpha.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        save_path: Path to save predictions.
    """
    output_path_true = save_path.replace(WILDCARD, 'true')
    output_path_naive = save_path.replace(WILDCARD, 'naive')
    output_path_adjusted = save_path.replace(WILDCARD, 'adjusted')

    # *** START CODE HERE ***
    # Part (a): Train and test on true labels
    x_train, y_train = util.load_dataset(train_path,
                                         add_intercept=True,
                                         label_col='t')
    x_valid, y_valid = util.load_dataset(valid_path,
                                         add_intercept=True,
                                         label_col='t')
    from logreg import LogisticRegression
    clf = LogisticRegression()
    clf.fit(x_train, y_train)
    print(clf.theta)

    fig, ax = plt.subplots(1, 1, figsize=(12, 8))

    ax.scatter(x_valid[:, 1], x_valid[:, 2], c=y_valid.astype(int))
    ax.set_ylim(x_valid[:, 2].min(), x_valid[:, 2].max())
    plot_decision_line(clf.theta, x_valid, ax)
    plt.savefig("posonly_all_observed.png")
    plt.show()

    # Make sure to save predicted probabilities to output_path_true using np.savetxt()
    # Part (b): Train on y-labels and test on true labels
    x_train, y_train = util.load_dataset(train_path,
                                         add_intercept=True,
                                         label_col='y')
    x_valid, y_valid = util.load_dataset(valid_path,
                                         add_intercept=True,
                                         label_col='y')
    clf = LogisticRegression()
    clf.fit(x_train, y_train)
    print(clf.theta)

    fig, ax = plt.subplots(1, 1, figsize=(12, 8))

    ax.scatter(x_valid[:, 1], x_valid[:, 2], c=y_valid.astype(int))
    ax.set_ylim(x_valid[:, 2].min(), x_valid[:, 2].max())
    plot_decision_line(clf.theta, x_valid, ax)
    plt.savefig("naive_training_partial.png")
    plt.show()
    # Make sure to save predicted probabilities to output_path_naive using np.savetxt()
    # Part (f): Apply correction factor using validation set and test on true labels
    clf = LogisticRegression()
    clf.fit(x_train, y_train)

    # predictions on the validation set
    y_pred = clf.predict(x_valid)
    print(y_pred)

    fig, ax = plt.subplots(1, 1, figsize=(12, 8))

    ax.scatter(x_valid[:, 1], x_valid[:, 2], c=y_valid.astype(int))
    ax.set_ylim(x_valid[:, 2].min(), x_valid[:, 2].max())
    plt.show()
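    # Part (f) above stops before applying the correction factor; a minimal
    # sketch of the alpha adjustment, assuming `clf.predict` returns
    # probabilities p(y=1 | x): estimate alpha as the mean predicted
    # probability over the labelled positives of the validation set, then
    # rescale the test-time predictions and save them.
    alpha = np.mean(clf.predict(x_valid)[y_valid == 1])

    x_test, t_test = util.load_dataset(test_path,
                                       add_intercept=True,
                                       label_col='t')
    t_pred_adjusted = clf.predict(x_test) / alpha
    np.savetxt(output_path_adjusted, t_pred_adjusted)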