Code Example #1
def main():
    #    xtrain, ytrain, xvalid, yvalid, xtest = readData()
    #    xtrain, ytrain, xvalid, yvalid, xtest = cleanData(xtrain, ytrain, xvalid, yvalid, xtest)
    xtrain, ytrain, xvalid, yvalid, xtest = rmp.readCleanData()
    #    xtrain = xtrain[0:30000]
    #    ytrain = ytrain[0:30000]
    # vectorize the review text into word-count features
    xtrain, xvalid, xtest, countVect = rmp.vectorizeWords(xtrain, xvalid, xtest)

    # train (or resume) logistic regression; returns test predictions and the
    # learned weights and bias
    ytest, w, b = resumeLogisticRegression(xtrain, ytrain, xvalid, yvalid, xtest, 1)
    # 1000 epochs = 30 min

    # write the predictions and learned parameters out to disk
    ytest = pd.Series(ytest, index=xtest.iloc[:, 1])
    ytest.to_csv("result.csv")
    np.savetxt("w.csv", w)
    np.savetxt("b.csv", b)
    return xtrain, ytrain, xvalid, yvalid, xtest, ytest
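
If the parameters written by main() are needed later for scoring, they can be read back with np.loadtxt. The helper below is a hypothetical sketch, not part of the project: it assumes w and b are the weight and bias arrays saved above and that predictions come from a plain linear score.

import numpy as np

def load_and_predict(xnew, w_path="w.csv", b_path="b.csv"):
    # hypothetical helper: reload the saved parameters and score an
    # already-vectorized feature matrix
    w = np.loadtxt(w_path)
    b = np.loadtxt(b_path)
    scores = xnew.dot(w) + b
    if scores.ndim == 1:
        # single-logit case: threshold the sigmoid at 0.5
        return (1.0 / (1.0 + np.exp(-scores)) > 0.5).astype(int)
    # multi-class case: pick the highest-scoring class
    return np.argmax(scores, axis=1)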
Code Example #2
File: LR.py  Project: brycepaputa/datasuckers
def loadRMPData(useCleaned=False):
    if useCleaned:
        xtrain, ytrain, xvalid, yvalid, xtest = rmp.readCleanData()
    else:
        xtrain, ytrain, xvalid, yvalid, xtest = readData()
        xtrain, ytrain, xvalid, yvalid, xtest = cleanData(xtrain, ytrain, xvalid, yvalid, xtest)
    xtrain, xvalid, xtest, countVect = rmp.vectorizeWords(xtrain, xvalid, xtest)
    xtrain = xtrain.drop(["tid", "date", "id", xtrain.columns[0]], axis=1)
    xvalid = xvalid.drop(["tid", "date", "id", xvalid.columns[0]], axis=1)
    xtest = xtest.drop(["tid", "date", "id", xtest.columns[0]], axis=1)
    ytest = np.zeros((len(xtest),), dtype=theano.config.floatX)

    def shareData(dataX, dataY, borrow=True):
        # wrap the data in Theano shared variables so minibatches can be sliced
        # on the device; labels are stored as floats (a GPU requirement) and
        # cast back to int32 symbolically, as in the Theano tutorials
        sharedX = theano.shared(np.asarray(dataX, dtype=theano.config.floatX), borrow=borrow)
        sharedY = theano.shared(np.asarray(dataY, dtype=theano.config.floatX), borrow=borrow)
        return sharedX, T.cast(sharedY, "int32")

    xtrain, ytrain = shareData(xtrain, ytrain)
    xvalid, yvalid = shareData(xvalid, yvalid)
    xtest, ytest = shareData(xtest, ytest)
    return [(xtrain, ytrain), (xvalid, yvalid), (xtest, ytest)]
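
Note that shareData returns the x's as ordinary shared variables but the y's as symbolic T.cast expressions, so the two are read back differently. A minimal usage sketch for the returned triples (the minibatch slicing itself is shown in full in Code Example #4):

datasets = loadRMPData(useCleaned=True)
(xtrain, ytrain), (xvalid, yvalid), (xtest, ytest) = datasets

# shared variables expose get_value(); the cast labels are symbolic and
# have to be evaluated instead
n_train = xtrain.get_value(borrow=True).shape[0]
first_labels = ytrain[:10].eval()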
Code Example #3
def main():
    #TODO: normalize columns by z-score! 
#    xtrain, ytrain, xvalid, yvalid, xtest = readData()
#    xtrain, ytrain, xvalid, yvalid, xtest = cleanData(xtrain, ytrain, xvalid, yvalid, xtest)
    xtrain, ytrain, xvalid, yvalid, xtest = rmp.readCleanData(keepQuality=True)
#    xtrain = xtrain[0:30000]
#    ytrain = ytrain[0:30000]
    xtrain, xvalid, xtest, countVect = rmp.vectorizeWords(xtrain, xvalid, xtest)
    xtrain = xtrain.append(xvalid, ignore_index=True)
    xtrain = xtrain.drop(['tid', 'date', 'id', xtrain.columns[0]], axis=1)
    
    # z-score every column (the TODO above) so the dot-product correlations
    # below are on a comparable scale
    xtrain = xtrain.apply(lambda x: (x-np.mean(x))/np.std(x), axis=0, raw=True)
    
    helpfulness = xtrain['helpfulness'].get_values()
    clarity = xtrain['clarity'].get_values()[:,0]
    easiness = xtrain['easiness'].get_values()
    quality = xtrain['quality'].get_values()[:,0]
    
    xtrain = xtrain.drop(['helpfulness', 'clarity', 'easiness', 'quality'], axis=1)
    n = len(xtrain.iloc[0,:])
    helpfulnessCorr = []
    
    # np.correlate in its default 'valid' mode returns a single dot product;
    # on z-scored columns each value is len(xtrain) times the Pearson r
    for i in range(n):
        helpfulnessCorr.append(np.correlate(xtrain.iloc[:,i].get_values(), helpfulness)[0])
    clarityCorr = []
    for i in range(n):
        clarityCorr.append(np.correlate(xtrain.iloc[:,i].get_values(), clarity)[0])
    easinessCorr = []
    for i in range(n):
        easinessCorr.append(np.correlate(xtrain.iloc[:,i].get_values(), easiness)[0])
    qualityCorr = []
    for i in range(n):
        qualityCorr.append(np.correlate(xtrain.iloc[:,i].get_values(), quality)[0])
    
    correlationData = pd.DataFrame(
        {'column': xtrain.columns,
         'helpfulnessCorr': helpfulnessCorr,
         'clarityCorr': clarityCorr,
         'easinessCorr': easinessCorr,
         'qualityCorr': qualityCorr},
        columns=['column', 'helpfulnessCorr', 'clarityCorr',
                 'easinessCorr', 'qualityCorr'])
    
    correlationData.to_csv('correlationData.csv')
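
A note on the numbers this writes out: because np.correlate with the default 'valid' mode returns a single dot product, each helpfulnessCorr/clarityCorr/easinessCorr/qualityCorr entry on the z-scored columns is len(xtrain) times the Pearson correlation coefficient. A small sanity-check sketch on synthetic data (not part of the project):

import numpy as np

rng = np.random.RandomState(0)
a = rng.randn(1000)
b = 0.5 * a + rng.randn(1000)

# z-score both vectors, as the script above does column-wise
az = (a - np.mean(a)) / np.std(a)
bz = (b - np.mean(b)) / np.std(b)

dot = np.correlate(az, bz)[0]       # what the loops above compute
pearson = np.corrcoef(a, b)[0, 1]   # conventional correlation coefficient
print(abs(dot / len(a) - pearson))  # ~0: the two agree up to the scale factor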
Code Example #4
File: mnist-edit.py  Project: brycepaputa/datasuckers
def sgd_optimization_rmp(learning_rate=0.13, n_epochs=1000,
                           batch_size=600):
    """
    Demonstrate stochastic gradient descent optimization of a log-linear
    model

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz

    """
    
    xtrain, ytrain, xtest = rmp.readCleanData()
#    xtrain, ytrain, xtest = rmp.cleanData(xtrain, ytrain, xtest)
    xtrain, xtest, cv = rmp.vectorizeWords(xtrain, xtest)
    xtrain = xtrain.drop(['date', 'id', xtrain.columns[0]], axis=1)
    xtest = xtest.drop(['date', 'id', xtest.columns[0]], axis=1)
    
    # shuffle the training rows, then carve off roughly 1/90 of them below
    # to use as a validation set
    index = list(xtrain.index)
    random.shuffle(index)
    xtrain = xtrain.ix[index]
    ytrain = ytrain.ix[index]
    
    # reset_index returns a new object, so assign the result (and drop the
    # old shuffled index rather than inserting it as a column)
    xtrain = xtrain.reset_index(drop=True)
    ytrain = ytrain.reset_index(drop=True)
    
    nvalid = int(len(xtrain)/90)
    
    xvalid = theano.shared(np.array(xtrain[0:nvalid].values, dtype=np.float32), borrow=True)
    xtrain = theano.shared(np.array(xtrain[nvalid:].values, dtype=np.float32), borrow=True)
    yvalid = theano.shared(np.array(ytrain[0:nvalid].values, dtype=np.int32), borrow=True)
    ytrain = theano.shared(np.array(ytrain[nvalid:].values, dtype=np.int32), borrow=True)
    # the test labels are unknown; a 1-D placeholder of zeros matches the ivector y
    ytest = theano.shared(np.zeros((len(xtest),), dtype=np.int32), borrow=True)
    xtest = theano.shared(np.array(xtest, dtype=np.float32), borrow=True)
    
    # compute number of minibatches for training, validation and testing
    n_train_batches = xtrain.get_value().shape[0] / batch_size
    n_valid_batches = xvalid.get_value().shape[0] / batch_size
    n_test_batches = xtest.get_value().shape[0] / batch_size
    
    print n_train_batches
    print n_valid_batches
    print n_test_batches
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # generate symbolic variables for input (x and y represent a
    # minibatch)
    x = T.matrix('x')  # feature matrix for a minibatch
    y = T.ivector('y')  # labels, presented as 1D vector of [int] labels

    # construct the logistic regression class
    # (n_in and n_out are still the values carried over from the MNIST
    # tutorial this file was edited from)
    classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10)

    # the cost we minimize during training is the negative log likelihood of
    # the model in symbolic format
    cost = classifier.negative_log_likelihood(y)

    # compiling a Theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: xtest[index * batch_size: (index + 1) * batch_size],
            y: ytest[index * batch_size: (index + 1) * batch_size]
        }
    )

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: xvalid[index * batch_size: (index + 1) * batch_size],
            y: yvalid[index * batch_size: (index + 1) * batch_size]
        }
    )

    # compute the gradient of cost with respect to theta = (W,b)
    g_W = T.grad(cost=cost, wrt=classifier.W)
    g_b = T.grad(cost=cost, wrt=classifier.b)

    # start-snippet-3
    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs.
    updates = [(classifier.W, classifier.W - learning_rate * g_W),
               (classifier.b, classifier.b - learning_rate * g_b)]

    # compiling a Theano function `train_model` that returns the cost and, at
    # the same time, updates the parameters of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: xtrain[index * batch_size: (index + 1) * batch_size],
            y: ytrain[index * batch_size: (index + 1) * batch_size]
        }
    )
    # end-snippet-3

    ###############
    # TRAIN MODEL #
    ###############
    print '... training the model'
    # early-stopping parameters
    patience = 5000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                                  # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                  # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_validation_loss = np.inf
    test_score = 0.
    start_time = timeit.default_timer()

    done_looping = False
    epoch = 0
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            minibatch_avg_cost = train_model(minibatch_index)
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [validate_model(i)
                                     for i in xrange(n_valid_batches)]
                this_validation_loss = np.mean(validation_losses)

                print(
                    'epoch %i, minibatch %i/%i, validation error %f %%' %
                    (
                        epoch,
                        minibatch_index + 1,
                        n_train_batches,
                        this_validation_loss * 100.
                    )
                )

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    # test it on the test set

                    test_losses = [test_model(i)
                                   for i in xrange(n_test_batches)]
                    test_score = np.mean(test_losses)

                    print(
                        (
                            '     epoch %i, minibatch %i/%i, test error of'
                            ' best model %f %%'
                        ) %
                        (
                            epoch,
                            minibatch_index + 1,
                            n_train_batches,
                            test_score * 100.
                        )
                    )

                    # save the best model (binary mode for pickling)
                    with open('best_model.pkl', 'wb') as f:
                        cPickle.dump(classifier, f)

            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print(
        (
            'Optimization complete with best validation score of %f %%,'
            'with test performance %f %%'
        )
        % (best_validation_loss * 100., test_score * 100.)
    )
    print 'The code ran for %d epochs, with %f epochs/sec' % (
        epoch, 1. * epoch / (end_time - start_time))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.1fs' % ((end_time - start_time)))
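
Once best_model.pkl has been written, the classifier can be reloaded for prediction. A minimal sketch, assuming the LogisticRegression class from the Theano tutorial (which stores its symbolic input and exposes y_pred):

import cPickle
import theano

def predict(feature_matrix, model_path='best_model.pkl'):
    # hypothetical helper: reload the pickled classifier and run its
    # prediction graph on a NumPy feature matrix
    with open(model_path, 'rb') as f:
        classifier = cPickle.load(f)
    predict_model = theano.function(
        inputs=[classifier.input],
        outputs=classifier.y_pred)
    return predict_model(feature_matrix)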
Code Example #5
import basicLogisticRegression
import rmp
import numpy as np
import pandas as pd

#    xtrain, ytrain, xvalid, yvalid, xtest = readData()
xtrain, ytrain, xvalid, yvalid, xtest = rmp.readData()
xtrain, ytrain, xvalid, yvalid, xtest = rmp.cleanData(xtrain, ytrain, xvalid, yvalid, xtest, keepQuality=True)
#    xtrain = xtrain[0:30000]
#    ytrain = ytrain[0:30000]
xtrain, xvalid, xtest, countVect = rmp.vectorizeWords(xtrain, xvalid, xtest)

yvalid, ytest, w, b = basicLogisticRegression.resumeLogisticRegression(xtrain, ytrain, xvalid, yvalid, xtest, 1)
#1000 epochs = 30 min

ytest = pd.Series(ytest, index=xtest.iloc[:,0])
yvalid = pd.Series(yvalid, index=xvalid.iloc[:,0])
error = yvalid.get_values()-xvalid.quality.get_values()

# accumulate the prediction error and example count in a 5x5 grid indexed by
# the (helpfulness, clarity) ratings (each 1-5) of the validation reviews
errorWRTHC = np.zeros((5, 5))
countWRTHC = np.zeros((5, 5))
for i in range(len(xvalid)):
    errorWRTHC[xvalid.iloc[i].helpfulness-1, xvalid.iloc[i].clarity-1] += error[i]
    countWRTHC[xvalid.iloc[i].helpfulness-1, xvalid.iloc[i].clarity-1] += 1
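
The grids above only accumulate sums and counts; a short follow-up sketch (not in the original script, output filename is illustrative) to turn them into a mean error per (helpfulness, clarity) cell and write it out:

# cells with no validation examples are left as NaN
meanErrorWRTHC = np.where(countWRTHC > 0,
                          errorWRTHC / np.maximum(countWRTHC, 1),
                          np.nan)
pd.DataFrame(meanErrorWRTHC,
             index=range(1, 6),       # helpfulness rating 1-5
             columns=range(1, 6)      # clarity rating 1-5
             ).to_csv('errorWRTHC.csv')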