Example #1
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm


def train(model, optimizer, scheduler, train_dataset, val_dataset,
          n_epochs=10, batch_size=32, step=0, exp_name=None, device=1):
    best_loss = float('inf')
    model.train()
    writer = SummaryWriter(f'runs/{exp_name}')
    loader = DataLoader(train_dataset, batch_size=batch_size, pin_memory=True, num_workers=4, shuffle=True)
    for epoch in tqdm(range(n_epochs), total=n_epochs):
        pbar = tqdm(loader, total=len(loader))
        for image, mask in pbar:
            image, mask = image.cuda(device), mask.cuda(device)
            output = model(image)
            loss = F.binary_cross_entropy_with_logits(output, mask)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()  # step the per-batch LR schedule after the optimizer
            pbar.set_description(f'[Loss: {loss.item():.4f}]')
            writer.add_scalar('loss', loss.item(), step)
            writer.add_scalar('lr', optimizer.param_groups[0]['lr'], step)
            step += 1
        val_loss = eval(model, val_dataset, batch_size, writer, step, device)
        for name, param in model.named_parameters():
            writer.add_histogram(name, param.clone().cpu().data.numpy(), step, bins='doane')
        if val_loss < best_loss:
            best_loss = val_loss
            save(model, step, val_loss, exp_name, f'checkpoints/{exp_name}.pt')
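
# A hypothetical invocation sketch for the segmentation variant of train()
# above; UNet, the datasets, and the eval()/save() helpers are assumed to be
# defined elsewhere in the project:
#
#   model = UNet().cuda(0)
#   optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
#   scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1000, gamma=0.5)
#   train(model, optimizer, scheduler, train_dataset, val_dataset,
#         n_epochs=10, batch_size=32, exp_name='unet_baseline', device=0)
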
def train(model,
          optimizer,
          scheduler,
          focal_loss,
          train_dataset,
          val_dataset=None,
          n_epochs=10,
          batch_size=32,
          step=0,
          exp_name=None,
          device=1):
    best_loss = float('inf')
    model.train()
    writer = SummaryWriter(f'runs/{exp_name}')
    loader = DataLoader(train_dataset,
                        batch_size=batch_size,
                        pin_memory=True,
                        num_workers=4,
                        shuffle=True)
    for epoch in tqdm(range(n_epochs), total=n_epochs):
        pbar = tqdm(loader, total=len(loader))
        for images, class_labels, anchor_deltas in pbar:
            images = images.cuda()
            class_labels = class_labels.cuda()
            anchor_deltas = anchor_deltas.cuda()
            class_preds, box_preds = model(images)
            class_loss = focal_loss(class_preds, class_labels)
            box_loss = F.smooth_l1_loss(box_preds, anchor_deltas)
            loss = class_loss + box_loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()  # step the per-batch LR schedule after the optimizer
            pbar.set_description(f'[Loss: {loss.item():.4f}]')
            writer.add_scalar('class_loss', class_loss.item(), step)
            writer.add_scalar('box_loss', box_loss.item(), step)
            writer.add_scalar('loss', loss.item(), step)
            writer.add_scalar('lr', optimizer.param_groups[0]['lr'], step)
            step += 1
        val_loss = eval(model, focal_loss, val_dataset, batch_size, writer,
                        step, device) if val_dataset else float('inf')
        for name, param in model.named_parameters():
            writer.add_histogram(name,
                                 param.clone().cpu().data.numpy(),
                                 step,
                                 bins='doane')
        if val_loss <= best_loss:
            best_loss = val_loss
            save(model, step, val_loss, exp_name, f'checkpoints/{exp_name}.pt')
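
# `focal_loss` is injected as an argument; a minimal sketch of the sigmoid
# focal loss (Lin et al., 2017) it might implement, assuming `class_preds`
# are raw logits and `class_labels` are {0, 1} targets of the same shape
# (uses the torch / F imports from the top of this example):
def sigmoid_focal_loss(logits, targets, alpha=0.25, gamma=2.0):
    bce = F.binary_cross_entropy_with_logits(logits, targets, reduction='none')
    p = torch.sigmoid(logits)
    p_t = p * targets + (1 - p) * (1 - targets)              # probability of the true class
    alpha_t = alpha * targets + (1 - alpha) * (1 - targets)  # class-balancing weight
    return (alpha_t * (1 - p_t) ** gamma * bce).mean()       # down-weight easy examples
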
import json
from collections import Counter


def categories_info():
    """ Generate the count of businesses and reviews per category """
    # `root` and the `data` helper module are defined elsewhere in the project
    filepath = root + "/dataset/yelp_academic_dataset_business.json"

    with open(filepath) as business_file:
        lines_file = business_file.readlines()

    business_by_category = dict()
    categories_business_counts = Counter()
    categories_reviews_counts = Counter()

    for line_json in lines_file:
        business_dict = json.loads(line_json)
        business_id = business_dict["business_id"]
        categories_list = business_dict["categories"]

        for category in categories_list:
            if category not in business_by_category:
                business_by_category[category] = set()
            categories_business_counts[category] += 1
            categories_reviews_counts[category] += business_dict["review_count"]
            business_by_category[category].add(business_id)

    data.save(business_by_category, 'business_by_category.pkl.gz')
    data.save(categories_business_counts, 'categories_business_counts.pkl.gz')
    data.save(categories_reviews_counts, 'categories_reviews_counts.pkl.gz')
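
# Counter makes downstream inspection straightforward; for instance, inside
# categories_info() one could log the ten most-reviewed categories before
# saving (hypothetical addition):
#
#   print(categories_reviews_counts.most_common(10))
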
      trainer.module.params[:] = bestweights
      break
  epochs += 1

  if len(validationErrors) >= continueEpochs * 2:
      # have the validation errors started going up again?
      # compare the average of the last few to the previous few
      old = validationErrors[-continueEpochs * 2:-continueEpochs]
      new = validationErrors[-continueEpochs:]
      if min(new) > max(old):
          trainer.module.params[:] = bestweights
          break
      # convergence check adapted from PyBrain's trainUntilConvergence
      elif reduce(lambda x, y: x + (y - round(new[-1], convergence_threshold)),
                  [round(y, convergence_threshold) for y in new]) == 0:
          trainer.module.params[:] = bestweights
          break

print('> Test on holdout set')
print(trainer.testOnData(testData))

# run this line if you want to save the best weights:
data.save(bestweights, bestweights_filename)
predict = np.array([np.argmax(net.activate(x)) for x, _ in testData])
realerr = float(sum(np.not_equal(predict, Y_test))) / len(predict)
print('> Error on test set %f' % realerr)

from evaluation import error_classification_matrix
error_classification_matrix(
  Y_test,
  predict,
)
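
# error_classification_matrix comes from the project's evaluation module; a
# minimal numpy sketch of the kind of confusion matrix such a helper could
# report, assuming integer class labels (a hypothetical stand-in, not the
# project's implementation):
def confusion_matrix_sketch(y_true, y_pred):
    n_classes = int(max(y_true.max(), y_pred.max())) + 1
    matrix = np.zeros((n_classes, n_classes), dtype=int)
    for t, p in zip(y_true, y_pred):
        matrix[int(t), int(p)] += 1  # rows: true class, columns: predicted class
    return matrix
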
def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
             dataset='mnist.pkl.gz', batch_size=20, n_in=28*28, n_out=10, n_hidden=500):
    """
    Demonstrate stochastic gradient descent optimization for a multilayer
    perceptron

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
    gradient)

    :type L1_reg: float
    :param L1_reg: L1-norm's weight when added to the cost (see
    regularization)

    :type L2_reg: float
    :param L2_reg: L2-norm's weight when added to the cost (see
    regularization)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz

    :type batch_size: int
    :param batch_size: number of examples per minibatch

    :type n_in: int
    :param n_in: number of input units (28*28 pixels for MNIST)

    :type n_out: int
    :param n_out: number of output classes (10 digits for MNIST)

    :type n_hidden: int
    :param n_hidden: number of units in the hidden layer
    """
    datasets = load_data(dataset, cast=False)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '> Building model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.vector('y')  # the labels are presented as 1D vector of
                        # [int] labels

    rng = np.random.RandomState(1234)

    # construct the MLP class
    model = MLR(rng=rng, input=x, n_in=n_in,
                     n_hidden=n_hidden, n_out=n_out)

    # the cost we minimize during training is the negative log likelihood of
    # the model plus the regularization terms (L1 and L2); cost is expressed
    # here symbolically
    cost = model.cost(y) \
         + L1_reg * model.L1 \
         + L2_reg * model.L2_sqr
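    # i.e. cost(theta) = NLL(y | x, theta) + L1_reg * sum(|W|) + L2_reg * sum(W**2)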

    # compiling a Theano function that computes the mistakes that are made
    # by the model on a minibatch
    test_model = theano.function(inputs=[index],
            outputs=model.errors(y),
            givens={
                x: test_set_x[index * batch_size:(index + 1) * batch_size],
                y: test_set_y[index * batch_size:(index + 1) * batch_size]})
    
    test_prediction = theano.function(inputs=[index],
            outputs=model.prediction(),
            givens={
                x: test_set_x[index * batch_size:(index + 1) * batch_size]})

    validate_model = theano.function(inputs=[index],
            outputs=model.errors(y),
            givens={
                x: valid_set_x[index * batch_size:(index + 1) * batch_size],
                y: valid_set_y[index * batch_size:(index + 1) * batch_size]})

    # compute the gradient of cost with respect to theta (stored in params)
    # the resulting gradients will be stored in a list gparams
    gparams = []
    for param in model.params:
        gparam = T.grad(cost, param)
        gparams.append(gparam)

    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs
    updates = []
    # given two list the zip A = [a1, a2, a3, a4] and B = [b1, b2, b3, b4] of
    # same length, zip generates a list C of same size, where each element
    # is a pair formed from the two lists :
    #    C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)]
    for param, gparam in zip(model.params, gparams):
        updates.append((param, param - learning_rate * gparam))

    # compiling a Theano function `train_model` that returns the cost, but
    # in the same time updates the parameter of the model based on the rules
    # defined in `updates`
    train_model = theano.function(inputs=[index], outputs=cost,
            updates=updates,
            givens={
                x: train_set_x[index * batch_size:(index + 1) * batch_size],
                y: train_set_y[index * batch_size:(index + 1) * batch_size]})

    ###############
    # TRAIN MODEL #
    ###############
    print '> Training'

    # early-stopping parameters
    patience = 10000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatche before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = np.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            minibatch_avg_cost = train_model(minibatch_index)
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [validate_model(i) for i
                                     in xrange(n_valid_batches)]
                this_validation_loss = np.mean(validation_losses)

                print('- epoch %i, minibatch %i/%i, validation error %f %%' %
                     (epoch, minibatch_index + 1, n_train_batches,
                      this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                           improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [test_model(i) for i
                                   in xrange(n_test_batches)]
                    test_score = np.mean(test_losses)

                    print(('- epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    test_result = [test_prediction(i) for i in xrange(n_test_batches)]
    test_result = np.concatenate(test_result, axis=0)
    test_target = test_set_y.eval() #[0: len(test_result)]

    data.save((test_result, test_target), 'result.pkl.gz')
 
    print(('> Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
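
# The patience-based early stopping used above, isolated as a sketch under the
# assumption of a hypothetical stream of validation losses: patience grows
# whenever the validation loss improves by a significant margin, and training
# stops once the iteration count exceeds the patience.
#
#   patience, patience_increase = 10000, 2
#   improvement_threshold = 0.995
#   best_val_loss = float('inf')
#   for it, val_loss in enumerate(validation_losses_stream):  # hypothetical iterable
#       if val_loss < best_val_loss:
#           if val_loss < best_val_loss * improvement_threshold:
#               patience = max(patience, it * patience_increase)
#           best_val_loss = val_loss
#       if patience <= it:
#           break
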
print "> Create training set"
X_train = np.concatenate(X_train, axis=0)
Y_train = np.array(Y_train)
assert(X_train.shape[0] == len(Y_train))

print "> Create holdout set"
X_holdout = np.concatenate(X_holdout, axis=0)
Y_holdout = np.array(Y_holdout)
assert(X_holdout.shape[0] == len(Y_holdout))

permut = np.random.permutation(len(Y_holdout))
print "- Create valid set"
X_valid = X_holdout[permut[0:len(permut)/2], :]
Y_valid = Y_holdout[permut[0:len(permut)/2]]
assert(X_valid.shape[0] == len(Y_valid))

print "- Create test subet"
X_test = X_holdout[permut[len(permut)/2::], :]
Y_test = Y_holdout[permut[len(permut)/2::]]
assert(X_test.shape[0] == len(Y_test))
print "< Test sets created"


data.save((
    (X_train, Y_train),
    (X_valid, Y_valid),
    (X_test, Y_test),
), "data.pkl.gz")
from utils import data

f_data_train = "slda_data_train.txt"
f_data_test = "slda_data_test.txt"


def get_corpus(filename):
    corpus = []
    with open(filename) as open_file:
        for line in open_file:
            entries = line.split()
            # The first entry is the number of words and is skipped;
            # the remaining entries are "word_id:count" pairs.
            doc_in_corpus = [tuple(int(x) for x in entry.split(':'))
                             for entry in entries[1:]]
            corpus.append(doc_in_corpus)
    return corpus


corpus_train = get_corpus(f_data_train)
corpus_test = get_corpus(f_data_test)

data.save(corpus_train, "lda_corpus_train.pkl.gz")
data.save(corpus_test, "lda_corpus_test.pkl.gz")
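
# The ".pkl.gz" suffix suggests data.save writes gzipped pickles; a minimal
# sketch of what such save/load helpers could look like (an assumption about
# the project's utils, standard library only):
import gzip
import pickle

def save_sketch(obj, filename):
    with gzip.open(filename, 'wb') as f:
        pickle.dump(obj, f)

def load_sketch(filename):
    with gzip.open(filename, 'rb') as f:
        return pickle.load(f)
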

    
print("> Create training set")
X_train = np.concatenate(X_train, axis=0)
Y_train = np.array(Y_train)
assert(X_train.shape[0] == len(Y_train))

print("> Create holdout set")
X_holdout = np.concatenate(X_holdout, axis=0)
Y_holdout = np.array(Y_holdout)
assert(X_holdout.shape[0] == len(Y_holdout))

# permut = np.random.permutation(len(Y_holdout))
permut = np.arange(len(Y_holdout))  # deterministic split instead of a random permutation
print("- Create valid set")
X_valid = X_holdout[permut[len(permut)//2:], :]
Y_valid = Y_holdout[permut[len(permut)//2:]]
assert(X_valid.shape[0] == len(Y_valid))

print("- Create test subset")
X_test = X_holdout[permut[0:len(permut)//2], :]
Y_test = Y_holdout[permut[0:len(permut)//2]]
assert(X_test.shape[0] == len(Y_test))
print("< Test sets created")


data.save((
    (X_train, Y_train),
    (X_valid, Y_valid),
    (X_test, Y_test),
), training_filename)
    data = np.array(data)
    indices = np.array(indices)
    indptr = np.array(indptr)
    target = np.array(target)

    # generate the CSR matrix: one row per document, one column per feature index
    csr_train = csr_matrix((data, indices, indptr), shape=(idoc, max(indices) + 1))
    return (csr_train, target)
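
# A quick worked example of the (data, indices, indptr) CSR layout built above:
# with indptr = [0, 2, 3], row 0 owns entries 0..1 and row 1 owns entry 2;
# indices = [0, 2, 1] gives their columns and data = [1, 2, 3] their values, so
# csr_matrix((data, indices, indptr), shape=(2, 3)) represents
#   [[1, 0, 2],
#    [0, 3, 0]]
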

###########
# Process #
###########

print('> Generating training matrix ')
X_train, Y_train = generateScipyCSRMatrix(chosenGenerator, dataset_train_filename)
data.save((X_train, Y_train), file_csr_train)
print('> Generating test matrix ')
X_test, Y_test = generateScipyCSRMatrix(chosenGenerator, dataset_test_filename)
data.save((X_test, Y_test), file_csr_test)

# fitting the linear regression model with SGD
print('> Fitting the model ')
alpha_opt_word_freq = 70.
lin_reg_model = linear_model.SGDRegressor(
    eta0=0.04,       # starting learning rate
    n_iter=300,      # max number of epochs (renamed max_iter in newer scikit-learn)
    shuffle=True,
    verbose=0,
    alpha=0.000000,  # regularization constant
)
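
# A hedged sketch of the next steps (not shown here): SGDRegressor accepts
# scipy CSR matrices directly, so fitting and scoring could look like this,
# assuming Y_train / Y_test hold the numeric targets (e.g. star ratings):
#
#   lin_reg_model.fit(X_train, Y_train)
#   predictions = lin_reg_model.predict(X_test)
#   rmse = np.sqrt(np.mean((predictions - Y_test) ** 2))
#   print('> Test RMSE: %f' % rmse)
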
def generate_slda_data(filename="yelp_academic_dataset_review_training.json", n_reviews=None, category=None):

    filepath = root + "/dataset/" + filename

    """ A couple of useful initializations """
    # Chris Potts tokenizer.
    tok = tokenizer.Tokenizer(preserve_case=False)
    # min and max ngram sizes
    MIN_NGRAM = 1
    MAX_NGRAM = 1
    word_set = set()  # set of unique words
    word2idx = dict()  # mapping from word to int representation of a word
    ratings_list = []
    reviews_list = []
    reviews = []
    data_list = []
    words_distr = dict()
    words_counts = Counter()

    """ PHASE 1 : Load file and get set of all words """
    stopwords = nltk.corpus.stopwords.words("english")
    print " PHASE 1 : Get all words "
    loaded_file = open(filepath)
    lines_file = loaded_file.readlines()
    if n_reviews is None:
        n_reviews = len(lines_file)
    loaded_file.close()
    i_review = 1

    # we randomly select n_reviews from the dataset
    permutation = np.random.permutation(len(lines_file))
    sample_reviews = permutation[0:n_reviews]

    for idx_review in sample_reviews:
        line_json = lines_file[idx_review]
        review_dict = json.loads(line_json)
        tokens_list = tok.ngrams(review_dict["text"], MIN_NGRAM, MAX_NGRAM, string=True)
        rating = review_dict["stars"]
        for token in tokens_list:
            if token not in stopwords:
                """
              if token not in words_distr:
                words_distr[token] = Counter({5:0, 4:0, 3:0, 2:0, 1:0}); 
              words_distr[token][rating] += 1;
              """
                words_counts[token] += 1

        reviews_list.append(Counter(tokens_list))
        ratings_list.append(review_dict["stars"] - 1)
        word_set |= set(tokens_list)
        disp.tempPrint(str(i_review))
        i_review += 1

    """ PHASE 2 : Word to int conversion """
    filter_threshold = 0.00001 * (max(words_counts.values()) * 1.0)
    print " PHASE 2 : Word to int conversion "
    i_word = 1
    for word in word_set:
        if words_counts[word] >= filter_threshold:
            word2idx[word] = i_word
            disp.tempPrint(str(i_word))
            i_word += 1
    print "    Filtered. Before : %d words. After : %d" % (len(word_set), len(word2idx))

    """ PHASE 3 : Converting data to the right format """
    print " PHASE 3 : Converting data to the right format "
    i_review = 1
    for review in reviews_list:
        nwords = 0
        data_line = ""
        for word in review:
            if word in word2idx:
                data_line += " " + str(word2idx[word]) + ":" + str(review[word])
                nwords += 1
        data_line += "\n"
        if nwords != 0:
            data_line = str(nwords) + " " + data_line
            data_list.append(data_line)
            disp.tempPrint(str(i_review))
            i_review += 1

    """ PHASE 4 : Save into right files """
    print " PHASE 4 : Save into right files "
    n_reviews = len(data_list)
    idx_test = n_reviews * 8 // 10  # 80% train / 20% test split

    if category:
        category = "_" + category
    else:
        category = ""

    data_train = open("/tmp/slda_data_train" + category + ".txt", "w")
    label_train = open("/tmp/slda_label_train" + category + ".txt", "w")

    data_test = open("/tmp/slda_data_test" + category + ".txt", "w")
    label_test = open("/tmp/slda_label_test" + category + ".txt", "w")

    for i_review in range(idx_test):
        data_train.write(data_list[i_review])
        label_train.write(str(ratings_list[i_review]) + "\n")

    for i_review in range(idx_test, n_reviews):
        data_test.write(data_list[i_review])
        label_test.write(str(ratings_list[i_review]) + "\n")

    data_train.close()
    data_test.close()
    label_train.close()
    label_test.close()

    """ PHASE 5 : Save useful datastructures """
    print " PHASE 5 : Save useful datastructures "
    data.save(reviews_list, "/tmp/slda_reviews" + category + ".pkl.gz")
    data.save(ratings_list, "/tmp/slda_ratings" + category + ".pkl.gz")
    data.save(word2idx, "/tmp/slda_word2idx" + category + ".pkl.gz")
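
# Hypothetical usage: build the SLDA train/test files from a random sample of
# reviews, restricted here to an illustrative Yelp category:
#
#   generate_slda_data(n_reviews=50000, category="Restaurants")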