def train(model, optimizer, scheduler, train_dataset, val_dataset,
          n_epochs=10, batch_size=32, step=0, exp_name=None, device=1):
    best_loss = float('inf')
    model.train()
    writer = SummaryWriter(f'runs/{exp_name}')
    loader = DataLoader(train_dataset, batch_size=batch_size, pin_memory=True,
                        num_workers=4, shuffle=True)

    for epoch in tqdm(range(n_epochs), total=n_epochs):
        pbar = tqdm(loader, total=len(loader))
        for image, mask in pbar:
            image, mask = image.cuda(device), mask.cuda(device)

            output = model(image)
            loss = F.binary_cross_entropy_with_logits(output, mask)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

            pbar.set_description(f'[Loss: {loss.item():.4f}]')
            writer.add_scalar('loss', loss.item(), step)
            writer.add_scalar('lr', optimizer.param_groups[0]['lr'], step)
            step += 1

        # validate once per epoch, log parameter histograms and keep the best
        # checkpoint seen so far
        val_loss = eval(model, val_dataset, batch_size, writer, step, device)
        for name, param in model.named_parameters():
            writer.add_histogram(name, param.clone().cpu().data.numpy(),
                                 step, bins='doane')
        if val_loss < best_loss:
            best_loss = val_loss
            save(model, step, val_loss, exp_name, f'checkpoints/{exp_name}.pt')
def train(model, optimizer, scheduler, focal_loss, train_dataset, val_dataset=None,
          n_epochs=10, batch_size=32, step=0, exp_name=None, device=1):
    best_loss = float('inf')
    model.train()
    writer = SummaryWriter(f'runs/{exp_name}')
    loader = DataLoader(train_dataset, batch_size=batch_size, pin_memory=True,
                        num_workers=4, shuffle=True)

    for epoch in tqdm(range(n_epochs), total=n_epochs):
        pbar = tqdm(loader, total=len(loader))
        for images, class_labels, anchor_deltas in pbar:
            images = images.cuda()
            class_labels = class_labels.cuda()
            anchor_deltas = anchor_deltas.cuda()

            class_preds, box_preds = model(images)
            class_loss = focal_loss(class_preds, class_labels)
            box_loss = F.smooth_l1_loss(box_preds, anchor_deltas)
            loss = class_loss + box_loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

            pbar.set_description(f'[Loss: {loss.item():.4f}]')
            writer.add_scalar('class_loss', class_loss.item(), step)
            writer.add_scalar('box_loss', box_loss.item(), step)
            writer.add_scalar('loss', loss.item(), step)
            writer.add_scalar('lr', optimizer.param_groups[0]['lr'], step)
            step += 1

        # without a validation set, val_loss stays at inf and a checkpoint is
        # saved after every epoch (note the <= comparison below)
        val_loss = (eval(model, focal_loss, val_dataset, batch_size, writer, step, device)
                    if val_dataset else float('inf'))
        for name, param in model.named_parameters():
            writer.add_histogram(name, param.clone().cpu().data.numpy(),
                                 step, bins='doane')
        if val_loss <= best_loss:
            best_loss = val_loss
            save(model, step, val_loss, exp_name, f'checkpoints/{exp_name}.pt')
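# The `focal_loss` callable passed to train() above is not defined in this
# file. As an illustration only, here is a minimal sketch of a RetinaNet-style
# binary focal loss; the gamma/alpha defaults and the assumption that targets
# are 0/1 tensors of the same shape as the logits are mine, not necessarily
# what the original experiments used.
import torch
import torch.nn.functional as F


def focal_loss_sketch(logits, targets, gamma=2.0, alpha=0.25):
    """logits, targets: tensors of the same shape; targets contain 0s and 1s."""
    # element-wise -log(p_t), where p_t is the probability of the true class
    bce = F.binary_cross_entropy_with_logits(logits, targets, reduction='none')
    p = torch.sigmoid(logits)
    p_t = p * targets + (1 - p) * (1 - targets)
    alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
    # down-weight easy examples by (1 - p_t)^gamma before averaging
    return (alpha_t * (1 - p_t) ** gamma * bce).mean()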
def categories_info():
    """Generate the count of businesses and reviews per category."""
    filepath = root + "/dataset/yelp_academic_dataset_business.json"

    business_file = open(filepath)
    lines_file = business_file.readlines()
    business_file.close()

    business_by_category = dict()
    categories_business_counts = Counter()
    categories_reviews_counts = Counter()

    for line_json in lines_file:
        business_dict = json.loads(line_json)
        business_id = business_dict["business_id"]
        categories_list = business_dict["categories"]
        for category in categories_list:
            if category not in business_by_category:
                business_by_category[category] = set()
            categories_business_counts[category] += 1
            categories_reviews_counts[category] += business_dict["review_count"]
            business_by_category[category].add(business_id)

    data.save(business_by_category, 'business_by_category.pkl.gz')
    data.save(categories_business_counts, 'categories_business_counts.pkl.gz')
    data.save(categories_reviews_counts, 'categories_reviews_counts.pkl.gz')
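# Illustrative only: how the saved counters might be inspected afterwards.
# This assumes the utils.data module provides a load() counterpart to
# data.save(), which is not shown in this file.
from utils import data

categories_reviews_counts = data.load('categories_reviews_counts.pkl.gz')
for category, n_reviews in categories_reviews_counts.most_common(10):
    print(category, n_reviews)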
            trainer.module.params[:] = bestweights
            break
        epochs += 1
        if len(validationErrors) >= continueEpochs * 2:
            # have the validation errors started going up again?
            # compare the average of the last few to the previous few
            old = validationErrors[-continueEpochs * 2:-continueEpochs]
            new = validationErrors[-continueEpochs:]
            if min(new) > max(old):
                trainer.module.params[:] = bestweights
                break
            elif reduce(lambda x, y: x + (y - round(new[-1], convergence_threshold)),
                        [round(y, convergence_threshold) for y in new]) == 0:
                # the last few validation errors are numerically identical:
                # training has converged
                trainer.module.params[:] = bestweights
                break

    print('> Test on holdout set')
    print(trainer.testOnData(testData))

    # run this if you want to save the weights:
    data.save(bestweights, bestweights_filename)

    predict = np.array([np.argmax(net.activate(x)) for x, _ in testData])
    # fraction of correctly classified examples (accuracy, not error rate)
    accuracy = float(sum(np.equal(predict, Y_test))) / len(predict)
    print('> Accuracy on test set %f' % accuracy)

    from evaluation import error_classification_matrix
    error_classification_matrix(Y_test, predict)
def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
             dataset='mnist.pkl.gz', batch_size=20, n_in=28 * 28, n_out=10,
             n_hidden=500):
    """
    Demonstrate stochastic gradient descent optimization for a multilayer
    perceptron. This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type L1_reg: float
    :param L1_reg: L1-norm's weight when added to the cost (see regularization)

    :type L2_reg: float
    :param L2_reg: L2-norm's weight when added to the cost (see regularization)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                    http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
    """
    datasets = load_data(dataset, cast=False)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '> Building model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')    # the data is presented as rasterized images
    y = T.vector('y')    # the labels are presented as a 1D vector of
                         # [int] labels

    rng = np.random.RandomState(1234)

    # construct the MLP class
    model = MLR(rng=rng, input=x, n_in=n_in, n_hidden=n_hidden, n_out=n_out)

    # the cost we minimize during training is the negative log likelihood of
    # the model plus the regularization terms (L1 and L2); cost is expressed
    # here symbolically
    cost = model.cost(y) \
        + L1_reg * model.L1 \
        + L2_reg * model.L2_sqr

    # compiling a Theano function that computes the mistakes that are made
    # by the model on a minibatch
    test_model = theano.function(inputs=[index],
            outputs=model.errors(y),
            givens={
                x: test_set_x[index * batch_size:(index + 1) * batch_size],
                y: test_set_y[index * batch_size:(index + 1) * batch_size]})

    test_prediction = theano.function(inputs=[index],
            outputs=model.prediction(),
            givens={
                x: test_set_x[index * batch_size:(index + 1) * batch_size]})

    validate_model = theano.function(inputs=[index],
            outputs=model.errors(y),
            givens={
                x: valid_set_x[index * batch_size:(index + 1) * batch_size],
                y: valid_set_y[index * batch_size:(index + 1) * batch_size]})

    # compute the gradient of cost with respect to theta (stored in params);
    # the resulting gradients will be stored in a list gparams
    gparams = []
    for param in model.params:
        gparam = T.grad(cost, param)
        gparams.append(gparam)

    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs
    updates = []
    # given two lists A = [a1, a2, a3, a4] and B = [b1, b2, b3, b4] of the
    # same length, zip generates a list C of the same size, where each element
    # is a pair formed from the two lists:
    # C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)]
    for param, gparam in zip(model.params, gparams):
        updates.append((param, param - learning_rate * gparam))

    # compiling a Theano function `train_model` that returns the cost and, at
    # the same time, updates the parameters of the model based on the rules
    # defined in `updates`
    train_model = theano.function(inputs=[index], outputs=cost,
            updates=updates,
            givens={
                x: train_set_x[index * batch_size:(index + 1) * batch_size],
                y: train_set_y[index * batch_size:(index + 1) * batch_size]})

    ###############
    # TRAIN MODEL #
    ###############
    print '> Training'

    # early-stopping parameters
    patience = 10000               # look at this many examples regardless
    patience_increase = 2          # wait this much longer when a new best is
                                   # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                   # go through this many minibatches before
                                   # checking the network on the validation
                                   # set; in this case we check every epoch

    best_params = None
    best_validation_loss = np.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            minibatch_avg_cost = train_model(minibatch_index)
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [validate_model(i)
                                     for i in xrange(n_valid_batches)]
                this_validation_loss = np.mean(validation_losses)

                print('- epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * \
                            improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [test_model(i)
                                   for i in xrange(n_test_batches)]
                    test_score = np.mean(test_losses)

                    print(('- epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()

    test_result = [test_prediction(i) for i in xrange(n_test_batches)]
    test_result = np.concatenate(test_result, axis=0)
    test_target = test_set_y.eval()  # [0: len(test_result)]
    data.save((test_result, test_target), 'result.pkl.gz')

    print(('> Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
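# Illustrative only: a minimal way to run the experiment above. The values
# below are simply the function's own defaults, not tuned hyperparameters.
if __name__ == '__main__':
    test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001,
             n_epochs=1000, dataset='mnist.pkl.gz', batch_size=20,
             n_hidden=500)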
print "> Create training set" X_train = np.concatenate(X_train, axis=0) Y_train = np.array(Y_train) assert(X_train.shape[0] == len(Y_train)) print "> Create holdout set" X_holdout = np.concatenate(X_holdout, axis=0) Y_holdout = np.array(Y_holdout) assert(X_holdout.shape[0] == len(Y_holdout)) permut = np.random.permutation(len(Y_holdout)) print "- Create valid set" X_valid = X_holdout[permut[0:len(permut)/2], :] Y_valid = Y_holdout[permut[0:len(permut)/2]] assert(X_valid.shape[0] == len(Y_valid)) print "- Create test subet" X_test = X_holdout[permut[len(permut)/2::], :] Y_test = Y_holdout[permut[len(permut)/2::]] assert(X_test.shape[0] == len(Y_test)) print "< Test sets created" data.save(( (X_train, Y_train), (X_valid, Y_valid), (X_test, Y_test), ), "data.pkl.gz" )
from utils import data

f_data_train = "slda_data_train.txt"
f_data_test = "slda_data_test.txt"


def get_corpus(filename):
    open_file = open(filename)
    lines = open_file.readlines()
    corpus = []
    for line in lines:
        entries = line.split()
        doc_in_corpus = []
        # First entry is ignored as it corresponds to the # of words
        for entry in entries[1:]:
            doc_in_corpus.append(tuple([int(x) for x in entry.split(':')]))
        corpus.append(doc_in_corpus)
    open_file.close()
    return corpus


corpus_train = get_corpus(f_data_train)
corpus_test = get_corpus(f_data_test)

data.save(corpus_train, "lda_corpus_train.pkl.gz")
data.save(corpus_test, "lda_corpus_test.pkl.gz")
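# Sanity-check sketch (illustrative only): each line of the sLDA data files is
# expected to look like "<n_words> <word_id>:<count> <word_id>:<count> ...",
# which is the format generate_slda_data() writes elsewhere in this repo. The
# sample line below is made up for the check.
import os
import tempfile

_tmp = tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False)
_tmp.write("3 12:2 47:1 101:5\n")
_tmp.close()
assert get_corpus(_tmp.name) == [[(12, 2), (47, 1), (101, 5)]]
os.remove(_tmp.name)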
print("> Create training set") X_train = np.concatenate(X_train, axis=0) Y_train = np.array(Y_train) assert(X_train.shape[0] == len(Y_train)) print("> Create holdout set") X_holdout = np.concatenate(X_holdout, axis=0) Y_holdout = np.array(Y_holdout) assert(X_holdout.shape[0] == len(Y_holdout)) # permut = np.random.permutation(len(Y_holdout)) permut = range(len(Y_holdout)); print("- Create valid set") X_valid = X_holdout[permut[len(permut)/2::], :] Y_valid = Y_holdout[permut[len(permut)/2::]] assert(X_valid.shape[0] == len(Y_valid)) print "- Create test subet" X_test = X_holdout[permut[0:len(permut)/2], :] Y_test = Y_holdout[permut[0:len(permut)/2]] assert(X_test.shape[0] == len(Y_test)) print "< Test sets created" data.save(( (X_train, Y_train), (X_valid, Y_valid), (X_test, Y_test), ), training_filename )
    data = np.array(data)
    indices = np.array(indices)
    indptr = np.array(indptr)
    target = np.array(target)

    # generating csr matrix
    csr_train = csr_matrix((data, indices, indptr),
                           shape=(idoc, max(indices) + 1))
    return (csr_train, target)


###########
# Process #
###########

print('> Generating training matrix ')
X_train, Y_train = generateScipyCSRMatrix(chosenGenerator, dataset_train_filename)
data.save((X_train, Y_train), file_csr_train)

print('> Generating test matrix ')
X_test, Y_test = generateScipyCSRMatrix(chosenGenerator, dataset_test_filename)
data.save((X_test, Y_test), file_csr_test)

# fitting the linear regression model
print('> Fitting the model ')

# Actually performing the linear regression
alpha_opt_word_freq = 70.
lin_reg_model = linear_model.SGDRegressor(
    eta0=0.04,        # starting learning rate
    n_iter=300,       # max number of epochs
    shuffle=True,
    verbose=0,
    alpha=0.000000,   # regularization constant
)
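# Illustrative continuation only (the original snippet ends at the model
# definition): fit the regressor on the sparse training matrix and report the
# RMSE on the test matrix. SGDRegressor accepts scipy CSR input directly.
lin_reg_model.fit(X_train, Y_train)
Y_pred = lin_reg_model.predict(X_test)
rmse = np.sqrt(np.mean((Y_pred - Y_test) ** 2))
print('> Test RMSE : %f' % rmse)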
def generate_slda_data(filename="yelp_academic_dataset_review_training.json",
                       n_reviews=None, category=None):
    """Convert Yelp reviews to the sLDA input format (word-count files plus
    label files) and save the useful intermediate datastructures."""
    filepath = root + "/dataset/" + filename

    """ A couple of useful initializations """
    # Chris Potts tokenizer.
    tok = tokenizer.Tokenizer(preserve_case=False)

    # min and max ngram sizes
    MIN_NGRAM = 1
    MAX_NGRAM = 1

    word_set = set()    # set of unique words
    word2idx = dict()   # mapping from word to int representation of a word

    ratings_list = []
    reviews_list = []
    reviews = []
    data_list = []

    words_distr = dict()
    words_counts = Counter()

    """ PHASE 1 : Load file and get set of all words """
    stopwords = nltk.corpus.stopwords.words("english")
    print " PHASE 1 : Get all words "

    loaded_file = open(filepath)
    lines_file = loaded_file.readlines()
    loaded_file.close()
    if n_reviews is None:
        n_reviews = len(lines_file)

    i_review = 1
    # we randomly select n_reviews from the dataset
    permutation = np.random.permutation(len(lines_file))
    sample_reviews = permutation[0:n_reviews]
    for idx_review in sample_reviews:
        line_json = lines_file[idx_review]
        review_dict = json.loads(line_json)
        tokens_list = tok.ngrams(review_dict["text"], MIN_NGRAM, MAX_NGRAM,
                                 string=True)
        rating = review_dict["stars"]
        for token in tokens_list:
            if token not in stopwords:
                """
                if token not in words_distr:
                    words_distr[token] = Counter({5: 0, 4: 0, 3: 0, 2: 0, 1: 0})
                words_distr[token][rating] += 1
                """
                words_counts[token] += 1
        reviews_list.append(Counter(tokens_list))
        ratings_list.append(review_dict["stars"] - 1)
        word_set |= set(tokens_list)
        disp.tempPrint(str(i_review))
        i_review += 1

    """ PHASE 2 : Word to int conversion """
    filter_threshold = 0.00001 * (max(words_counts.values()) * 1.0)
    print " PHASE 2 : Word to int conversion "
    i_word = 1
    for word in word_set:
        if words_counts[word] >= filter_threshold:
            word2idx[word] = i_word
            disp.tempPrint(str(i_word))
            i_word += 1
    print " Filtered. Before : %d words. After : %d" % (len(word_set),
                                                        len(word2idx))

    """ PHASE 3 : Converting data to the right format """
    print " PHASE 3 : Converting data to the right format "
    i_review = 1
    for review in reviews_list:
        nwords = 0
        data_line = ""
        for word in review:
            if word in word2idx:
                data_line += " " + str(word2idx[word]) + ":" + str(review[word])
                nwords += 1
        data_line += "\n"
        # reviews with no surviving words are skipped
        if nwords != 0:
            data_line = str(nwords) + " " + data_line
            data_list.append(data_line)
        disp.tempPrint(str(i_review))
        i_review += 1

    """ PHASE 4 : Save into right files """
    print " PHASE 4 : Save into right files "
    n_reviews = len(data_list)
    idx_test = n_reviews * 8 / 10
    if category:
        category = "_" + category
    else:
        category = ""

    data_train = open("/tmp/slda_data_train" + category + ".txt", "w")
    label_train = open("/tmp/slda_label_train" + category + ".txt", "w")
    data_test = open("/tmp/slda_data_test" + category + ".txt", "w")
    label_test = open("/tmp/slda_label_test" + category + ".txt", "w")

    for i_review in range(idx_test):
        data_train.write(data_list[i_review])
        label_train.write(str(ratings_list[i_review]) + "\n")
    for i_review in range(idx_test, n_reviews):
        data_test.write(data_list[i_review])
        label_test.write(str(ratings_list[i_review]) + "\n")

    data_train.close()
    data_test.close()
    label_train.close()
    label_test.close()

    """ PHASE 5 : Save useful datastructures """
    print " PHASE 5 : Save useful datastructures "
    data.save(reviews_list, "/tmp/slda_reviews" + category + ".pkl.gz")
    data.save(ratings_list, "/tmp/slda_ratings" + category + ".pkl.gz")
    data.save(word2idx, "/tmp/slda_word2idx" + category + ".pkl.gz")
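# Illustrative usage only: build sLDA input files from a random sample of
# 10,000 reviews. Both argument values are arbitrary examples; note that
# `category` is only used to tag the output filenames in this function.
if __name__ == "__main__":
    generate_slda_data(filename="yelp_academic_dataset_review_training.json",
                       n_reviews=10000, category="restaurants")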