def _load_cached_features(cache_path, source_name):
    """Return feature rows for ``source_name``, using a pickle cache on disk.

    If ``cache_path`` exists it is unpickled and returned. Otherwise the rows
    are obtained from ``data_io.get_features_db(source_name)`` and pickled to
    ``cache_path`` so later runs can skip that call.
    """
    # HIGHEST_PROTOCOL is a binary pickle format, so the cache must be opened
    # in binary mode; text mode can corrupt the stream on platforms that
    # translate newlines.
    # NOTE(review): a cache previously written in text mode on such a platform
    # may need to be deleted and regenerated.
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as loadfile:
            return cPickle.load(loadfile)
    rows = data_io.get_features_db(source_name)
    with open(cache_path, 'wb') as dumpfile:
        cPickle.dump(rows, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)
    return rows


def main():
    """Rank each author's validation papers with the saved forest model.

    Loads the "ValidPaper" feature rows (cached in ``features_valid.obj``),
    appends the keyword-match features, scores every row with the model
    loaded via ``data_io.load_model(prefix="forest_")`` and writes, per
    author, the paper ids ordered by decreasing predicted probability.
    """
    print("Getting features for valid papers from the database")
    data = _load_cached_features("features_valid.obj", "ValidPaper")

    # The first two fields of every row are the (author id, paper id) pair;
    # the remaining fields are the features.
    author_paper_ids = [x[:2] for x in data]
    features = [x[2:] for x in data]

    # Append the keyword-match features. The third element returned by
    # get_additional_features() is the one used for the validation rows; the
    # first two fields of each of its rows are skipped (presumably the same
    # id pair -- verify against additional_features).
    print("adding addtional features...")
    import additional_features as af
    _, _, kw_features = af.get_additional_features()
    for i in range(len(features)):
        features[i] += tuple(kw_features[i][2:])

    # NOTE: mean/std standardization of the features was tried here in the
    # past and is intentionally not applied.
    featuresnp = np.array(features, dtype='int32')

    print("Loading the classifier")
    classifier = data_io.load_model(prefix="forest_")

    print("Making predictions")
    # Column 1 of predict_proba is used as the ranking score.
    predictions = list(classifier.predict_proba(featuresnp)[:, 1])

    # Group (score, paper id) pairs by author.
    author_predictions = defaultdict(list)
    for (a_id, p_id), pred in zip(author_paper_ids, predictions):
        author_predictions[a_id].append((pred, p_id))

    # Highest score first; ties fall back to the paper id (tuple ordering).
    paper_predictions = {}
    for author_id in sorted(author_predictions):
        ranked = sorted(author_predictions[author_id], reverse=True)
        paper_predictions[author_id] = [x[1] for x in ranked]

    print("Writing predictions to file")
    data_io.write_submission(paper_predictions, prefix="forest_")
def _load_cached_features(cache_path, source_name):
    """Return feature rows for ``source_name``, using a pickle cache on disk.

    If ``cache_path`` exists it is unpickled and returned. Otherwise the rows
    are obtained from ``data_io.get_features_db(source_name)`` and pickled to
    ``cache_path`` so later runs can skip that call.
    """
    # HIGHEST_PROTOCOL is a binary pickle format, so the cache must be opened
    # in binary mode; text mode can corrupt the stream on platforms that
    # translate newlines.
    # NOTE(review): a cache previously written in text mode on such a platform
    # may need to be deleted and regenerated.
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as loadfile:
            return cPickle.load(loadfile)
    rows = data_io.get_features_db(source_name)
    with open(cache_path, 'wb') as dumpfile:
        cPickle.dump(rows, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)
    return rows


def main():
    """Cross-validate a random forest on the deleted/confirmed training rows.

    Rows from "TrainDeleted" are labelled 0 and rows from "TrainConfirmed"
    are labelled 1. The keyword-match features are appended to every row,
    then a 100-tree ``RandomForestClassifier`` is scored with
    ``cross_val_score`` and both the per-split scores and their mean are
    printed.
    """
    print("Getting features for deleted papers from the database")
    features_deleted = _load_cached_features("features_deleted.obj",
                                             "TrainDeleted")

    print("Getting features for confirmed papers from the database")
    features_conf = _load_cached_features("features_confirmed.obj",
                                          "TrainConfirmed")

    # Drop the two leading fields of every row; the rest are the features.
    features = [x[2:] for x in features_deleted + features_conf]
    # Labels follow the concatenation order: deleted -> 0, confirmed -> 1.
    target = [0] * len(features_deleted) + [1] * len(features_conf)

    # Append the keyword-match features, concatenated in the same
    # deleted-then-confirmed order as ``features``.
    print("adding addtional features...")
    import additional_features as af
    kw_deleted, kw_confirmed, _ = af.get_additional_features()
    kw_features = kw_deleted + kw_confirmed
    for i in range(len(features)):
        features[i] += tuple(kw_features[i][2:])

    # Four random shuffles, each holding out 40% of the rows for scoring.
    cv = cross_validation.ShuffleSplit(len(features), n_iter=4,
                                       test_size=0.4, random_state=0)

    print("Training the Classifier")
    classifier = RandomForestClassifier(n_estimators=100,
                                        verbose=2,
                                        n_jobs=1,
                                        min_samples_split=1,
                                        random_state=0,
                                        compute_importances=True)

    featuresnp = np.array(features, dtype='int32')
    targetnp = np.array(target, dtype='int32')

    # NOTE: earlier experiments that used to live here as commented-out code
    # (dumping borderline predictions to wrong_predictions.txt, pruning to
    # the top-ranked feature importances, mean/std standardization) were
    # removed; recover them from version control if needed.
    results = cross_validation.cross_val_score(classifier, X=featuresnp,
                                               y=targetnp, cv=cv, n_jobs=4,
                                               verbose=True)

    # Per-split scores, then their mean.
    print("Results:  %s" % (results,))
    print("Results: " + str(np.array(results).mean()))
def _load_cached_features(cache_path, source_name):
    """Return feature rows for ``source_name``, using a pickle cache on disk.

    If ``cache_path`` exists it is unpickled and returned. Otherwise the rows
    are obtained from ``data_io.get_features_db(source_name)`` and pickled to
    ``cache_path`` so later runs can skip that call.
    """
    # HIGHEST_PROTOCOL is a binary pickle format, so the cache must be opened
    # in binary mode; text mode can corrupt the stream on platforms that
    # translate newlines.
    # NOTE(review): a cache previously written in text mode on such a platform
    # may need to be deleted and regenerated.
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as loadfile:
            return cPickle.load(loadfile)
    rows = data_io.get_features_db(source_name)
    with open(cache_path, 'wb') as dumpfile:
        cPickle.dump(rows, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)
    return rows


def main():
    """Cross-validate a random forest on the deleted/confirmed training rows.

    Rows from "TrainDeleted" are labelled 0 and rows from "TrainConfirmed"
    are labelled 1. The keyword-match features are appended to every row,
    then a 100-tree ``RandomForestClassifier`` is scored with
    ``cross_val_score`` and both the per-split scores and their mean are
    printed.
    """
    print("Getting features for deleted papers from the database")
    features_deleted = _load_cached_features("features_deleted.obj",
                                             "TrainDeleted")

    print("Getting features for confirmed papers from the database")
    features_conf = _load_cached_features("features_confirmed.obj",
                                          "TrainConfirmed")

    # Drop the two leading fields of every row; the rest are the features.
    features = [x[2:] for x in features_deleted + features_conf]
    # Labels follow the concatenation order: deleted -> 0, confirmed -> 1.
    target = [0] * len(features_deleted) + [1] * len(features_conf)

    # Append the keyword-match features, concatenated in the same
    # deleted-then-confirmed order as ``features``.
    print("adding addtional features...")
    import additional_features as af
    kw_deleted, kw_confirmed, _ = af.get_additional_features()
    kw_features = kw_deleted + kw_confirmed
    for i in range(len(features)):
        features[i] += tuple(kw_features[i][2:])

    # Four random shuffles, each holding out 40% of the rows for scoring.
    cv = cross_validation.ShuffleSplit(len(features), n_iter=4,
                                       test_size=0.4, random_state=0)

    print("Training the Classifier")
    classifier = RandomForestClassifier(n_estimators=100,
                                        verbose=2,
                                        n_jobs=1,
                                        min_samples_split=1,
                                        random_state=0,
                                        compute_importances=True)

    featuresnp = np.array(features, dtype='int32')
    targetnp = np.array(target, dtype='int32')

    # NOTE: earlier experiments that used to live here as commented-out code
    # (dumping borderline predictions to wrong_predictions.txt, pruning to
    # the top-ranked feature importances, mean/std standardization) were
    # removed; recover them from version control if needed.
    results = cross_validation.cross_val_score(classifier, X=featuresnp,
                                               y=targetnp, cv=cv, n_jobs=4,
                                               verbose=True)

    # Per-split scores, then their mean.
    print("Results:  %s" % (results,))
    print("Results: " + str(np.array(results).mean()))
def _load_cached_features(cache_path, source_name):
    """Return feature rows for ``source_name``, using a pickle cache on disk.

    If ``cache_path`` exists it is unpickled and returned. Otherwise the rows
    are obtained from ``data_io.get_features_db(source_name)`` and pickled to
    ``cache_path`` so later runs can skip that call.
    """
    # HIGHEST_PROTOCOL is a binary pickle format, so the cache must be opened
    # in binary mode; text mode can corrupt the stream on platforms that
    # translate newlines.
    # NOTE(review): a cache previously written in text mode on such a platform
    # may need to be deleted and regenerated.
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as loadfile:
            return cPickle.load(loadfile)
    rows = data_io.get_features_db(source_name)
    with open(cache_path, 'wb') as dumpfile:
        cPickle.dump(rows, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)
    return rows


def test_mlp(learning_rate=0.017, L1_reg=0.0001, L2_reg=0.0003, n_epochs=10000,
             n_hidden=50):
    """Train an MLP on the deleted/confirmed rows and save its best parameters.

    The rows are standardized, split once 75/25 with ``ShuffleSplit``, and
    the model is trained by minibatch gradient descent. The held-out 25% is
    used as the validation set. Training stops after ``n_epochs`` epochs or
    when the user interrupts it (Ctrl-C); the parameters that achieved the
    lowest validation error are then saved with
    ``data_io.save_model(..., prefix="theano_")``.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
        gradient)

    :type L1_reg: float
    :param L1_reg: L1-norm's weight when added to the cost (see
        regularization)

    :type L2_reg: float
    :param L2_reg: L2-norm's weight when added to the cost (see
        regularization)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type n_hidden: int
    :param n_hidden: value passed to ``MLP`` as ``n_hidden``
    """
    np.random.seed(17)

    print("Getting features for deleted papers from the database")
    features_deleted = _load_cached_features("features_deleted.obj",
                                             "TrainDeleted")

    print("Getting features for confirmed papers from the database")
    features_conf = _load_cached_features("features_confirmed.obj",
                                          "TrainConfirmed")

    # Drop the two leading fields of every row; the rest are the features.
    features = [x[2:] for x in features_deleted + features_conf]
    # Labels follow the concatenation order: deleted -> 0, confirmed -> 1.
    target = [0] * len(features_deleted) + [1] * len(features_conf)

    # Append the keyword-match features, concatenated in the same
    # deleted-then-confirmed order as ``features``.
    print("adding additional features...")
    import additional_features as af
    kw_deleted, kw_confirmed, _ = af.get_additional_features()
    kw_features = kw_deleted + kw_confirmed
    for i in range(len(features)):
        features[i] += tuple(kw_features[i][2:])

    featuresnp = np.array(features, dtype='float64')
    targetnp = np.array(target, dtype='int32')

    # Standardize every column. A constant column has zero std; dividing by
    # it would turn the whole column into NaN, so such columns are left at 0.
    featuresnp -= np.mean(featuresnp, axis=0)
    column_std = np.std(featuresnp, axis=0)
    column_std[column_std == 0] = 1.0
    featuresnp /= column_std

    # A single random 75/25 split; the 25% part serves as validation data.
    cv = cross_validation.ShuffleSplit(len(features), n_iter=1,
                                       test_size=0.25, random_state=0)
    train, test = next(iter(cv))
    train_set_x = theano.shared(featuresnp[train], borrow=True)
    test_set_x = theano.shared(featuresnp[test], borrow=True)
    train_set_y = theano.shared(targetnp[train], borrow=True)
    test_set_y = theano.shared(targetnp[test], borrow=True)

    batch_size = 20  # size of the minibatch

    # Number of whole minibatches; a trailing partial batch is ignored.
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # symbolic variables: minibatch index, input matrix, label vector
    index = T.lscalar()
    x = T.matrix('x', dtype='float64')
    y = T.ivector('y')

    rng = np.random.RandomState(113)

    classifier = MLP(rng=rng, input=x, n_in=featuresnp.shape[1],
                     n_hidden=n_hidden, n_out=2)

    # negative log-likelihood plus the L1 and squared-L2 penalties
    cost = (classifier.negative_log_likelihood(y)
            + L1_reg * classifier.L1
            + L2_reg * classifier.L2_sqr)

    # symbolic slice selecting minibatch number ``index``
    batch = slice(index * batch_size, (index + 1) * batch_size)

    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={x: test_set_x[batch], y: test_set_y[batch]})

    # plain gradient descent: param <- param - learning_rate * d(cost)/d(param)
    updates = OrderedDict()
    for param in classifier.params:
        updates[param] = param - learning_rate * T.grad(cost, param)

    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={x: train_set_x[batch], y: train_set_y[batch]})

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')

    # ``patience`` only caps the validation interval here; it is not used as
    # a stopping criterion. With the current values validation runs once per
    # epoch unless an epoch has more than patience / 2 minibatches.
    patience = 1000000
    validation_frequency = min(n_train_batches, patience // 2)

    best_values = None  # copies of the parameter arrays at the best point
    best_validation_loss = np.inf
    best_iter = 0
    # NOTE(review): no separate test split exists, so test_score is never
    # updated and the final summary always reports 0 for it.
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    try:
        while epoch < n_epochs:
            epoch += 1
            training_cost = []
            for minibatch_index in xrange(n_train_batches):
                training_cost.append(train_model(minibatch_index))

                # global minibatch counter across epochs
                iteration = (epoch - 1) * n_train_batches + minibatch_index
                if (iteration + 1) % validation_frequency != 0:
                    continue

                # zero-one loss averaged over the validation minibatches
                validation_losses = [test_model(i)
                                     for i in xrange(n_test_batches)]
                this_validation_loss = np.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                if this_validation_loss < best_validation_loss:
                    best_validation_loss = this_validation_loss
                    best_iter = iteration
                    # classifier.params holds live shared variables that
                    # keep changing as training continues, so copy their
                    # current values instead of keeping a reference.
                    best_values = [p.get_value(borrow=False)
                                   for p in classifier.params]

            print("Epoch  %s  training cost:  %s"
                  % (epoch, np.mean(training_cost)))
    except KeyboardInterrupt:
        # Ctrl-C is the expected way to stop a long run early.
        print("Training ended by user.\n")
        print("Best Validation loss: %s" % (best_validation_loss,))

    end_time = time.clock()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    sys.stderr.write('The code for file ' + os.path.split(__file__)[1] +
                     ' ran for %.2fm' % ((end_time - start_time) / 60.) +
                     '\n')

    # Put the best values back into the shared variables so that what gets
    # saved is the best model, not whatever the last update left behind.
    # If no validation pass ever ran, None is saved (as before).
    best_params = None
    if best_values is not None:
        for param, value in zip(classifier.params, best_values):
            param.set_value(value)
        best_params = classifier.params

    print("Saving the mlp best params")
    data_io.save_model(best_params, prefix="theano_")
def _load_cached_features(cache_path, source_name):
    """Return feature rows for ``source_name``, using a pickle cache on disk.

    If ``cache_path`` exists it is unpickled and returned. Otherwise the rows
    are obtained from ``data_io.get_features_db(source_name)`` and pickled to
    ``cache_path`` so later runs can skip that call.
    """
    # HIGHEST_PROTOCOL is a binary pickle format, so the cache must be opened
    # in binary mode; text mode can corrupt the stream on platforms that
    # translate newlines.
    # NOTE(review): a cache previously written in text mode on such a platform
    # may need to be deleted and regenerated.
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as loadfile:
            return cPickle.load(loadfile)
    rows = data_io.get_features_db(source_name)
    with open(cache_path, 'wb') as dumpfile:
        cPickle.dump(rows, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)
    return rows


def test_mlp(learning_rate=0.017, L1_reg=0.0001, L2_reg=0.0003, n_epochs=10000,
             n_hidden=50):
    """Train an MLP on the deleted/confirmed rows and save its best parameters.

    The rows are standardized, split once 75/25 with ``ShuffleSplit``, and
    the model is trained by minibatch gradient descent. The held-out 25% is
    used as the validation set. Training stops after ``n_epochs`` epochs or
    when the user interrupts it (Ctrl-C); the parameters that achieved the
    lowest validation error are then saved with
    ``data_io.save_model(..., prefix="theano_")``.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
        gradient)

    :type L1_reg: float
    :param L1_reg: L1-norm's weight when added to the cost (see
        regularization)

    :type L2_reg: float
    :param L2_reg: L2-norm's weight when added to the cost (see
        regularization)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type n_hidden: int
    :param n_hidden: value passed to ``MLP`` as ``n_hidden``
    """
    np.random.seed(17)

    print("Getting features for deleted papers from the database")
    features_deleted = _load_cached_features("features_deleted.obj",
                                             "TrainDeleted")

    print("Getting features for confirmed papers from the database")
    features_conf = _load_cached_features("features_confirmed.obj",
                                          "TrainConfirmed")

    # Drop the two leading fields of every row; the rest are the features.
    features = [x[2:] for x in features_deleted + features_conf]
    # Labels follow the concatenation order: deleted -> 0, confirmed -> 1.
    target = [0] * len(features_deleted) + [1] * len(features_conf)

    # Append the keyword-match features, concatenated in the same
    # deleted-then-confirmed order as ``features``.
    print("adding additional features...")
    import additional_features as af
    kw_deleted, kw_confirmed, _ = af.get_additional_features()
    kw_features = kw_deleted + kw_confirmed
    for i in range(len(features)):
        features[i] += tuple(kw_features[i][2:])

    featuresnp = np.array(features, dtype='float64')
    targetnp = np.array(target, dtype='int32')

    # Standardize every column. A constant column has zero std; dividing by
    # it would turn the whole column into NaN, so such columns are left at 0.
    featuresnp -= np.mean(featuresnp, axis=0)
    column_std = np.std(featuresnp, axis=0)
    column_std[column_std == 0] = 1.0
    featuresnp /= column_std

    # A single random 75/25 split; the 25% part serves as validation data.
    cv = cross_validation.ShuffleSplit(len(features), n_iter=1,
                                       test_size=0.25, random_state=0)
    train, test = next(iter(cv))
    train_set_x = theano.shared(featuresnp[train], borrow=True)
    test_set_x = theano.shared(featuresnp[test], borrow=True)
    train_set_y = theano.shared(targetnp[train], borrow=True)
    test_set_y = theano.shared(targetnp[test], borrow=True)

    batch_size = 20  # size of the minibatch

    # Number of whole minibatches; a trailing partial batch is ignored.
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # symbolic variables: minibatch index, input matrix, label vector
    index = T.lscalar()
    x = T.matrix('x', dtype='float64')
    y = T.ivector('y')

    rng = np.random.RandomState(113)

    classifier = MLP(rng=rng, input=x, n_in=featuresnp.shape[1],
                     n_hidden=n_hidden, n_out=2)

    # negative log-likelihood plus the L1 and squared-L2 penalties
    cost = (classifier.negative_log_likelihood(y)
            + L1_reg * classifier.L1
            + L2_reg * classifier.L2_sqr)

    # symbolic slice selecting minibatch number ``index``
    batch = slice(index * batch_size, (index + 1) * batch_size)

    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={x: test_set_x[batch], y: test_set_y[batch]})

    # plain gradient descent: param <- param - learning_rate * d(cost)/d(param)
    updates = OrderedDict()
    for param in classifier.params:
        updates[param] = param - learning_rate * T.grad(cost, param)

    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={x: train_set_x[batch], y: train_set_y[batch]})

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')

    # ``patience`` only caps the validation interval here; it is not used as
    # a stopping criterion. With the current values validation runs once per
    # epoch unless an epoch has more than patience / 2 minibatches.
    patience = 1000000
    validation_frequency = min(n_train_batches, patience // 2)

    best_values = None  # copies of the parameter arrays at the best point
    best_validation_loss = np.inf
    best_iter = 0
    # NOTE(review): no separate test split exists, so test_score is never
    # updated and the final summary always reports 0 for it.
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    try:
        while epoch < n_epochs:
            epoch += 1
            training_cost = []
            for minibatch_index in xrange(n_train_batches):
                training_cost.append(train_model(minibatch_index))

                # global minibatch counter across epochs
                iteration = (epoch - 1) * n_train_batches + minibatch_index
                if (iteration + 1) % validation_frequency != 0:
                    continue

                # zero-one loss averaged over the validation minibatches
                validation_losses = [test_model(i)
                                     for i in xrange(n_test_batches)]
                this_validation_loss = np.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                if this_validation_loss < best_validation_loss:
                    best_validation_loss = this_validation_loss
                    best_iter = iteration
                    # classifier.params holds live shared variables that
                    # keep changing as training continues, so copy their
                    # current values instead of keeping a reference.
                    best_values = [p.get_value(borrow=False)
                                   for p in classifier.params]

            print("Epoch  %s  training cost:  %s"
                  % (epoch, np.mean(training_cost)))
    except KeyboardInterrupt:
        # Ctrl-C is the expected way to stop a long run early.
        print("Training ended by user.\n")
        print("Best Validation loss: %s" % (best_validation_loss,))

    end_time = time.clock()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    sys.stderr.write('The code for file ' + os.path.split(__file__)[1] +
                     ' ran for %.2fm' % ((end_time - start_time) / 60.) +
                     '\n')

    # Put the best values back into the shared variables so that what gets
    # saved is the best model, not whatever the last update left behind.
    # If no validation pass ever ran, None is saved (as before).
    best_params = None
    if best_values is not None:
        for param, value in zip(classifier.params, best_values):
            param.set_value(value)
        best_params = classifier.params

    print("Saving the mlp best params")
    data_io.save_model(best_params, prefix="theano_")