# Module-level imports for the code below.
import random
import numpy as np
import matplotlib.pyplot as plt
from multiprocessing import Pool
from sklearn import datasets, metrics
from ml_metrics import auc
from sgd import log_reg_sgd, h
# BasicLogisticRegression is used below; its import is not shown in the
# original and is assumed to come from a local module in this repo.


def train(params):
    # worker entry point for test_parallel_sgd: unpack one data shard and fit it
    x = params['x']
    y = params['y']
    a = params['learning_rate']
    iters = params['iters']
    results = log_reg_sgd(x, y, a, max_iter=iters, debug=False)
    print 'Done, ', results
    return results
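
# For reference, a minimal sketch of what the sgd module's log_reg_sgd and h
# could look like; the real implementations live in sgd.py. The debug flag and
# the (theta, err) return shape are assumptions inferred from how the
# functions are called in this file.
def _h_sketch(x, theta):
    # logistic hypothesis: sigmoid of <x, theta>
    return 1.0 / (1.0 + np.exp(-np.dot(x, theta)))


def _log_reg_sgd_sketch(x, y, a, max_iter=100, debug=True):
    theta = np.zeros(x.shape[1])
    err = []
    for _ in xrange(max_iter):
        for i in xrange(x.shape[0]):
            # one stochastic gradient step on example i
            delta = y[i] - _h_sketch(x[i], theta)
            theta += a * delta * x[i]
            if debug:
                err.append(abs(delta))
    return (theta, err) if debug else theta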
def first_test():
    b = BasicLogisticRegression(4)
    iris = datasets.load_iris()
    train_data = iris.data[:75]
    train_y = iris.target[:75]
    test_x = iris.data[75:100]
    # rows 75:100 of iris are all one class, so AUC would be undefined there;
    # shuffle a copy of the first 100 targets to get mixed sanity-check labels
    # (a copy, so train_y is not scrambled in place through the shared array)
    tmp = iris.target[:100].copy()
    random.shuffle(tmp)
    test_y = tmp[:len(test_x)]

    def to_dict(x):
        return {i: k for i, k in enumerate(x, start=1)}

    # random.shuffle shuffles in place and returns None, so shuffle the pairs
    # each epoch, then iterate
    pairs = zip(train_data, train_y)
    for z in xrange(50):
        random.shuffle(pairs)
        for x, y in pairs:
            b.sgd_fit_one(to_dict(x), y)
    print "fit done"
    rst_y = map(b.predict_raw, map(to_dict, test_x))
    print b.weights
    print test_y
    print rst_y
    print auc(test_y, rst_y)

    # another implementation (note: no bias column is appended here)
    theta, err = log_reg_sgd(train_data, train_y, 0.001, max_iter=100)
    pred = [h(i, theta) for i in test_x]
    print "theta,", theta
    print auc(test_y, pred)
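
# A minimal sketch of the BasicLogisticRegression interface exercised above;
# the real class is defined elsewhere in this repo. Assumed: sparse
# {feature_index: value} inputs with the bias weight kept at index 0, which is
# why to_dict numbers features from 1.
class _BasicLogisticRegressionSketch(object):
    def __init__(self, n_features, learning_rate=0.001):
        # weights[0] is the bias; features occupy indices 1..n_features
        self.weights = [0.0] * (n_features + 1)
        self.learning_rate = learning_rate

    def predict_raw(self, x):
        # x: {feature_index: value}; the bias gets an implicit input of 1.0
        z = self.weights[0] + sum(self.weights[i] * v for i, v in x.items())
        return 1.0 / (1.0 + np.exp(-z))

    def sgd_fit_one(self, x, y):
        # a single stochastic gradient step on one (features, label) pair
        delta = y - self.predict_raw(x)
        self.weights[0] += self.learning_rate * delta
        for i, v in x.items():
            self.weights[i] += self.learning_rate * delta * v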
def test_sgd():
    # number of examples
    EX = 500
    # learning rate
    a = 0.001
    # create a synthetic data set
    x, y = datasets.make_classification(EX)
    # append a 1s column at index 0 of x (bias term)
    x = np.hstack((np.ones((x.shape[0], 1)), x))
    # train on half the data
    theta, err = log_reg_sgd(x[:EX / 2], y[:EX / 2], a, max_iter=100)
    # plot the error as a function of training examples seen
    plt.plot(err, linewidth=2)
    plt.xlabel('Training example', fontsize=20)
    plt.ylabel('Error', fontsize=20)
    plt.show()
    # predict the held-out half
    pred = [h(x[i], theta) for i in xrange(EX / 2, EX)]
    # compute the ROC curve on the test set
    fpr, tpr, thresholds = metrics.roc_curve(y[EX / 2:], pred)
    # plot the ROC curve
    plt.plot(fpr, tpr, linewidth=2)
    plt.xlabel('False positive rate', fontsize=20)
    plt.ylabel('True positive rate', fontsize=20)
    plt.show()
    # measure the performance with AUC
    auc = metrics.auc(fpr, tpr)
    print 'AUC of classifier: ', auc
EX = 500
# learning rate
a = 0.001
max_iter = 10
# create a synthetic data set
x, y = datasets.make_classification(EX)
print "sample", x[251]
print "feature num ", x.shape[1]
# append a 1s column at index 0 of x (bias term)
x = np.hstack((np.ones((x.shape[0], 1)), x))
print x[251]

# debug=False so only the weights come back (matching train above)
theta = log_reg_sgd(x[:EX / 2], y[:EX / 2], a, max_iter=max_iter, debug=False)
pred = [h(x[i], theta) for i in xrange(EX / 2, EX)]
print "weights ", theta
print auc(y[EX / 2:], pred)


def to_dict(x):
    # skip the bias column: BasicLogisticRegression keeps its own bias weight
    # at index 0, so features are numbered from 1
    return {i: k for i, k in enumerate(x[1:], start=1)}


b = BasicLogisticRegression(x.shape[1] - 1, a)
for z in xrange(max_iter):
    for i in xrange(EX / 2):
        b.sgd_fit_one(to_dict(x[i]), y[i])
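
# The dict-based model above is fit but never scored; a natural follow-up,
# mirroring the theta evaluation (sketch, not in the original):
b_pred = [b.predict_raw(to_dict(x[i])) for i in xrange(EX / 2, EX)]
print auc(y[EX / 2:], b_pred)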
def test_parallel_sgd():
    # learning rate
    a = 0.001
    # create a synthetic data set: default features, 1500 examples, 2 classes
    x, y = datasets.make_classification(1500)
    # append a 1s column at index 0 of x (bias term)
    x = np.hstack((np.ones((x.shape[0], 1)), x))

    ### PARALLEL VERSION ###
    # worker pool; each worker trains on a disjoint quarter of the training data
    pool = Pool(4)
    inputs = [{'x': x[:250], 'y': y[:250], 'learning_rate': a, 'iters': 500},
              {'x': x[250:500], 'y': y[250:500], 'learning_rate': a, 'iters': 500},
              {'x': x[500:750], 'y': y[500:750], 'learning_rate': a, 'iters': 500},
              {'x': x[750:1000], 'y': y[750:1000], 'learning_rate': a, 'iters': 500}]
    thetas = pool.map(train, inputs)
    # average the per-worker weight vectors
    theta = np.mean(thetas, axis=0)
    # predict the test set with the averaged weights
    b_pred = [h(x[i], theta) for i in xrange(1000, 1500)]
    # compute the ROC curve on the test set
    b_fpr, b_tpr, thresholds = metrics.roc_curve(y[1000:], b_pred)
    # plot the ROC curve
    plt.plot(b_fpr, b_tpr, 'r-', label='Bagged', linewidth=2)
    # measure the performance using AUC
    b_auc = metrics.auc(b_fpr, b_tpr)
    print 'AUC of parallel classifier: ', b_auc

    ### SEQUENTIAL VERSION ###
    # train on half the data
    theta = log_reg_sgd(x[:1000], y[:1000], a, max_iter=500, debug=False)
    # predict the test set
    pred = [h(x[i], theta) for i in xrange(1000, 1500)]
    # compute the ROC curve on the test set
    fpr, tpr, thresholds = metrics.roc_curve(y[1000:], pred)
    # plot the ROC curve alongside the parallel one
    plt.plot(fpr, tpr, 'b-', label='Non-bagged', linewidth=2)
    plt.xlabel('False positive rate', fontsize=20)
    plt.ylabel('True positive rate', fontsize=20)
    plt.legend(loc=0)
    plt.show()
    # measure the performance using AUC
    auc = metrics.auc(fpr, tpr)
    print 'AUC of sequential classifier: ', auc

    # write both ROC curves to file for later
    with open('bag_results.tsv', 'w') as out:
        for i, j in zip(b_fpr, b_tpr):
            out.write("\t".join((str(i), str(j))) + "\n")
    with open('results.tsv', 'w') as out:
        for i, j in zip(fpr, tpr):
            out.write("\t".join((str(i), str(j))) + "\n")
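
# Hypothetical entry point (not in the original): the multiprocessing Pool in
# test_parallel_sgd needs the __main__ guard on platforms that spawn workers.
if __name__ == '__main__':
    test_sgd()
    test_parallel_sgd()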