def first_test():
    from ml_metrics import auc
    import random
    from sklearn import datasets

    b = BasicLogisticRegression(4)

    # iris is sorted by class, so shuffle a joint index before splitting;
    # keep only the first two classes (samples 0..99) so the task is binary
    iris = datasets.load_iris()
    idx = range(100)
    random.shuffle(idx)
    train_data = iris.data[idx[:75]]
    train_y = iris.target[idx[:75]]

    test_x = iris.data[idx[75:]]
    test_y = iris.target[idx[75:]]

    def to_dict(x):
        return {i: k for i, k in enumerate(x, start=1)}

    for z in xrange(50):
        # random.shuffle mutates in place and returns None, so shuffle a
        # list first instead of iterating over the return value
        order = zip(train_data, train_y)
        random.shuffle(order)
        for x, y in order:
            b.sgd_fit_one(to_dict(x), y)
    print "fit done"

    rst_y = map(b.predict_raw, map(to_dict, test_x))
    print b.weights
    print test_y
    print rst_y
    print auc(test_y, rst_y)
    # print len(iris.data)

    # another implementation
    from sgd import log_reg_sgd, h

    theta, err = log_reg_sgd(train_data, train_y, 0.001, max_iter=100)
    pred = [h(i, theta) for i in test_x]
    print "theta,", theta
    print auc(test_y, pred)
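`BasicLogisticRegression` itself is not shown in these snippets. Below is a minimal sketch of what its dict-based interface is assumed to look like, given how `sgd_fit_one`, `predict_raw`, and `weights` are used above; the project's real class may differ.

import math
from collections import defaultdict

class BasicLogisticRegression(object):
    """Sketch only: sparse {index: value} features, one SGD update per call."""

    def __init__(self, dim, learning_rate=0.01):
        self.dim = dim                      # feature count, kept for API parity
        self.learning_rate = learning_rate
        self.weights = defaultdict(float)   # slot 0 holds the bias

    def predict_raw(self, x):
        # sigmoid(bias + w . x) over a sparse feature dict
        z = self.weights[0] + sum(self.weights[i] * v for i, v in x.items())
        return 1.0 / (1.0 + math.exp(-z))

    def sgd_fit_one(self, x, y):
        # logistic-loss gradient step: w += a * (y - p) * x
        err = y - self.predict_raw(x)
        self.weights[0] += self.learning_rate * err
        for i, v in x.items():
            self.weights[i] += self.learning_rate * err * v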
Example #2
def test_sgd():
    
    #number of examples
    EX = 500
    
    #learning rate
    a = 0.001
    
    #create a synthetic data set
    x,y = datasets.make_classification(EX)
    
    #append a 1 column at index 0 in x
    x = np.hstack((np.ones((x.shape[0],1)),x))
    
    #train on half the data
    theta,err = log_reg_sgd(x[:EX/2],y[:EX/2],a,max_iter=100)
    
    #plot the error
    plt.plot(err, linewidth=2)
    plt.xlabel('Training example', fontsize=20)
    plt.ylabel('Error', fontsize=20)
    plt.show()
    
    #predict the test set
    pred = [h(x[i],theta) for i in xrange(EX/2,EX)]
    
    #compute ROC points on the held-out half
    fpr, tpr, thresholds = metrics.roc_curve(y[EX/2:], pred)
    
    #plot the ROC curve
    plt.plot(fpr,tpr, linewidth=2)
    plt.xlabel('False positive rate', fontsize=20)
    plt.ylabel('True positive rate', fontsize=20)
    plt.show()
    
    #measure the performance using ROC and AUC
    auc = metrics.auc(fpr, tpr)
    
    print 'AUC of classifier: ', auc
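The `sgd` module's `h` and `log_reg_sgd` are used throughout but never shown. Here is a minimal sketch under the assumptions that `h` is the sigmoid hypothesis and `log_reg_sgd` returns the fitted weights plus a per-update error trace (matching the `theta, err` unpacking in the snippets); the `debug` flag seen in Example #5 is omitted.

import numpy as np

def h(x, theta):
    # logistic hypothesis: sigmoid(theta . x)
    return 1.0 / (1.0 + np.exp(-np.dot(x, theta)))

def log_reg_sgd(x, y, learning_rate, max_iter=100):
    # one pass over the examples per iteration, one update per example
    theta = np.zeros(x.shape[1])
    err = []
    for it in xrange(max_iter):
        for i in xrange(x.shape[0]):
            delta = y[i] - h(x[i], theta)
            theta += learning_rate * delta * x[i]
            err.append(abs(delta))
    return theta, err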
Example #3
def test_sgd():
    
    #number of examples
    EX = 500
    
    #learning rate
    a = 0.001
    
    #create a synthetic data set
    x,y = datasets.make_classification(EX)
    
    #append a 1 column at index 0 in x
    x = np.hstack((np.ones((x.shape[0],1)),x))
    
    #train on half the data
    theta,err = log_reg_sgd(x[:EX/2],y[:EX/2],a,max_iter=100)
    
    #plot the error
    plt.plot(err, linewidth=2)
    plt.xlabel('Training example', fontsize=20)
    plt.ylabel('Error', fontsize=20)
    plt.show()
    
    #predict the test set
    pred = [h(x[i],theta) for i in xrange(EX/2,EX)]
    
    #compute ROC points on the held-out half
    fpr, tpr, thresholds = metrics.roc_curve(y[EX/2:], pred)
    
    #plot the ROC curve
    plt.plot(fpr,tpr, linewidth=2)
    plt.xlabel('False positive rate', fontsize=20)
    plt.ylabel('True positive rate', fontsize=20)
    plt.show()
    
    #measure the performance using ROC and AUC
    auc = metrics.auc(fpr, tpr)
    
    print 'AUC of classifier: ', auc
    # rerun on a fresh synthetic set with fewer iterations to compare
    # the two implementations
    a = 0.001
    max_iter = 10
    x, y = datasets.make_classification(EX)
    print "sample", x[251]
    print "feature num ", x.shape[1]
    # append a 1 column at index 0 in x
    x = np.hstack((np.ones((x.shape[0], 1)), x))
    print x[251]
    from sgd import log_reg_sgd, h
    from ml_metrics import auc

    theta, err = log_reg_sgd(x[:EX / 2], y[:EX / 2], a, max_iter=max_iter)
    pred = [h(x[i], theta) for i in xrange(EX / 2, EX)]
    print "weights ", theta
    # print "err ", err
    print auc(y[EX / 2:], pred)


    def to_dict(x):
        # skip x[0], the bias column appended above
        return {i: k for i, k in enumerate(x[1:], start=1)}

    b = BasicLogisticRegression(x.shape[1] - 1, a)
    for z in xrange(max_iter):
        for i in xrange(EX / 2):
            b.sgd_fit_one(to_dict(x[i]), y[i])
    # evaluate the dict-based model on the held-out half as well
    rst = [b.predict_raw(to_dict(x[i])) for i in xrange(EX / 2, EX)]
    print auc(y[EX / 2:], rst)
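Note the difference from Example #1's `to_dict`: here the slice `x[1:]` drops the bias column hstack-ed in at index 0, presumably because `BasicLogisticRegression` keeps its own bias term. For a hypothetical row:

row = [1.0, 0.5, -2.0]                          # bias column first
{i: k for i, k in enumerate(row, start=1)}      # Example #1 style: {1: 1.0, 2: 0.5, 3: -2.0}
{i: k for i, k in enumerate(row[1:], start=1)}  # Example #3 style: {1: 0.5, 2: -2.0}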
Example #5
def test_parallel_sgd():
    #learning rate
    a = 0.001
    
    #create a synthetic data set, default features, 1500 examples, 2 classes
    x,y = datasets.make_classification(1500)
    
    #append a 1 column at index 0 in x
    x = np.hstack((np.ones((x.shape[0],1)),x))
    
    ### PARALLEL VERSION ###
    #worker pool
    pool = Pool(4)
    
    inputs = [{'x': x[:250], 'y': y[:250], 'learning_rate': a, 'iters': 500},
              {'x': x[250:500], 'y': y[250:500], 'learning_rate': a, 'iters': 500},
              {'x': x[500:750], 'y': y[500:750], 'learning_rate': a, 'iters': 500},
              {'x': x[750:1000], 'y': y[750:1000], 'learning_rate': a, 'iters': 500}]
    
    thetas = pool.map(train, inputs)
    
    #average the four shard models into a single theta
    theta = np.mean(thetas, axis=0)
    
    #predict the held-out 500 examples with the averaged model
    b_pred = [h(x[i], theta) for i in xrange(1000, 1500)]
    
    #compute ROC points on the held-out examples
    b_fpr, b_tpr, thresholds = metrics.roc_curve(y[1000:], b_pred)
    
    #plot the ROC curve
    plt.plot(b_fpr,b_tpr, 'r-', label='Bagged', linewidth=2)
    
    
    #measure the performance using ROC and AUC
    b_auc = metrics.auc(b_fpr, b_tpr)
    
    print 'AUC of parallel classifier: ', b_auc
    ###
        
    ### SEQUENTIAL VERSION ###
    #train on half the data
    theta = log_reg_sgd(x[:1000],y[:1000],a,max_iter=500,debug=False)
    
    #predict the test set
    pred = [h(x[i],theta) for i in xrange(1000,1500)]
    
    #compute ROC points on the held-out examples
    fpr, tpr, thresholds = metrics.roc_curve(y[1000:], pred)
    
    #plot the ROC curve
    plt.plot(fpr,tpr, 'b-', label='Non-bagged', linewidth=2)
    plt.xlabel('False positive rate', fontsize=20)
    plt.ylabel('True positive rate', fontsize=20)
    plt.legend(loc=0)
    plt.show()
    
    #measure the performance using ROC and AUC
    auc = metrics.auc(fpr, tpr)
    
    print 'AUC of sequential classifier: ', auc
    
    #write the bagged results to file for later
    with open('bag_results.tsv','w') as out:
        for i,j in zip(b_fpr,b_tpr):
            out.write("\t".join((str(i),str(j)))+"\n")
    
    
    #write the sequential results to file as well
    with open('results.tsv','w') as out:
        for i,j in zip(fpr,tpr):
            out.write("\t".join((str(i),str(j)))+"\n")
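`train`, the function handed to `Pool.map`, is not shown in the snippet. Below is a minimal sketch matching the dict keys built above, assuming it simply fits one shard with `log_reg_sgd` and returns the weights (it must live at module level so multiprocessing can pickle it).

def train(args):
    # fit one shard; assumes log_reg_sgd returns (theta, err) as in Example #2
    theta, err = log_reg_sgd(args['x'], args['y'],
                             args['learning_rate'],
                             max_iter=args['iters'])
    return theta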