Example #1
def plotCostHistory(X, y, standardization=False, addPolyFeats=False, degree=5,
                    iterations=400000, learningRate=1e-3, regularization=0.0):
    
    lr = r.LogisticRegression(X, y, standardization=standardization, 
                              addPolyFeats=addPolyFeats, degree=degree)
    
    costHistory = lr.gradientDescent(iterations=iterations, 
                                     learningRate=learningRate, 
                                     regularization=regularization)
    
    # trained parameters without polynomial features:
    
    # non-standardized, after 3,000,000 iterations:
    # lr.theta = np.array([-21.06746245, 0.17350979, 0.16833432])
    
    # standardized, after 400,000 iterations:
    # lr.theta = np.array([1.65840542, 3.86476728, 3.60126676])
    
    plt.figure()
    ax = plt.gca()
    plt.subplots_adjust(top=0.98, bottom=0.13, left=0.13, right=0.98)
    plt.rcParams.update({"font.size": 18})
    plt.plot(np.arange(1, iterations+1), costHistory, color="black")
    plt.text(0.4, 0.9, "learning rate = " + str(learningRate), 
             transform=ax.transAxes)
    plt.text(0.4, 0.8, "regularization = " + str(regularization), 
             transform=ax.transAxes)
    plt.xlabel("number of iterations")
    plt.ylabel(r"cost function $J$")
    plt.ticklabel_format(axis="x", style="sci", scilimits=(0, 0))
    plt.grid(color="lightgray")
    plt.show()
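The `r.LogisticRegression` class itself is not part of this listing. As a rough guide to what `gradientDescent` presumably computes on every iteration, here is a minimal sketch of the regularized logistic-regression cost and update step; the attribute name `theta` comes from the comments above, while the function names and the leading-bias-column convention are assumptions:

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def cost(theta, Xb, y, lam):
    # Xb: design matrix with a leading column of ones, shape (m, n+1)
    m = len(y)
    h = sigmoid(Xb @ theta)
    eps = 1e-12  # guard against log(0)
    J = -(y @ np.log(h + eps) + (1 - y) @ np.log(1 - h + eps)) / m
    return J + lam / (2 * m) * np.sum(theta[1:] ** 2)  # bias not regularized

def gradientStep(theta, Xb, y, alpha, lam):
    m = len(y)
    grad = Xb.T @ (sigmoid(Xb @ theta) - y) / m
    grad = grad + lam / m * np.r_[0.0, theta[1:]]  # skip bias in the penalty
    return theta - alpha * grad

Appending the value of `cost` after each `gradientStep` would produce exactly the kind of `costHistory` array the plot above consumes.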
Example #2
def plotCostHistory(X,
                    y,
                    standardization=False,
                    addPolyFeats=False,
                    degree=5,
                    iterations=400000,
                    learningRate=0.1,
                    regularization=0.0):

    lr = r.LogisticRegression(X,
                              y,
                              standardization=standardization,
                              addPolyFeats=addPolyFeats,
                              degree=degree)

    costHistory = lr.gradientDescent(iterations=iterations,
                                     learningRate=learningRate,
                                     regularization=regularization)

    plt.figure()
    ax = plt.gca()
    plt.subplots_adjust(top=0.98, bottom=0.13, left=0.14, right=0.98)
    plt.rcParams.update({"font.size": 18})
    plt.plot(np.arange(1, iterations + 1), costHistory, color="black")
    plt.text(0.4,
             0.9,
             "learning rate = " + str(learningRate),
             transform=ax.transAxes)
    plt.text(0.4,
             0.8,
             "regularization = " + str(regularization),
             transform=ax.transAxes)
    plt.xlabel("number of iterations")
    plt.ylabel(r"cost function $J$")
    plt.ticklabel_format(axis="x", style="sci", scilimits=(0, 0))
    plt.grid(color="lightgray")
    plt.show()
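A hypothetical call, for context; the file name `exam_scores.txt` and its comma-separated `score1,score2,label` layout are assumptions, not part of the original:

data = np.loadtxt("exam_scores.txt", delimiter=",")  # hypothetical file
X, y = data[:, :2], data[:, 2]
plotCostHistory(X, y, standardization=True, learningRate=0.1)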
Example #3
def plotData(X,
             y,
             standardization=False,
             addPolyFeats=False,
             degree=5,
             iterations=400000,
             learningRate=0.1,
             regularization=0.0):

    Xfailed = X[y == 0]   # samples that failed the test
    Xpassed = X[y == 1]   # samples that passed

    lr = r.LogisticRegression(X,
                              y,
                              standardization=standardization,
                              addPolyFeats=addPolyFeats,
                              degree=degree)

    lr.gradientDescent(iterations=iterations,
                       learningRate=learningRate,
                       regularization=regularization)

    lower = -1.2
    upper = +1.2
    step = 0.01
    test1 = np.arange(lower, upper + 1e-2, step)
    test2 = np.arange(lower, upper + 1e-2, step)
    extent = np.array([lower, upper + 1e-2, lower, upper + 1e-2])
    scores = np.array(np.meshgrid(test1, test2)).T.reshape(-1, 2)

    if addPolyFeats:
        scores = lr.addPolynomialFeatures(scores, degree=degree)

    # 241 grid points per axis: len(np.arange(-1.2, 1.2 + 1e-2, 0.01))
    prediction = lr.predict(scores).reshape(241, 241)
    boundaryBool = np.logical_and(prediction > 0.48, prediction < 0.52)
    boundary = np.ma.masked_where(boundaryBool, prediction)

    plt.figure()
    plt.subplots_adjust(top=0.98, bottom=0.14, left=0.08, right=0.97)
    plt.rcParams.update({"font.size": 18})
    plt.scatter(Xpassed[:, 0],
                Xpassed[:, 1],
                color="darkblue",
                marker="x",
                label="passed",
                zorder=2)
    plt.scatter(Xfailed[:, 0],
                Xfailed[:, 1],
                color="red",
                marker="x",
                label="failed",
                zorder=2)
    # dummy point outside the axes, only to get a legend entry for the boundary
    plt.scatter(2, 2, color="white", marker="o", label="boundary")
    admission = plt.imshow(boundary,
                           extent=extent,
                           origin="lower",
                           cmap="coolwarm_r",
                           vmin=0.0,
                           vmax=1.0)
    cb = plt.colorbar(admission)
    cb.set_label("probability to pass the test", fontsize=18)
    plt.xlim(-1.2, 1.2)
    plt.ylim(-1.2, 1.2)
    plt.xlabel("microchip test 1")
    plt.ylabel("microchip test 2")
    plt.legend(loc="lower left", fontsize=15)
    plt.grid(color="darkgray")
    plt.show()
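`addPolynomialFeatures` is a method of the `LogisticRegression` class and is not shown in this listing. One common mapping for two features (the one usually applied to this microchip data) expands them into every monomial up to the given degree; a standalone sketch under that assumption:

def addPolynomialFeatures(X, degree=5):
    # expand (x1, x2) into every monomial x1**i * x2**j with 1 <= i + j <= degree
    x1, x2 = X[:, 0], X[:, 1]
    cols = []
    for total in range(1, degree + 1):
        for j in range(total + 1):
            cols.append(x1 ** (total - j) * x2 ** j)
    return np.column_stack(cols)

With degree=5 this turns the (m, 2) score grid into an (m, 20) feature matrix, which is why the trained model can draw a curved boundary.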
Example #4
def plotData(X, y, standardization=False, addPolyFeats=False, degree=5,
             iterations=400000, learningRate=1e-3, regularization=0.0):
    
    Xfailed = X[y == 0]   # students who failed
    Xpassed = X[y == 1]   # students who passed
    
    lr = r.LogisticRegression(X, y, standardization=standardization, 
                              addPolyFeats=addPolyFeats, degree=degree)
    
    lr.gradientDescent(iterations=iterations, learningRate=learningRate, 
                       regularization=regularization)
    
    # trained parameters without polynomial features:
    
    # non-standardized, after 3,000,000 iterations:
    # lr.theta = np.array([-21.06746245, 0.17350979, 0.16833432])
    
    # standardized, after 400,000 iterations:
    # lr.theta = np.array([1.65840542, 3.86476728, 3.60126676])
    
    lower  = 0
    upper  = 101
    step   = 0.5
    exam1  = np.arange(lower, upper, step)
    exam2  = np.arange(lower, upper, step)
    extent = np.array([lower, upper + 1e-2, lower, upper + 1e-2])
    scores = np.array(np.meshgrid(exam1, exam2)).T.reshape(-1, 2)
    
    if addPolyFeats:
        scores = lr.addPolynomialFeatures(scores, degree=degree)
    
    # 202 grid points per axis: len(np.arange(0, 101, 0.5))
    prediction   = lr.predict(scores).reshape(202, 202)
    boundaryBool = np.logical_and(prediction > 0.47, prediction < 0.53)
    boundary     = np.ma.masked_where(boundaryBool, prediction)
    
    if not addPolyFeats:
        
        x_bounds = np.array([np.min(X[:, 0]), np.max(X[:, 0])])
        
        if standardization:
            # boundary of the standardized model, mapped back to raw scores
            y_bounds = -(lr.theta[1] * (x_bounds - lr.mu) / lr.sigma \
                         + lr.theta[0]) / lr.theta[2]
            y_bounds = y_bounds * lr.sigma + lr.mu
        
        else:
            y_bounds = -(lr.theta[1] * x_bounds + lr.theta[0]) / lr.theta[2]
    
    plt.figure()
    plt.subplots_adjust(top=0.98, bottom=0.14, left=0.05, right=0.97)
    plt.rcParams.update({"font.size": 18})
    passed = plt.scatter(Xpassed[:, 0], Xpassed[:, 1], color="darkblue", 
                         marker="x", zorder=2)
    failed = plt.scatter(Xfailed[:, 0], Xfailed[:, 1], color="red", 
                         marker="x", zorder=2)
    
    if not addPolyFeats:
        bounds,   = plt.plot(x_bounds, y_bounds, color="black")
        admission = plt.imshow(prediction, extent=extent, origin="lower", 
                               cmap="coolwarm_r", vmin=0.0, vmax=1.0)
    
    else:
        # dummy point outside the axes, only to get a legend entry
        bounds    = plt.scatter(-1, -1, marker="o", color="white")
        admission = plt.imshow(boundary, extent=extent, origin="lower", 
                               cmap="coolwarm_r", vmin=0.0, vmax=1.0)
    
    cb = plt.colorbar(admission)
    cb.set_label("admission probability", fontsize=18)
    plt.xlim(0, 100)
    plt.ylim(0, 100)
    plt.xlabel("exam 1 score")
    plt.ylabel("exam 2 score")
    plt.legend([passed, failed, bounds], 
               ["passed", "failed", "decision boundary"], loc="lower left")
    plt.grid(color="lightgray")
    plt.show()
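The straight boundary drawn in the non-polynomial branch follows from setting the model's argument to zero: sigmoid(theta0 + theta1*x1 + theta2*x2) = 0.5 exactly when theta0 + theta1*x1 + theta2*x2 = 0, so x2 = -(theta0 + theta1*x1) / theta2, which is what the code evaluates at the two extreme x1 values. A quick standalone check with the non-standardized parameters quoted in the comments above (the x1 range is an arbitrary choice):

theta = np.array([-21.06746245, 0.17350979, 0.16833432])
x1 = np.array([30.0, 100.0])                 # exam 1 score range (assumed)
x2 = -(theta[0] + theta[1] * x1) / theta[2]  # exam 2 score on the boundary
print(x2)                                    # roughly [94.2, 22.1]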
Example #5
    # bag-of-words features with TF-IDF normalization
    vectorizer = features.TfidfVectorizer(min_df=1)
    # build the sparse document-term feature matrix
    X = vectorizer.fit_transform(corpus)
    # set y to the target labels (node ids)
    y = [d['id'] for d in dataset]

    # split test/train (sklearn.cross_validation is gone; use model_selection)
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        X, y, test_size=0.3, random_state=42)
    assert set(y_train) == set(
        y_test
    ), 'Not all labels are in both test and train. Try different random seed'

    # load regression
    clf = regression.LogisticRegression()
    print('running regression...')
    clf.fit(X_train, y_train)

    print("Training Score = %.4f" % clf.score(X_train, y_train))
    print("Test Score = %.4f" % clf.score(X_test, y_test))

    # generate confusion matrix
    y_pred = clf.predict(X_test)
    cm = sklearn.metrics.confusion_matrix(y_test, y_pred)

    # normalize along the row
    row_sums = cm.sum(axis=1)
    cm_normalized = 1.0 * cm / row_sums[:, np.newaxis]

    # plot confusion matrix
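The snippet cuts off at the plotting step. A minimal sketch of how the row-normalized matrix might be rendered, assuming `matplotlib.pyplot` is imported as `plt` in the surrounding file:

fig, ax = plt.subplots()
im = ax.imshow(cm_normalized, interpolation="nearest", cmap="Blues",
               vmin=0.0, vmax=1.0)
fig.colorbar(im, ax=ax, label="fraction of true class")
ax.set_xlabel("predicted label")
ax.set_ylabel("true label")
plt.show()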
Example #6
def learn_cross_domain(master_source,
                       external_sources,
                       corpus,
                       y,
                       max_examples=1000):
    # Fit a feature extractor on master_source to learn its vocabulary
    master_vectorizer = features.TfidfVectorizer(min_df=1)
    master_vectorizer.fit(corpus[master_source])

    # Second vectorizer restricted to the master vocabulary, so every
    # source yields feature columns the classifier has been trained on
    vectorizer = features.TfidfVectorizer(
        min_df=1, vocabulary=master_vectorizer.vocabulary_.keys())

    # Create feature matrix for master_source
    X = {}
    X[master_source] = vectorizer.fit_transform(corpus[master_source])

    # split test/train for master_source
    random_state = 42
    while True:
        X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
            X[master_source],
            y[master_source],
            test_size=0.3,
            random_state=random_state)
        if set(y_train).issuperset(set(y_test)):
            break
        random_state += 1
        print('Training labels are not a superset of the testing labels. '
              'Trying different random seed: %d' % random_state)

    # Create regression and train on master_source training set
    clf = regression.LogisticRegression()
    print('running regression...')
    clf.fit(X_train, y_train)

    # Report train and test scores for master_source
    print("Training score for %s on %s = %.4f" % (master_source, master_source,
                                                  clf.score(X_train, y_train)))
    print("Test score for %s on %s = %.4f" % (master_source, master_source,
                                              clf.score(X_test, y_test)))

    # Deal with external sources
    for source in external_sources:
        # The shared vectorizer is restricted to the master vocabulary, so
        # external sources produce feature columns the classifier has seen.
        # transform() (not fit_transform) reuses the IDF weights learned on
        # master_source, keeping features consistent across sources.
        X[source] = vectorizer.transform(corpus[source])

        # Report the scores
        print("%s trained on %s score = %.4f" % (
            source, master_source, clf.score(X[source], y[source])))

    # plotting
    with PdfPages('plots_trained_on_%s.pdf' % master_source) as pp:
        for fignum, external_source in enumerate([master_source] +
                                                 external_sources):
            # generate confusion matrix
            y_pred = clf.predict(X[external_source])
            y_true = y[external_source]
            generate_confusion_matrix_plot(master_source, external_source,
                                           y_pred, y_true, pp, fignum)
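`generate_confusion_matrix_plot` is defined elsewhere in the source file; only its call signature is visible here. One plausible implementation consistent with that signature (the body below is an assumption, not the author's code), reusing the row normalization from Example #5 and assuming `matplotlib.pyplot` is imported as `plt`:

def generate_confusion_matrix_plot(master_source, external_source,
                                   y_pred, y_true, pp, fignum):
    cm = sklearn.metrics.confusion_matrix(y_true, y_pred)
    cm = cm / cm.sum(axis=1, keepdims=True)   # normalize along the row
    fig = plt.figure(fignum)
    plt.imshow(cm, interpolation="nearest", cmap="Blues", vmin=0.0, vmax=1.0)
    plt.colorbar()
    plt.title("%s trained on %s" % (external_source, master_source))
    plt.xlabel("predicted label")
    plt.ylabel("true label")
    pp.savefig(fig)                           # append this page to the PDF
    plt.close(fig)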