Code Example #1
File: TitanicLogReg.py  Project: mwbinger/Titanic
def titanlogregCV(alpha, niter, lam, interactions, dataset=dba):
    """
    Fits logistic regression on the training split and returns the score on the
    CV split, for the given learning rate alpha, iteration count niter,
    regularization strength lam, and list of interaction terms.
    """
    tempds = logreg_prepdata(dataset, interactionlist=interactions)[0]  # prepare the data, yielding an (m, 13 + Nint) array
    tempds = Datasets(tempds)  # make the data a class instance, so as to segregate it by train, cv, and test
    trainds = tempds.train()  # pull out the training data
    y = trainds[0::, 0]
    x = trainds[0::, 1::]
    graddes = gradientdescent(x, y, alpha, niter, lam, logreg=True)
    thetapreds = graddes[0]  # predicted values of thetas from grad des
    cvds = tempds.cv()  # pull up the CV data to test our prediction on
    cvpreds = surpred(cvds[0::, 1::], thetapreds)  # generate the predicted survivals
    cvscore = scorepreds(cvds, cvpreds)  # compare the predictions to true results for CV
    return cvscore
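None of the helpers used here (gradientdescent, surpred, scorepreds, logreg_prepdata, Datasets) are shown in these excerpts. For orientation, here is a minimal sketch of what gradientdescent might look like, inferred only from how it is called in these examples (design matrix with a leading column of ones, returns a (theta, Jsteps) pair, lam acts as an L2 penalty, logreg switches on the logistic cost); the project's actual implementation may differ.

import numpy as np

def gradientdescent(x, y, alpha, niter, lam, logreg=False):
    # Sketch, not the project's code. x: (m, n+1) design matrix whose first
    # column is all ones; y: (m,) or (m, 1) targets. Returns (theta, Jsteps).
    m, ncols = np.shape(x)
    y = np.asarray(y).reshape(m, 1)
    theta = np.zeros([ncols, 1])
    Jsteps = np.zeros(niter)
    for it in range(niter):
        z = np.dot(x, theta)
        h = 1.0 / (1.0 + np.exp(-z)) if logreg else z  # logistic vs. linear hypothesis
        err = h - y
        reg = lam * np.vstack([[0.0], theta[1:]])  # leave the intercept unpenalized
        theta = theta - (alpha / m) * (np.dot(x.T, err) + reg)
        if logreg:
            J = -np.mean(y * np.log(h) + (1 - y) * np.log(1 - h))  # cross-entropy cost
        else:
            J = np.sum(err ** 2) / (2.0 * m)  # squared-error cost
        Jsteps[it] = J + (lam / (2.0 * m)) * np.sum(theta[1:] ** 2)
    return theta.flatten(), Jsteps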
Code Example #2
def logreg_sex_and_class():
    """
    Logistic regression using both sex and class as independent feature vars.
    Assumes module-level globals: data, start, and pylab-style plotting imports.
    """
    y = data[::,0]
    m = np.size(y)
    y = y.reshape([m,1])

    xs = data[::, 2].reshape([m,1])
    xcl = data[::, 1].reshape([m,1])
    xcl1 = (xcl == 1).astype(int) # convert class into two separate binary vars. xcl1=1 only if 1st class, 0 else
    xcl2 = (xcl == 2).astype(int) # xcl2=1 only if 2nd class, 0 else
    xcl3 = (xcl == 3).astype(int) # not used

    xones = np.ones([m,1])
    print(np.shape(xcl2), np.shape(xs))
    x1int = ((xs.reshape(m))*(xcl1.reshape(m))).reshape([m,1]) # include interaction terms sex*cl1 and sex*cl2
    x2int = ((xs.reshape(m))*(xcl2.reshape(m))).reshape([m,1])

    x = np.hstack([xones, xcl1, xcl2, xs, x1int, x2int]) # full x array

    alpha = 0.1
    niter = 40000
    lam = 0
    graddes = gradientdescent(x,y,alpha,niter,lam, logreg = True)

    thetapred = graddes[0]
    Jsteps = graddes[1]
    print "prediction for theta:", thetapred
    #print Jsteps

    scatter(np.arange(niter)+1,Jsteps)
    xlabel("Number of iterations")
    ylabel("Jcost")
    title("The convergence of the cost function")
    show()

    for cl in [1,2,3]: # generates the predicted survival table
        cl1 = int(cl == 1)
        cl2 = int(cl == 2)
        cl3 = int(cl == 3)
        for sex in [0,1]:
            print "class=", cl, "and sex=", sex
            xx = np.array([1,cl1,cl2,sex, cl1*sex, cl2*sex])
            print glog(np.dot(thetapred, xx))
    print "Time elapsed:", time() - start
Code Example #3
def interactionterms():
    """
    Runs gradient descent with one quadratic interaction term at a time (plus
    all linear terms) and compares the result to the purely linear case.
    Uses intlist, xlindba, m, y, and scorelin from the enclosing scope.
    """
    posints = []
    for (i, j) in intlist:
        xint = (xlindba[:, i] * xlindba[:, j]).reshape(m, 1)
        xlin1int = np.hstack([xlindba, xint])
        graddes = gradientdescent(xlin1int, y, 0.3, 10000, 0, logreg=True)
        thetapred = graddes[0]
        pred = surpred(xlin1int, thetapred)
        scoreint = predicttrain(pred)
        dif = scoreint - scorelin
        print((i, j), "  ", scoreint, "  ", round(dif * m))
        if dif > 0:
            posints.append((i, j))

    print(posints)
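surpred and predicttrain are likewise assumed from context: surpred apparently thresholds the logistic hypothesis at 0.5 to get 0/1 survival predictions, and predicttrain scores predictions against the training labels. A sketch (the labels are passed explicitly here; the original presumably reads them from enclosing scope):

import numpy as np

def surpred(x, theta):
    # 0/1 survival predictions: hypothesis >= 0.5 means predicted survivor
    prob = 1.0 / (1.0 + np.exp(-np.dot(x, theta)))
    return (prob >= 0.5).astype(int)

def predicttrain(pred, ytrue):
    # fraction of examples predicted correctly
    return np.mean(pred == np.asarray(ytrue).flatten())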
Code Example #4
def farelogreg():
    """
    This runs logistic regression using only the fare variable as the predictor for y = survival.
    """
    #datass = dfrange(20, 100, 3, df([[3,1],[1,2],[0,7]],db))
    datass = db
    y = datass[::,0]
    m = np.size(y)
    y = y.reshape([m,1])

    x6 = datass[::, 6].reshape([m,1])
    xones = np.ones([m,1])
    x = np.hstack([xones,x6])

    fs = featurescale(x)
    xfs = fs[0]
    means = fs[1]
    stds = fs[2]

    alpha = 0.2
    niter = 1000
    lam = 0

    scatter(x6,y)
    show()

    graddes = gradientdescent(xfs,y,alpha,niter,lam, logreg = True)

    thetapred = graddes[0]
    Jsteps = graddes[1]
    print "prediction for theta:", thetapred
    print means, stds

    scatter(np.arange(niter)+1,Jsteps)
    xlabel("Number of iterations")
    ylabel("Jcost")
    title("The convergence of the cost function")
    show()

    print "Time elapsed:", time() - start
Code Example #5
def logreg_sexonly():
    """
    Logistic regression using only sex as the independent predictor var.
    """
    y = data[::,0]
    m = np.size(y)
    y = y.reshape([m,1])
    x2 = data[::,2].reshape([m,1])

    xones = np.ones([m,1])
    print(np.shape(xones), np.shape(x2))
    x = np.hstack([xones,x2])

    alpha = 0.1
    niter = 10000
    lam = 0
    graddes = gradientdescent(x,y,alpha,niter,lam, logreg = True)

    thetapred = graddes[0]
    Jsteps = graddes[1]
    print "prediction for theta:", thetapred
    #print Jsteps

    scatter(np.arange(niter)+1,Jsteps)
    xlabel("Number of iterations")
    ylabel("Jcost")
    title("The convergence of the cost function")
    show()

    xf = np.array([1,1]) # X for females
    xm = np.array([1,0]) # X for males

    Pf = glog(np.dot(thetapred,xf)) # predicted survival probabilities for female and male
    Pm = glog(np.dot(thetapred,xm))

    print(Pf, Pm)

    print "Time elapsed:", time() - start
Code Example #6
def logreg_sex_class_city():
    """
    Incorporating sex, class, and city into the logistic regression.
    """
    y = data[::,0]
    m = np.size(y)
    y = y.reshape([m,1])

    xs = data[::, 2].reshape([m,1])
    xcl = data[::, 1].reshape([m,1])
    xem = data[::, 7].reshape([m,1])

    xcl1 = (xcl == 1).astype(int)
    xcl2 = (xcl == 2).astype(int)
    #xcl3 = (xcl == 3).astype(int)
    xem1 = (xem == 0).astype(int)
    xem2 = (xem == 1).astype(int)
    #xem3 = (xem == 2).astype(int)

    xones = np.ones([m,1])

    xscl1 = ((xs.reshape(m))*(xcl1.reshape(m))).reshape([m,1])
    xscl2 = ((xs.reshape(m))*(xcl2.reshape(m))).reshape([m,1])
    xsem1 = ((xs.reshape(m))*(xem1.reshape(m))).reshape([m,1])
    xsem2 = ((xs.reshape(m))*(xem2.reshape(m))).reshape([m,1])

    xcl1em1 = ((xcl1.reshape(m))*(xem1.reshape(m))).reshape([m,1])
    xcl2em1 = ((xcl2.reshape(m))*(xem1.reshape(m))).reshape([m,1])
    xcl1em2 = ((xcl1.reshape(m))*(xem2.reshape(m))).reshape([m,1])
    xcl2em2 = ((xcl2.reshape(m))*(xem2.reshape(m))).reshape([m,1])

    xscl1em1 = ((xs.reshape(m))*(xcl1.reshape(m))*(xem1.reshape(m))).reshape([m,1])
    xscl2em1 = ((xs.reshape(m))*(xcl2.reshape(m))*(xem1.reshape(m))).reshape([m,1])
    xscl1em2 = ((xs.reshape(m))*(xcl1.reshape(m))*(xem2.reshape(m))).reshape([m,1])
    xscl2em2 = ((xs.reshape(m))*(xcl2.reshape(m))*(xem2.reshape(m))).reshape([m,1])

    doubles = np.hstack([xscl1, xscl2, xsem1, xsem2, xcl1em1, xcl2em1, xcl1em2, xcl2em2]) #quadratic interactions
    triples = np.hstack([xscl1em1, xscl2em1, xscl1em2, xscl2em2]) #cubic interactions


    x = np.hstack([xones, xcl1, xcl2, xs, xem1, xem2, doubles, triples])
    # note that after running with the triples on and off there was virtually no difference in results...
    # perhaps only linear and quadratic terms are necessary?

    alpha = 0.3
    niter = 50000
    lam = 0
    graddes = gradientdescent(x,y,alpha,niter,lam, logreg = True)

    thetapred = graddes[0]
    Jsteps = graddes[1]
    print "prediction for theta:", thetapred
    #print Jsteps

    scatter(np.arange(niter)+1,Jsteps)
    xlabel("Number of iterations")
    ylabel("Jcost")
    title("The convergence of the cost function")
    show()

    for cl in [1,2,3]: # create predicted survival table
        cl1 = int(cl == 1)
        cl2 = int(cl == 2)
        cl3 = int(cl == 3)
        for em in [0,1,2]:
            em1 = int(em == 0)
            em2 = int(em == 1)
            em3 = int(em == 2)
            for sex in [0,1]:
                print "class=", cl, "and sex=", sex, "and emb=", em
                xx = np.array([1,cl1,cl2,sex, em1, em2, cl1*sex, cl2*sex, em1*sex, em2*sex,
                               cl1*em1, cl2*em1, cl1*em2, cl2*em2, sex*cl1*em1, sex*cl2*em1, sex*cl1*em2, sex*cl2*em2])
                print(glog(np.dot(thetapred, xx)))
    print("Time elapsed:", time() - start)
Code Example #7
# Continuation of a larger routine: xlindba, y, meansdba, and stdsdba come
# from logreg_prepdata(dba) applied to the training data earlier on.
tbaprep = logreg_prepdata(tba)  # prepare the test data the same way
xlintba = tbaprep[0]
meanstba = tbaprep[1]
stdstba = tbaprep[2]

print("means for age and fare, train data: ", meansdba)
print("means for age and fare, test data: ", meanstba)
print("stds for age and fare, train data: ", stdsdba)
print("stds for age and fare, test data: ", stdstba)

alpha = 0.3
niter = 10000
lam = 0

graddes = gradientdescent(xlindba, y, alpha, niter, lam, logreg=True)
thetapredlin = graddes[0]
Jsteps = graddes[1]
print("prediction for theta with only linear terms:")
print(thetapredlin)

#scatter(np.arange(niter)+1,Jsteps)
#xlabel("Number of iterations")
#ylabel("Jcost")
#title("The convergence of the cost function")
#show()

predlin = surpred(xlindba, thetapredlin)
scorelin = predicttrain(predlin)
print("scorelin", scorelin)
Code Example #8
print(x)  # this is our (m, 6) feature array: an intercept column plus the features

n = np.shape(x)[1]-1

# Now feature scale
fs = featurescale(x)
xfs = fs[0]
means = fs[1]
stds = fs[2]

#Run grad des
alpha = 0.1
niter = 1000
lam = 0

graddes = gradientdescent(xfs,y,alpha,niter,lam)
thetapred = graddes[0]
Jsteps = graddes[1]
print "prediction for theta:", thetapred
#print Jsteps

scatter(np.arange(niter)+1,Jsteps)
xlabel("Number of iterations")
ylabel("Jcost")
title("The convergence of the cost function")
show()

X,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10 = sympy.symbols('X,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10')
XN = [1,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10]
# Creates algebraic symbols to represent the final prediction.
# Extend this manually if you want more than 10 features.
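Completing the comment's thought: substituting the scaled symbols into the fitted linear form gives the prediction as an explicit algebraic formula. A sketch, assuming thetapred has n + 1 entries and means/stds cover the n non-intercept columns as above:

import numpy as np
import sympy

syms = sympy.symbols('X1:{}'.format(n + 1))  # X1 .. Xn
Xscaled = [1] + [(syms[i] - means[i]) / stds[i] for i in range(n)]
prediction = np.dot(thetapred, np.array(Xscaled, dtype=object))
print(prediction)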
Code Example #9
# This is just to convince yourself that the cost function is defined properly for log reg
#theta = (np.array([-1,0.05])).reshape(2,1)
#print np.shape(theta)
#h = glog(np.dot(xx,theta))
#print "h", np.shape(h)
#print "log(h)", np.shape(np.log(h))
#costi = y*np.log(h) + (1-y)*np.log(1-h)
#J = -(1/float(m)) * sum(costi)
#print J

alpha = 0.1
niter = 10000
lam = 0

# Perform gradient descent
graddes = gradientdescent(xfs, y, alpha, niter, lam, logreg = True)
thetapred = graddes[0]
Jsteps = graddes[1]
print "prediction for theta:", thetapred

# Verify convergence
scatter(np.arange(niter)+1,Jsteps)
xlabel("Number of iterations")
ylabel("Jcost")
title("The convergence of the cost function")
show()

# Write hypothesis
X1 = sympy.symbols('X1')
X1 = (X1 - means[0])/stds[0]
XN = np.array([1,X1])
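The single-feature version can be finished the same way: dotting the fitted thetas into XN gives the linear form in the scaled symbol, and wrapping it in the logistic function yields survival probability as an explicit formula in the raw X1 (a sketch; thetapred, means, and stds come from the run above):

import numpy as np
import sympy

z = np.dot(thetapred, XN)  # theta0 + theta1 * (X1 - mean) / std, symbolically
h = 1 / (1 + sympy.exp(-z))  # fitted survival probability as a formula in X1
print(sympy.simplify(h))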