def logreg_sex_and_class():
    """
    Logistic Regression using both sex and class as independent feature vars
    """
    y = data[::,0]
    m = np.size(y)
    y = y.reshape([m,1])

    xs = data[::, 2].reshape([m,1])
    xcl = data[::, 1].reshape([m,1])
    xcl1 = (xcl[::] == 1).astype(int) # convert class into two separate binary vars. xcl1=1 only if 1st class. 0 else
    xcl2 = int(xcl[::] == 2) # xcl2=1 only if 2nd class. 0 else
    xcl3 = int(xcl == 3) # not used

    xones = np.ones([m,1])
    print np.shape(xcl2), np.shape(xs)
    x1int = ((xs.reshape(m))*(xcl1.reshape(m))).reshape([m,1]) # include interaction terms sex*cl1 and sex*cl2
    x2int = ((xs.reshape(m))*(xcl2.reshape(m))).reshape([m,1])

    x = np.hstack([xones, xcl1, xcl2, xs, x1int, x2int]) # full x array

    alpha = 0.1
    niter = 40000
    lam = 0
    graddes = gradientdescent(x,y,alpha,niter,lam, logreg = True)

    thetapred = graddes[0]
    Jsteps = graddes[1]
    print "prediction for theta:", thetapred
    #print Jsteps

    scatter(np.arange(niter)+1,Jsteps)
    xlabel("Number of iterations")
    ylabel("Jcost")
    title("The convergence of the cost function")
    show()

    for cl in [1,2,3]: # generates the predicted survival table
        cl1 = int(cl == 1)
        cl2 = int(cl == 2)
        cl3 = int(cl == 3)
        for sex in [0,1]:
            print "class=", cl, "and sex=", sex
            xx = np.array([1,cl1,cl2,sex, cl1*sex, cl2*sex])
            print glog(np.dot(thetapred, xx))
    print "Time elapsed:", time() - start
def logreg_sexonly():
    """
    Here we implement logistic regression using only sex as the independent predictor var
    """
    y = data[::,0]
    m = np.size(y)
    y = y.reshape([m,1])
    x2 = data[::,2].reshape([m,1])

    xones = np.ones([m,1])
    print np.shape(xones), np.shape(x2)
    x = np.hstack([xones,x2])

    alpha = 0.1
    niter = 10000
    lam = 0
    graddes = gradientdescent(x,y,alpha,niter,lam, logreg = True)

    thetapred = graddes[0]
    Jsteps = graddes[1]
    print "prediction for theta:", thetapred
    #print Jsteps

    scatter(np.arange(niter)+1,Jsteps)
    xlabel("Number of iterations")
    ylabel("Jcost")
    title("The convergence of the cost function")
    show()

    xf = np.array([1,1]) # X for females
    xm = np.array([1,0]) # X for males

    Pf = glog(np.dot(thetapred,xf)) # predicted survival probabilities for female and male
    Pm = glog(np.dot(thetapred,xm))

    print Pf, Pm

    print "Time elapsed:", time() - start
def logreg_sex_class_city():
    """
    Incorporating sex, class, and city into the logistic regression.
    """
    y = data[::,0]
    m = np.size(y)
    y = y.reshape([m,1])

    xs = data[::, 2].reshape([m,1])
    xcl = data[::, 1].reshape([m,1])
    xem = data[::, 7].reshape([m,1])

    xcl1 = (xcl == 1).astype(int)
    xcl2 = (xcl == 2).astype(int)
    #xcl3 = (xcl == 3).astype(int)
    xem1 = (xem == 0).astype(int)
    xem2 = (xem == 1).astype(int)
    #xem3 = (xem == 2).astype(int)

    xones = np.ones([m,1])

    xscl1 = ((xs.reshape(m))*(xcl1.reshape(m))).reshape([m,1])
    xscl2 = ((xs.reshape(m))*(xcl2.reshape(m))).reshape([m,1])
    xsem1 = ((xs.reshape(m))*(xem1.reshape(m))).reshape([m,1])
    xsem2 = ((xs.reshape(m))*(xem2.reshape(m))).reshape([m,1])

    xcl1em1 = ((xcl1.reshape(m))*(xem1.reshape(m))).reshape([m,1])
    xcl2em1 = ((xcl2.reshape(m))*(xem1.reshape(m))).reshape([m,1])
    xcl1em2 = ((xcl1.reshape(m))*(xem2.reshape(m))).reshape([m,1])
    xcl2em2 = ((xcl2.reshape(m))*(xem2.reshape(m))).reshape([m,1])

    xscl1em1 = ((xs.reshape(m))*(xcl1.reshape(m))*(xem1.reshape(m))).reshape([m,1])
    xscl2em1 = ((xs.reshape(m))*(xcl2.reshape(m))*(xem1.reshape(m))).reshape([m,1])
    xscl1em2 = ((xs.reshape(m))*(xcl1.reshape(m))*(xem2.reshape(m))).reshape([m,1])
    xscl2em2 = ((xs.reshape(m))*(xcl2.reshape(m))*(xem2.reshape(m))).reshape([m,1])

    doubles = np.hstack([xscl1, xscl2, xsem1, xsem2, xcl1em1, xcl2em1, xcl1em2, xcl2em2]) #quadratic interactions
    triples = np.hstack([xscl1em1, xscl2em1, xscl1em2, xscl2em2]) #cubic interactions


    x = np.hstack([xones, xcl1, xcl2, xs, xem1, xem2, doubles, triples])
    # note that after running with the triples on and off there was virtually no difference in results...
    # perhaps only linear and quadratic terms are necessary?

    alpha = 0.3
    niter = 50000
    lam = 0
    graddes = gradientdescent(x,y,alpha,niter,lam, logreg = True)

    thetapred = graddes[0]
    Jsteps = graddes[1]
    print "prediction for theta:", thetapred
    #print Jsteps

    scatter(np.arange(niter)+1,Jsteps)
    xlabel("Number of iterations")
    ylabel("Jcost")
    title("The convergence of the cost function")
    show()

    for cl in [1,2,3]: # create predicted survival table
        cl1 = int(cl == 1)
        cl2 = int(cl == 2)
        cl3 = int(cl == 3)
        for em in [0,1,2]:
            em1 = int(em == 0)
            em2 = int(em == 1)
            em3 = int(em == 2)
            for sex in [0,1]:
                print "class=", cl, "and sex=", sex, "and emb=", em
                xx = np.array([1,cl1,cl2,sex, em1, em2, cl1*sex, cl2*sex, em1*sex, em2*sex,
                               cl1*em1, cl2*em1, cl1*em2, cl2*em2, sex*cl1*em1, sex*cl2*em1, sex*cl1*em2, sex*cl2*em2])
                print glog(np.dot(thetapred, xx))
    print "Time elapsed:", time() - start
# Write the fitted hypothesis as a symbolic expression in the raw feature X1.
# NOTE(review): thetapred, means, stds, x, y, glog and start are not defined in
# this part of the file; they are assumed to come from earlier module-level
# code -- confirm before running this section on its own.
X1 = sympy.symbols('X1')
X1 = (X1 - means[0])/stds[0] # rescale the symbol with means[0]/stds[0], the same scaling applied to ages below
XN = np.array([1,X1]) # constant term followed by the rescaled symbolic feature
hypz = np.dot(thetapred,XN) # the hypothesis is hyp = glog(hypz)
print "hypothesis for z (take glog(z) for actual hypothesis prob): ", hypz

# Plot the hypothesis CHD probability
ages = (np.arange(20,81,1)).reshape(61,1) # ages 20..80 inclusive as a 61x1 column

agesfs = (ages-means[0])/stds[0] # ages rescaled with means[0] and stds[0]
na = np.size(agesfs)
X0 = np.ones([na,1]) # column of ones for the constant term
Xa = np.hstack([X0, agesfs]) # one row per age: [1, rescaled age]
#print Xa

# glog applied to the linear predictor for every age row
chd = glog( np.dot( Xa, thetapred.T) )

plot1 = plot(ages,chd, 'b') # hypothesis curve as a blue line
xlim(20,80)
ylim(-0.1,1.3)
xlabel("Age")
ylabel("Probability of CHD")
title("Hypothesis probability of CHD by age")
plot2 = plot(x,y,'ro') # plot the original data
#legend([plot1], ("blue line"), "best", numpoints = 1)
show()


# start is presumably a timestamp taken when the script began -- verify where it is set
print "Time elapsed:", time() - start