def bayespreddepend(x, dataset = db):
    db = dataset.copy()
    x0 = list(x) #list of features [[value1, index1],[value2, index2],...]
    x1 = list(x) #list of features [[value1, index1],[value2, index2],...]
    x0.append([0,0]) #all of the features x, along with survival = 0 (died)
    x1.append([1,0]) #all of the features x, along with survival = 1 (survived)
    if (df(x0,db) == []) or (df(x1,db) == []):
        return 100 #sentinel value: no matching passengers with one of the two outcomes
    nxand0 = showstats(df(x0,db))[1]
    nxand1 = showstats(df(x1,db))[1]
    return 1/(float(nxand0)/float(nxand1) + 1)
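# A quick check of the algebra in bayespreddepend: with nxand0 and nxand1 the counts
# of matching passengers who died and who survived,
#   1/(nxand0/nxand1 + 1) = nxand1/(nxand0 + nxand1),
# i.e. the empirical survival fraction among passengers matching every feature in x,
# with no independence assumption (presumably the "depend" in the name, in contrast
# to the naive Bayes bayespred below).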
def pxy(value, index, y, dataset, laplacesmooth = "on"):
    """
    computes prob(x|y), i.e the probability of having feature x=(value,index) when the survival value is y (0 or 1)
    This is accomplished using frequencies in dataset to approximate p(x and y)/p(y)
    i.e using maximum likelihood estimation (MLE)
    Laplace smoothing has the effect of giving small nonzero values for features which don't appear
    in the training data. It has a pretty small effect overall
    """
    dataset = dataset.copy()
    # [0=sur, 1=class, 2=sex, 3=age, 4=sibsp, 5=parch, 6=fare, 7=embarked]
    J = [0,3,2,3,3,3,1,3] # number of discrete feature values for each index, assuming age is binned into
    # (0-10), (11-19), 20+ and sibsp and parch are each binned into three groups (see the loops below)
    J1 = 1
    if laplacesmooth == "off":
        J = [0,0,0,0,0,0,0,0]
        J1 = 0
    if df([[y,0],[value,index]],dataset) == []:
        nxy = 0
    else:
        nxy = showstats(df([[y,0],[value,index]],dataset))[1]
    ny = showstats(df([[y,0]],dataset))[1]
    return float(J1 + nxy)/(J[index] + ny)
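# For example, with smoothing on, a feature seen nxy times among the ny passengers
# with survival value y gets p(x|y) = (1 + nxy)/(J[index] + ny), so a feature value
# never seen with that outcome (nxy = 0) gets the small prior 1/(J[index] + ny)
# rather than an estimate of exactly zero.
# A hypothetical call: pxy(1, 2, 1, data) estimates p(sex = female | survived).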
def bayespred(x, dataset = db):
    db = dataset.copy()
    x = list(x) #list of features [[value1, index1],[value2,index2],...]
    n = np.size(x,0)
    px0s = []
    px1s = []
    for a in xrange(n):
        px0s.append( pxy(x[a][0], x[a][1], 0, db, laplacesmooth = "on") ) #(x[a][0], x[a][1]) gives the value and index of feature x[a]
        px1s.append( pxy(x[a][0], x[a][1], 1, db, laplacesmooth = "on") )
    pX0 = prod(px0s)
    pX1 = prod(px1s)
    p1 = showstats(db)[2]
    p0 = 1-p1
    pYgivenX = pX1*p1/(pX1*p1 + pX0*p0) #Bayes' rule with the naive independence approximation
    return pYgivenX
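# bayespred is naive Bayes: assuming the features are conditionally independent
# given survival, p(X|y) is approximated by the product over features of p(x_a|y),
# and Bayes' rule then gives
#   p(y=1|X) = p(X|1)p(1) / (p(X|1)p(1) + p(X|0)p(0)).
# A hypothetical call for a 1st class female passenger: bayespred([[1,1],[1,2]])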


    totaltime = time() - start #assumes start = time() was recorded earlier in the script
    print "This code took %f s to run" %totaltime


    for sex in xrange(2):
        for cl in xrange(1,4):
            feat = [[cl,1],[sex,2]]
            print "class=",cl, "and sex=",sex
            print bayespred(feat)
            print showstats(df(feat,data))

    # grid of candidate (values, index) pairs enumerated by the nested loops below
    xvals = [ [ [1,2,3], 1], [ [0,1], 2 ], [ [5,15,50], 3], [ [0,1,3], 4 ], [ [0,1,3], 5], [ [0,1,2], 7 ] ]

    for cl in [1,2,3]:
        for sex in [0,1]:
            for age in [5,15,50]:
                for sib in [0,1,3]:
                    for par in [0,1,3]:
                        for emb in [0,1,2]:
                            xval = [[cl,1], [sex,2], [age,3], [sib,4], [par,5], [emb,7]]
                            bp = bayespred(xval)
                            # there are 2*(3^5) = 486 different categories given the way we binned the data.
                            # We want to display only the results that disagree with the F3SM12 model
                            if (sex == 1) and (bp <= 0.5) and not( (cl == 3) and (emb == 0) ):
                                print xval, "yields:", bp
Example #5
def showsurvivaltablesanalysis():
    # Now we can easily recreate the survival tables from predict.py, but more elegantly:
    malestats = showstats(df([[0,2]],data))
    femstats = showstats(df([[1,2]],data))

    print "female and male stats"
    print femstats
    print malestats

    sexclass=[]
    for s in xrange(2):
        for c in xrange(1,4):
            sexclass.append(showstats(df([[s,2],[c,1]],data)))

    sca = np.array(sexclass).reshape(2,3,3)
    print "Sex-Class"
    print sca

    #sex-class-embarked
    malesce=[]
    femsce=[]
    for c in xrange(1,4):
        for e in xrange(3):
            malesce.append(showstats(df([[0,2],[c,1],[e,7]],data)))
            femsce.append(showstats(df([[1,2],[c,1],[e,7]],data)))

    msce = np.array(malesce).reshape(3,3,3)
    fsce = np.array(femsce).reshape(3,3,3)

    print "Male Class(1st block is 1st class) and City (rows 0-2 in each block)"
    print msce
    print "Female Class(1st block is 1st class) and City (rows 0-2 in each block)"
    print fsce

    print "Male sibsp"
    for sibsp in xrange(10):
        print sibsp, showstats(df([[0,2],[sibsp,4]],data))

    print "Female sibsp"
    for sibsp in xrange(10):
        print sibsp, showstats(df([[1,2],[sibsp,4]],data))

    print "Male parch"
    for parch in xrange(10):
        print parch, showstats(df([[0,2],[parch,5]],data))

    print "Female parch"
    for parch in xrange(10):
        print parch, showstats(df([[1,2],[parch,5]],data))

    for sib in xrange(3):
        for par in xrange(3):
            print sib,par,"male: %s" %showstats(df([[0,2],[sib,4],[par,5]],data)), \
            "female: %s" %showstats(df([[1,2],[sib,4],[par,5]],data))

    print "1st class males sibsp"
    for sibsp in xrange(10):
        print sibsp, showstats(df([[0,2],[1,1],[sibsp,4]],data))

    print "1st class males parch"
    for parch in xrange(10):
        print parch, showstats(df([[0,2],[1,1],[parch,5]],data))

    #all young males that survive
    #print df([[0,2],[1,0]],dfrange(0,10,3,data))

    print "1st and 2nd class French males by sibsp"
    for sibsp in xrange(2):
        print "age 0-80, sibsp = %i" %sibsp, showstats(df([[0,2],[1,7],[sibsp,4]],dfrange(0,80,3,dfrange(1,2,1,data))))
        print "age 0-19, sibsp = %i" %sibsp, showstats(df([[0,2],[1,7],[sibsp,4]],dfrange(0,19,3,dfrange(1,2,1,data))))
        print "age 20-80, sibsp = %i" %sibsp, showstats(df([[0,2],[1,7],[sibsp,4]],dfrange(20,80,3,dfrange(1,2,1,data))))

    print showstats(df([[0,2],[1,7]],data))

    print "3rd class young males by age bin"
    for x in xrange(5):
        print showstats(df([[0,2],[3,1]],dfrange(2*x+0.01,2*(x+1),3,data)))

    print "3rd class S females under age<=5"
    print showstats(df([[1,2],[0,7]],dfrange(3,3,1,dfrange(0,5,3,data))))
    print "3rd class S females under 18 with sibsp=0,1"
    print df([[1,2],[0,7]],dfrange(0,1,4,dfrange(3,3,1,dfrange(0,18,3,data))))
    print showstats(df([[1,2],[0,7]],dfrange(0,1,4,dfrange(3,3,1,dfrange(0,18,3,data)))))


    print "young (age<=12) 3rd class males by sibsp"
    print "sibsp=0,1:",showstats(df([[0,2],[3,1]],dfrange(0,1,4,dfrange(0,12,3,data))))
    print "sibsp=2-8:",showstats(df([[0,2],[3,1]],dfrange(2,8,4,dfrange(0,12,3,data))))


    print "3rd class girls then boys (age<=15) with many siblings (sibsp>=2)"
    print showstats(df([[1,2]],dfrange(2,8,4,dfrange(3,3,1,dfrange(0,15,3,data)))))
    print showstats(df([[0,2]],dfrange(2,8,4,dfrange(3,3,1,dfrange(0,15,3,data)))))

    print "3rd class young girls from C or Q with many siblings"
    print showstats(df([[1,2],[3,3]],dfrange(2,8,4,dfrange(1,2,7,dfrange(0,15,3,data)))))

    print "3rd class S female young (age<=8) with 0 or 1 sibling"
    print showstats(df([[1,2],[0,7]],dfrange(0,1,4,dfrange(3,3,1,dfrange(0,8,3,data)))))
    print df([[1,2],[0,7]],dfrange(0,1,4,dfrange(3,3,1,dfrange(0,8,3,data))))
    #print showstats(df([[1,2],[0,7]],dfrange(0,1,4,dfrange(3,3,1,dfrange(18,80,3,data)))))

    # It looks like having lots of siblings is really bad for you. Conversely, having 0 or 1 seems to save
    # otherwise damned souls F3Syoung and M3young.

    print "3rd class S females in test data!! under 18 with sibsp=0,1"
    print df([[1,2],[0,7]],dfrange(0,1,4,dfrange(3,3,1,dfrange(0,18,3,test8))))

    # No-age-given analysis
    print "The average age for passengers. "\
          "NOTE this should be run only when placeholder age for age not given is set to 1000"
    print np.mean(dfrange(0,100,3,data)[0::,3]) #mean of age for age <=100
    # However, we might do a bit better. Generally, sibsp>1 implies you are a child. Let's see this.
    print "Average age by sibsp:"
    for sp in xrange(6):
        print "sibsp = %i" %sp, np.mean(dfrange(sp,sp,4,dfrange(0,100,3,data))[0::,3])

    # Random Forests

    # It feels like fare is a redundant variable, as it tracks closely with class (and perhaps city).
    # Perhaps we should train the forest without fare, and maybe also parch.
    # And while we are at it, try without sibsp again. Let's create the necessary data.

    data7 = scipy.delete(data,6,1) #delete fare column, resulting in a (891,7) array
    data6 = scipy.delete(data7,5,1) #delete parch column, resulting in a (891,6) array
    data5 = scipy.delete(data6,4,1) #delete sibsp column, resulting in a (891,5) array
    # And we'll want to test these forests on the test data too, which must be of the same form:
    test7 = scipy.delete(test8,6,1)
    test6 = scipy.delete(test7,5,1)
    test5 = scipy.delete(test6,4,1)


    # The function randomforests in predictions.py runs many forests and finds the average prediction.

    # RFC105 = randomforests(10,100,data5,test5[0::,1::])
    # print np.nonzero(f3sm12pred(test8) - RFC105)[0]
    # this confirms the results of randforest.py for which passengers RFC and F3SM12 disagree
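    # A minimal sketch of what randomforests in predictions.py plausibly does
    # (an assumption, not the author's actual code), using sklearn's
    # RandomForestClassifier: train nforests forests of ntrees trees each on
    # train (column 0 = survival, the rest = features), average their
    # predictions on testfeatures, and take the majority vote.
    def randomforests_sketch(nforests, ntrees, train, testfeatures):
        from sklearn.ensemble import RandomForestClassifier
        votes = np.zeros(np.size(testfeatures,0))
        for f in xrange(nforests):
            forest = RandomForestClassifier(n_estimators = ntrees)
            forest = forest.fit(train[0::,1::], train[0::,0])
            votes = votes + forest.predict(testfeatures)
        return (votes/nforests >= 0.5).astype(int) # 1 where most forests predict survival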

    # Let's apply the RFC method to both the train data (this helps to see the degree of over-fitting)
    # as well as the test data, to make new predictions.
    numfor = 5
    RFC8 = randomforests(numfor,100,data,test8[0::,1::])
    RFC7 = randomforests(numfor,100,data7,test7[0::,1::])
    RFC6 = randomforests(numfor,100,data6,test6[0::,1::])
    RFC5 = randomforests(numfor,100,data5,test5[0::,1::])
    RFC8train = randomforests(numfor,100,data,data[0::,1::])
    RFC7train = randomforests(numfor,100,data7,data7[0::,1::])
    RFC6train = randomforests(numfor,100,data6,data6[0::,1::])
    RFC5train = randomforests(numfor,100,data5,data5[0::,1::])


    print "Scores for 'predictions' back on train data for GM, F3SM12, newpred, and RFC8, RFC7, RFC6, RFC5"
    print "GM", predicttrain(genderpred(data))
    print "F3SM12", predicttrain(f3sm12pred(data))
    print "new", predicttrain(newpred(data))
    print "RFC8",predicttrain(RFC8train)
    print "RFC7",predicttrain(RFC7train)
    print "RFC6",predicttrain(RFC6train)
    print "RFC5",predicttrain(RFC5train)


    print "Comparing predictions"
    comparepreds(newpred(test8),RFC8)
    comparepreds(f3sm12pred(test8),RFC8)
    comparepreds(newpred(test8),f3sm12pred(test8))
    comparepreds(RFC8,RFC7)
    comparepreds(RFC7,RFC6)
    comparepreds(RFC6,RFC5)

    # Scores for predictions on [test, train] data sets.
    # Note the RFC5 (neglecting fare, sibsp, parch) prediction on train is random,
    # so it changes each time (usually around 0.85)
    # The first entries are our scores from Kaggle.com.
    # The spread indicates the degree of over-fitting. Clearly RFC over-fits the most.
    # However, all of the prediction models do worse on the test data than the train data.
    # This didn't have to be the case, particularly for the simpler models (GM and F3SM12)

    scoreGM = [0.76555, predicttrain(genderpred(data))] #160/209 right
    scoreF3SM12 = [0.78947, predicttrain(f3sm12pred(data))] #165/209
    scorenew = [0.78469, predicttrain(newpred(data))] #164/209
    scoreRFC5 = [0.77033, predicttrain(RFC5train)] #161/209
    scoreRFC7 = [0.77512, predicttrain(RFC7train)] #162/209
    # Note that only half of the test data (209) is used on the leaderboard.

    print "Scores for predictions on [test, train] data sets for GM, F3SM12, newpred, RFC5, RFC7"
    print scoreGM
    print scoreF3SM12
    print scorenew
    print scoreRFC5
    print scoreRFC7
Example #6
tempdata = test8 # test8 can be used to look at passenger attributes. But note the unknown survival value = 2.
constraints = [] # list of [min, max, index] constraints collected from the user below

print "Recall indices [0=sur, 1=class, 2=sex, 3=age, 4=sibsp, 5=parch, 6=fare, 7=embarked]"
while True:
    query = raw_input("Input a feature constraint (in form 'min max index')(type 'x' to quit inputting constraints): ")
    if query == "":
        break
    if query == "x":
        break
    query = [float(x) for x in query.split()] # split the three space-separated inputs and convert them to floats
    fmin = query[0]
    fmax = query[1]
    index = int(query[2]) # column index must be an integer
    print "Great! We'll apply the constraint: %i <= %s <= %i" %(fmin, indict8[index], fmax)
    constraints.append(query)
print "To summarize, you said constrain data by:", constraints

ncon = len(constraints) # number of constraints entered

for x in xrange(ncon):
    fmin = constraints[x][0]
    fmax = constraints[x][1]
    index = constraints[x][2]
    tempdata = dfrange(fmin,fmax,index,tempdata)

fareavg = round(np.mean(tempdata[::,6]),1)

print tempdata
print showstats(tempdata)
print "The average fare in this group is:",  fareavg