Esempio n. 1
0
    def fit_model_1(self, lol = 0.0025, toWrite = False):
        """Cross-validate an RBF-kernel SVC and optionally pickle the fitted model.

        Args:
            lol: unused here; kept so the signature matches the sibling fit_model_* methods.
            toWrite: when True, pickle the last fitted model to model1/model.pkl.
        """
        model = SVC(probability = True, kernel = 'rbf', tol = 1e-3, gamma = 0.001, coef0 = 0.0)

        for data in self.cv_data:
            X_train, X_test, Y_train, Y_test = data
            model.fit(X_train,Y_train)
            pred = model.predict_proba(X_test)[:,1]
            print("Model 1 score: %f" % (logloss(Y_test,pred),))
        if toWrite:
            # Pickle needs a binary-mode handle ('w' raises TypeError on Python 3
            # and corrupts the stream on Windows); the with-block also closes the
            # file, which the original never did.
            with open('model1/model.pkl', 'wb') as f2:
                pickle.dump(model, f2)
Esempio n. 2
0
    def fit_model_22(self, lol = 2, toWrite = False):
        """Cross-validate a sigmoid-kernel SVC and optionally pickle the fitted model.

        Args:
            lol: coef0 of the sigmoid kernel.
            toWrite: when True, pickle the last fitted model to model22/model.pkl.
        """
        model = SVC(probability = True, kernel = 'sigmoid', tol = 1e-3, coef0 = lol)

        for data in self.cv_data:
            X_train, X_test, Y_train, Y_test = data
            model.fit(X_train,Y_train)
            pred = model.predict_proba(X_test)[:,1]
            print("Model 22 score: %f" % (logloss(Y_test,pred),))
        if toWrite:
            # Binary mode is required for pickle; the context manager closes the
            # handle (the original leaked it).
            with open('model22/model.pkl', 'wb') as f2:
                pickle.dump(model, f2)
Esempio n. 3
0
    def fit_model_20(self, lol = 0.0025, toWrite = False):
        """Cross-validate a linear-kernel SVC and optionally pickle the fitted model.

        Args:
            lol: unused here; kept so the signature matches the sibling fit_model_* methods.
            toWrite: when True, pickle the last fitted model to model20/model.pkl.
        """
        # NOTE(review): class_weight='auto' is the legacy sklearn spelling
        # (renamed 'balanced' in 0.17) — confirm the installed sklearn version.
        model = SVC(probability = True, kernel = 'linear', class_weight = 'auto', tol = 1e-3)

        for data in self.cv_data:
            X_train, X_test, Y_train, Y_test = data
            model.fit(X_train,Y_train)
            pred = model.predict_proba(X_test)[:,1]
            print("Model 20 score: %f" % (logloss(Y_test,pred),))
        if toWrite:
            # Binary mode is required for pickle; the with-block closes the handle.
            with open('model20/model.pkl', 'wb') as f2:
                pickle.dump(model, f2)
Esempio n. 4
0
    def fit_model_16(self,toWrite=False):
        """Cross-validate an ARDRegression model and optionally pickle it.

        Args:
            toWrite: when True, pickle the last fitted model to model16/model.pkl.
        """
        model = ARDRegression()

        for data in self.cv_data:
            X_train, X_test, Y_train, Y_test = data
            model.fit(X_train,Y_train)
            pred = model.predict(X_test)
            print("Model 16 score %f" % (logloss(Y_test,pred),))

        if toWrite:
            # 'wb': pickle requires a binary handle (text mode fails on Python 3).
            with open('model16/model.pkl', 'wb') as f2:
                pickle.dump(model, f2)
Esempio n. 5
0
    def fit_model_14(self,toWrite=False):
        """Cross-validate an OrthogonalMatchingPursuit model and optionally pickle it.

        Args:
            toWrite: when True, pickle the last fitted model to model14/model.pkl.
        """
        model = OrthogonalMatchingPursuit()

        for data in self.cv_data:
            X_train, X_test, Y_train, Y_test = data
            model.fit(X_train,Y_train)
            pred = model.predict(X_test)
            print("Model 14 score %f" % (logloss(Y_test,pred),))

        if toWrite:
            # 'wb': pickle requires a binary handle (text mode fails on Python 3).
            with open('model14/model.pkl', 'wb') as f2:
                pickle.dump(model, f2)
Esempio n. 6
0
    def fit_model_12(self,toWrite=False):
        """Cross-validate an ElasticNet model and optionally pickle it.

        Args:
            toWrite: when True, pickle the last fitted model to model12/model.pkl.
        """
        model = ElasticNet(alpha=1.0)

        for data in self.cv_data:
            X_train, X_test, Y_train, Y_test = data
            model.fit(X_train,Y_train)
            pred = model.predict(X_test)
            print("Model 12 score %f" % (logloss(Y_test,pred),))

        if toWrite:
            # 'wb': pickle requires a binary handle (text mode fails on Python 3).
            with open('model12/model.pkl', 'wb') as f2:
                pickle.dump(model, f2)
Esempio n. 7
0
    def fit_model_11(self,toWrite=False):
        """Cross-validate a LassoLars model and optionally pickle it.

        Args:
            toWrite: when True, pickle the last fitted model to model11/model.pkl.
        """
        model = LassoLars(alpha=1,max_iter=5000)

        for data in self.cv_data:
            X_train, X_test, Y_train, Y_test = data
            model.fit(X_train,Y_train)
            pred = model.predict(X_test)
            print("Model 11 score %f" % (logloss(Y_test,pred),))

        if toWrite:
            # 'wb': pickle requires a binary handle (text mode fails on Python 3).
            with open('model11/model.pkl', 'wb') as f2:
                pickle.dump(model, f2)
Esempio n. 8
0
    def fit_model_9(self,toWrite=False):
        """Cross-validate a GaussianNB classifier and optionally pickle it.

        Args:
            toWrite: when True, pickle the last fitted model to model9/model.pkl.
        """
        model = GaussianNB()

        for data in self.cv_data:
            X_train, X_test, Y_train, Y_test = data
            model.fit(X_train,Y_train)
            pred = model.predict_proba(X_test)[:,1]
            print("Model 9 score %f" % (logloss(Y_test,pred),))

        if toWrite:
            # 'wb': pickle requires a binary handle (text mode fails on Python 3).
            with open('model9/model.pkl', 'wb') as f2:
                pickle.dump(model, f2)
Esempio n. 9
0
    def fit_model_8(self,lol = 0.0, toWrite=False):
        """Cross-validate a BernoulliNB classifier and optionally pickle it.

        Args:
            lol: additive (Laplace) smoothing parameter `alpha`.
            toWrite: when True, pickle the last fitted model to model8/model.pkl.
        """
        model = BernoulliNB(alpha = lol, binarize = 0.0)

        for data in self.cv_data:
            X_train, X_test, Y_train, Y_test = data
            model.fit(X_train,Y_train)
            pred = model.predict_proba(X_test)[:,1]
            print("Model 8 score %f" % (logloss(Y_test,pred),))

        if toWrite:
            # 'wb': pickle requires a binary handle (text mode fails on Python 3).
            with open('model8/model.pkl', 'wb') as f2:
                pickle.dump(model, f2)
Esempio n. 10
0
    def fit_model_7(self,toWrite=False):
        """Cross-validate a linear-kernel NuSVC and optionally pickle it.

        Args:
            toWrite: when True, pickle the last fitted model to model7/model.pkl.
        """
        model = NuSVC(probability=True,kernel='linear')

        for data in self.cv_data:
            X_train, X_test, Y_train, Y_test = data
            model.fit(X_train,Y_train)
            pred = model.predict_proba(X_test)[:,1]
            print("Model 7 score %f" % (logloss(Y_test,pred),))

        if toWrite:
            # 'wb': pickle requires a binary handle (text mode fails on Python 3).
            with open('model7/model.pkl', 'wb') as f2:
                pickle.dump(model, f2)
Esempio n. 11
0
    def fit_model_4(self,toWrite=False):
        """Cross-validate a degree-2 polynomial-kernel SVC and optionally pickle it.

        Args:
            toWrite: when True, pickle the last fitted model to model4/model.pkl.
        """
        model = SVC(kernel='poly',probability=True, degree=2)

        for data in self.cv_data:
            X_train, X_test, Y_train, Y_test = data
            model.fit(X_train,Y_train)
            pred = model.predict_proba(X_test)[:,1]
            print("Model 4 score %f" % (logloss(Y_test,pred),))

        if toWrite:
            # 'wb': pickle requires a binary handle (text mode fails on Python 3).
            with open('model4/model.pkl', 'wb') as f2:
                pickle.dump(model, f2)
Esempio n. 12
0
    def fit_model_10(self,toWrite=False):
        """Cross-validate a BayesianRidge regressor and optionally pickle it.

        Args:
            toWrite: when True, pickle the last fitted model to model10/model.pkl.
        """
        model = BayesianRidge(n_iter=5000)

        for data in self.cv_data:
            X_train, X_test, Y_train, Y_test = data
            model.fit(X_train,Y_train)
            pred = model.predict(X_test)
            print("Model 10 score %f" % (logloss(Y_test,pred),))

        if toWrite:
            # 'wb': pickle requires a binary handle (text mode fails on Python 3).
            with open('model10/model.pkl', 'wb') as f2:
                pickle.dump(model, f2)
Esempio n. 13
0
    def fit_model_6(self,toWrite=False):
        """Cross-validate a RandomForestClassifier on class-balanced training folds
        and optionally pickle it.

        Args:
            toWrite: when True, pickle the last fitted model to model6/model.pkl.
        """
        model = RandomForestClassifier(n_estimators=2000,n_jobs=self.cpus)

        for data in self.cv_data:
            X_train, X_test, Y_train, Y_test = data
            # Rebalance classes before fitting; evaluation stays on the raw fold.
            X_train,Y_train = self.balance_data(X_train,Y_train)
            model.fit(X_train,Y_train)
            pred = model.predict_proba(X_test)[:,1]
            print("Model 6 score %f" % (logloss(Y_test,pred),))

        if toWrite:
            # 'wb': pickle requires a binary handle (text mode fails on Python 3).
            with open('model6/model.pkl', 'wb') as f2:
                pickle.dump(model, f2)
Esempio n. 14
0
    def fit_model_3(self, lol = 0.011,toWrite = True):
        """Cross-validate an L1-penalized logistic-loss SGDClassifier on
        class-balanced training folds and optionally pickle it.

        Args:
            lol: regularization strength `alpha`.
            toWrite: when True, pickle the last fitted model to model3/model.pkl.
        """
        model = SGDClassifier(penalty = 'l1', loss = 'log', n_iter = 50000, alpha = lol)

        for data in self.cv_data:
            X_train, X_test, Y_train, Y_test = data
            X_train,Y_train = self.balance_data(X_train,Y_train)
            model.fit(X_train,Y_train)
            # predict_proba returns an (n, 2) matrix; score on the positive-class
            # column like every other classifier method in this file (the
            # original passed the full matrix to logloss).
            pred = model.predict_proba(X_test)[:,1]
            print("Model 3 score : %f" % (logloss(Y_test,pred),))

        if toWrite:
            # 'wb': pickle requires a binary handle (text mode fails on Python 3).
            with open('model3/model.pkl', 'wb') as f2:
                pickle.dump(model, f2)
Esempio n. 15
0
    def fit_model_2(self, lol = .07, toWrite = False):
        """Cross-validate an L1-penalized LogisticRegression on class-balanced
        training folds and optionally pickle it.

        Args:
            lol: inverse regularization strength `C`.
            toWrite: when True, pickle the last fitted model to model2/model.pkl.
        """
        model = LogisticRegression(C = lol, penalty = 'l1', tol = 1e-6)

        for data in self.cv_data:
            X_train, X_test, Y_train, Y_test = data
            X_train,Y_train = self.balance_data(X_train,Y_train)
            model.fit(X_train,Y_train)
            pred = model.predict_proba(X_test)[:,1]
            print("Model 2 Score: %f" % (logloss(Y_test,pred),))

        if toWrite:
            # 'wb': pickle requires a binary handle (text mode fails on Python 3).
            with open('model2/model.pkl', 'wb') as f2:
                pickle.dump(model, f2)
Esempio n. 16
0
    def grid_searcher(self):
        """Grid-search (C, gamma) for an RBF SVC with 10-fold stratified CV and
        write the best pair's fold-averaged test-set predictions to
        sc_prediction.csv.
        """
        X_train, X_test, Y_train, Y_test = self.cv_data[-1]
        # Recombine the held-out fold so the grid search sees all the data.
        X_train = np.vstack((X_train, X_test))
        Y_train = np.concatenate((Y_train, Y_test))
        stratifiedCV = StratifiedKFold(Y_train, 10)

        ansDict = {"train": {}, "test": {}}

        C_range = 10.0 ** np.arange(-4, 9)
        gamma_range = 10.0 ** np.arange(-5, 4)
        for ind, i in enumerate(C_range):
            for jnd, j in enumerate(gamma_range):
                # Cantor pairing yields a unique key per (C, gamma) pair.
                # Floor division keeps the key an int (plain / produces a
                # float under Python 3; the numerator is always even here,
                # so the value is unchanged).
                dictInd = ((ind + jnd + 2) ** 2 + (ind + 1) - (jnd + 1)) // 2
                ansDict["train"][dictInd] = []
                ansDict["test"][dictInd] = []
                for train, test in stratifiedCV:
                    # Labels are 1-D after np.concatenate, so they take a single
                    # subscript (Y_train[train, :] raises IndexError on 1-D).
                    X_trainT = X_train[train, :]
                    X_testT = X_train[test, :]
                    Y_trainT = Y_train[train]
                    svc = SVC(kernel="rbf", C=i, gamma=j, probability=True, class_weight="auto")
                    svc.fit(X_trainT, Y_trainT)
                    # NOTE(review): this scores on the *training* fold, not the
                    # held-out fold — confirm that is intentional.
                    ansDict["train"][dictInd].append(logloss(Y_trainT, svc.predict_proba(X_trainT)[:, 1]))
                    ansDict["test"][dictInd].append(svc.predict_proba(self.testMat)[:, 1])

        # Remember key order so the positional argmin can be mapped back to a
        # Cantor key: the original indexed ansDict["test"] with the *position*,
        # which is not one of the Cantor keys and raised KeyError.
        keys = list(ansDict["train"].keys())
        meanScores = np.array([np.array(ansDict["train"][k]).mean() for k in keys])
        meanScores[meanScores < 0] = 1.0
        print(meanScores.min())
        bestPos = np.where(meanScores == meanScores.min())[0][0]
        testPred = ansDict["test"][keys[bestPos]]
        finalPred = np.vstack(testPred).mean(axis=0)

        def write_prediction(f):
            # One probability per line; the with-block closes the file.
            with open("sc_prediction.csv", "w") as g:
                for i in f:
                    g.write(str(i) + "\n")

        write_prediction(finalPred)
Esempio n. 17
0
    def blend_models(self):
        """Load the pickled base models, build prediction matrices from their
        out-of-fold and test-set outputs, stack them with an ExtraTrees blender,
        and write the blended test predictions to prediction.csv.
        """
        folders = [
                'model1', 'model2', 'model3', 'model4',
                'model6','model7','model8','model9',
                'model10','model11','model12', 'model14',
                'model15','model16','model18',
                'model19','model20', 'model21', 'model22']
        # Positions (in `folders`) of regressors that only expose predict().
        predict_insteads = (8,9,10,11,12,13)

        models = []
        for folder in folders:
            # Pickles must be read in binary mode ('r' breaks under Python 3);
            # the with-block closes the handle.
            with open(folder + '/' + 'model.pkl', 'rb') as model_hand:
                models.append(pickle.load(model_hand))

        for derp in self.cv_data:
            X_train, X_test, Y_train, Y_test = derp
            trainLen = Y_test.shape[0]
            modelLen = len(models)
            testLen = self.testMat.shape[0]

            trainBag = np.zeros([trainLen,modelLen],dtype=float)
            testBag = np.zeros([testLen,modelLen],dtype=float)

            for i in range(modelLen):
                model = models[i]
                # Regressors have no predict_proba; dispatch locally instead of
                # monkey-patching the loaded model objects as before.
                predict = model.predict if i in predict_insteads else model.predict_proba
                trainPred = predict(X_test)
                testPred = predict(self.testMat)
                if len(trainPred.shape) > 1:
                    # Classifier output is (n, 2); keep the positive class.
                    trainPred = trainPred[:,1]
                    testPred = testPred[:,1]
                trainBag[:,i] = trainPred
                testBag[:,i] = testPred
            rf = ExtraTreesClassifier(n_estimators=1000,n_jobs=self.cpus, oob_score=True, bootstrap=True,criterion='gini')
            rf.fit(trainBag,Y_test)
            print("Final score is %f" %(logloss(Y_test,rf.oob_decision_function_[:,1])))
        # Uses the blender fitted on the last CV fold.
        test_final = rf.predict_proba(testBag)[:,1]
        with open('prediction.csv', 'w') as pred_hand:
            for row in test_final:
                pred_hand.write(str(row)+'\n')
Esempio n. 18
0
def modelg():
    """Train a GradientBoostingClassifier with 10-fold stratified CV on
    train.csv and write the fold-averaged test-set probabilities to
    grad_prediction.csv.
    """
    model = GradientBoostingClassifier(loss='deviance', subsample=.5, n_estimators=100000)
    # Import the data
    training = pandas.read_csv('train.csv')
    testing = pandas.read_csv('test.csv')

    # Sanity check: test set has the training columns minus the label.
    assert( np.all(training.columns[1:] == testing.columns) )

    # To ndarray; column 0 is the label. (.values instead of as_matrix(),
    # which was removed in pandas 1.0.)
    trainingData = training.values
    xTrain = trainingData[:,1:]
    yTrain = trainingData[:,0]

    testingData = testing.values

    stratifiedCV = StratifiedKFold(yTrain,10)

    scores = []
    pred = []
    for train,test in stratifiedCV:
        # yTrain is 1-D (a single column slice), so it takes a single
        # subscript — yTrain[train, :] raised IndexError.
        X_train, X_test = xTrain[train,:], xTrain[test,:]
        Y_train, Y_test = yTrain[train], yTrain[test]
        model.fit(X_train,Y_train)
        accur = logloss(Y_test,model.predict_proba(X_test)[:,1])
        scores.append(accur)
        print(accur)
        pred.append(model.predict_proba(testingData)[:,1])

    meanScores = np.array(scores)
    print(meanScores.mean())
    finalPred = np.vstack(pred).mean(axis=0)

    def write_prediction(f):
        # One probability per line; the with-block closes the file.
        with open('grad_prediction.csv', 'w') as g:
            for i in f:
                g.write(str(i)+'\n')

    write_prediction(finalPred)
Esempio n. 19
0
                    show_progress=False)
    preds = dict(zip(list(q.state_names['target']), list(q.values)))
    for key in preds.keys():
        samp_sub[key][i] = preds[key]
        samp_sub['sig_id'][i] = test_df['sig_id'][i]

train_targets = pd.read_csv('train_targets_scored.csv')

scores = []
for i in range(len(samp_sub)):
    pred = list(np.array(samp_sub.iloc[i][1:]))
    act = list(
        np.array(train_targets[train_targets['sig_id'] ==
                               samp_sub['sig_id'].iloc[0]].drop('sig_id',
                                                                axis=1))[0])
    scores.append(logloss(act, pred))
overall_logloss = np.mean(scores)

network = []
for node in nodes_added:
    if len(model.get_children(node)) != 0:
        for child in model.get_children(node):
            network.append((node, child))

#make first log
'''
data = {'Network':  [network],
        'Nodes Count': [len(nodes_added)],
        'Edges Count': [len(network)],
        'Score': [overall_logloss]
        }
Esempio n. 20
0
def model1():
    """Bag many shallow tree ensembles, grid-search an L1 SGD blender over its
    regularization strength with 10-fold stratified CV, and write the best
    setting's averaged test predictions to best_prediction.csv.
    """
    # Some settings
    cores = 8
    bags = 40
    nClassifiers = 80
    classifiers = [
        ExtraTreesClassifier(
            n_estimators=bags, n_jobs=cores, criterion="gini", bootstrap=True, oob_score=True, max_depth=1
        ),
        ExtraTreesClassifier(
            n_estimators=bags, n_jobs=cores, criterion="entropy", bootstrap=True, oob_score=True, max_depth=1
        ),
        RandomForestClassifier(
            n_estimators=bags, n_jobs=cores, criterion="gini", bootstrap=True, oob_score=True, max_depth=1
        ),
        RandomForestClassifier(
            n_estimators=bags, n_jobs=cores, criterion="entropy", bootstrap=True, oob_score=True, max_depth=1
        ),
    ]

    # Import the data
    training = pandas.read_csv("train.csv")
    testing = pandas.read_csv("test.csv")

    # Sanity check: test set has the training columns minus the label.
    assert np.all(training.columns[1:] == testing.columns)

    # To ndarray; column 0 is the label. (.values instead of as_matrix(),
    # which was removed in pandas 1.0.)
    trainingData = training.values
    xTrain = trainingData[:, 1:]
    yTrain = trainingData[:, 0]

    testingData = testing.values

    # Stores the outputs of the trees
    trainingBag = []
    testingBag = []

    # range, not xrange: Python 3 compatible (and fine on Python 2).
    for _ in range(nClassifiers):
        for c in classifiers:
            c.fit(xTrain, yTrain)
            # Out-of-bag positive-class probabilities as stacking features.
            decisionFunc = c.oob_decision_function_[:, 1]
            trainingBag.append(decisionFunc)
            testingBag.append(c.predict_proba(testingData)[:, 1])

    # Stack the list of 1-D arrays into (n_samples, n_learners) matrices.
    trainingBag = np.vstack(trainingBag).T
    testingBag = np.vstack(testingBag).T

    # Grid searching over alpha

    stratifiedCV = StratifiedKFold(yTrain, 10)

    ansDict = {}
    ansDict["train"] = {}
    ansDict["test"] = {}
    for ind, a in enumerate(np.arange(-10, 1, 1)):
        ansDict["train"][ind] = []
        ansDict["test"][ind] = []
        for train, test in stratifiedCV:
            # yTrain is 1-D (a single column slice), so it takes a single
            # subscript — yTrain[train, :] raised IndexError.
            X_train, X_test = trainingBag[train, :], trainingBag[test, :]
            Y_train, Y_test = yTrain[train], yTrain[test]
            sgd = SGDClassifier(loss="log", penalty="l1", n_iter=10000, alpha=10 ** a, learning_rate="constant")
            sgd.fit(X_train, Y_train)
            # predict_proba is (n, 2); keep the positive-class column for the
            # logloss and for the averaged test predictions (the original
            # passed the full matrix through).
            tempPred = sgd.predict_proba(X_test)[:, 1]
            ansDict["train"][ind].append(logloss(Y_test, tempPred))
            ansDict["test"][ind].append(sgd.predict_proba(testingBag)[:, 1])

    meanScores = []
    for i, j in ansDict["train"].items():
        wut = np.array(j)
        meanScores.append(wut.mean())

    meanScores = np.array(meanScores)
    meanScores[meanScores < 0] = 1.0
    print(meanScores.min())
    # Keys here are sequential enumerate indices, so the positional argmin is
    # a valid dict key.
    paramGood = np.where(meanScores == meanScores.min())[0][0]
    testPred = ansDict["test"][paramGood]
    finalPred = np.vstack(testPred).mean(axis=0)

    def write_prediction(f):
        # One probability per line; the with-block closes the file.
        with open("best_prediction.csv", "w") as g:
            for i in f:
                g.write(str(i) + "\n")

    write_prediction(finalPred)