def fit_model_1(self, lol=0.0025, toWrite=False):
    model = SVC(probability=True, kernel='rbf', tol=1e-3, gamma=0.001, coef0=0.0)
    for data in self.cv_data:
        X_train, X_test, Y_train, Y_test = data
        model.fit(X_train, Y_train)
        pred = model.predict_proba(X_test)[:, 1]
        print("Model 1 score: %f" % (logloss(Y_test, pred),))
    if toWrite:
        f2 = open('model1/model.pkl', 'w')
        pickle.dump(model, f2)
        f2.close()
def fit_model_22(self, lol=2, toWrite=False):
    model = SVC(probability=True, kernel='sigmoid', tol=1e-3, coef0=lol)
    for data in self.cv_data:
        X_train, X_test, Y_train, Y_test = data
        model.fit(X_train, Y_train)
        pred = model.predict_proba(X_test)[:, 1]
        print("Model 22 score: %f" % (logloss(Y_test, pred),))
    if toWrite:
        f2 = open('model22/model.pkl', 'w')
        pickle.dump(model, f2)
        f2.close()
def fit_model_20(self, lol=0.0025, toWrite=False):
    model = SVC(probability=True, kernel='linear', class_weight='auto', tol=1e-3)
    for data in self.cv_data:
        X_train, X_test, Y_train, Y_test = data
        model.fit(X_train, Y_train)
        pred = model.predict_proba(X_test)[:, 1]
        print("Model 20 score: %f" % (logloss(Y_test, pred),))
    if toWrite:
        f2 = open('model20/model.pkl', 'w')
        pickle.dump(model, f2)
        f2.close()
def fit_model_16(self, toWrite=False):
    model = ARDRegression()
    for data in self.cv_data:
        X_train, X_test, Y_train, Y_test = data
        model.fit(X_train, Y_train)
        pred = model.predict(X_test)
        print("Model 16 score %f" % (logloss(Y_test, pred),))
    if toWrite:
        f2 = open('model16/model.pkl', 'w')
        pickle.dump(model, f2)
        f2.close()
def fit_model_14(self, toWrite=False):
    model = OrthogonalMatchingPursuit()
    for data in self.cv_data:
        X_train, X_test, Y_train, Y_test = data
        model.fit(X_train, Y_train)
        pred = model.predict(X_test)
        print("Model 14 score %f" % (logloss(Y_test, pred),))
    if toWrite:
        f2 = open('model14/model.pkl', 'w')
        pickle.dump(model, f2)
        f2.close()
def fit_model_12(self, toWrite=False):
    model = ElasticNet(alpha=1.0)
    for data in self.cv_data:
        X_train, X_test, Y_train, Y_test = data
        model.fit(X_train, Y_train)
        pred = model.predict(X_test)
        print("Model 12 score %f" % (logloss(Y_test, pred),))
    if toWrite:
        f2 = open('model12/model.pkl', 'w')
        pickle.dump(model, f2)
        f2.close()
def fit_model_11(self, toWrite=False):
    model = LassoLars(alpha=1, max_iter=5000)
    for data in self.cv_data:
        X_train, X_test, Y_train, Y_test = data
        model.fit(X_train, Y_train)
        pred = model.predict(X_test)
        print("Model 11 score %f" % (logloss(Y_test, pred),))
    if toWrite:
        f2 = open('model11/model.pkl', 'w')
        pickle.dump(model, f2)
        f2.close()
def fit_model_9(self, toWrite=False):
    model = GaussianNB()
    for data in self.cv_data:
        X_train, X_test, Y_train, Y_test = data
        model.fit(X_train, Y_train)
        pred = model.predict_proba(X_test)[:, 1]
        print("Model 9 score %f" % (logloss(Y_test, pred),))
    if toWrite:
        f2 = open('model9/model.pkl', 'w')
        pickle.dump(model, f2)
        f2.close()
def fit_model_8(self, lol=0.0, toWrite=False):
    model = BernoulliNB(alpha=lol, binarize=0.0)
    for data in self.cv_data:
        X_train, X_test, Y_train, Y_test = data
        model.fit(X_train, Y_train)
        pred = model.predict_proba(X_test)[:, 1]
        print("Model 8 score %f" % (logloss(Y_test, pred),))
    if toWrite:
        f2 = open('model8/model.pkl', 'w')
        pickle.dump(model, f2)
        f2.close()
def fit_model_7(self, toWrite=False):
    model = NuSVC(probability=True, kernel='linear')
    for data in self.cv_data:
        X_train, X_test, Y_train, Y_test = data
        model.fit(X_train, Y_train)
        pred = model.predict_proba(X_test)[:, 1]
        print("Model 7 score %f" % (logloss(Y_test, pred),))
    if toWrite:
        f2 = open('model7/model.pkl', 'w')
        pickle.dump(model, f2)
        f2.close()
def fit_model_4(self, toWrite=False):
    model = SVC(kernel='poly', probability=True, degree=2)
    for data in self.cv_data:
        X_train, X_test, Y_train, Y_test = data
        model.fit(X_train, Y_train)
        pred = model.predict_proba(X_test)[:, 1]
        print("Model 4 score %f" % (logloss(Y_test, pred),))
    if toWrite:
        f2 = open('model4/model.pkl', 'w')
        pickle.dump(model, f2)
        f2.close()
def fit_model_10(self, toWrite=False):
    model = BayesianRidge(n_iter=5000)
    for data in self.cv_data:
        X_train, X_test, Y_train, Y_test = data
        model.fit(X_train, Y_train)
        pred = model.predict(X_test)
        print("Model 10 score %f" % (logloss(Y_test, pred),))
    if toWrite:
        f2 = open('model10/model.pkl', 'w')
        pickle.dump(model, f2)
        f2.close()
def fit_model_6(self, toWrite=False):
    model = RandomForestClassifier(n_estimators=2000, n_jobs=self.cpus)
    for data in self.cv_data:
        X_train, X_test, Y_train, Y_test = data
        X_train, Y_train = self.balance_data(X_train, Y_train)
        model.fit(X_train, Y_train)
        pred = model.predict_proba(X_test)[:, 1]
        print("Model 6 score %f" % (logloss(Y_test, pred),))
    if toWrite:
        f2 = open('model6/model.pkl', 'w')
        pickle.dump(model, f2)
        f2.close()
def fit_model_3(self, lol=0.011, toWrite=True):
    model = SGDClassifier(penalty='l1', loss='log', n_iter=50000, alpha=lol)
    for data in self.cv_data:
        X_train, X_test, Y_train, Y_test = data
        X_train, Y_train = self.balance_data(X_train, Y_train)
        model.fit(X_train, Y_train)
        pred = model.predict_proba(X_test)[:, 1]
        print("Model 3 score: %f" % (logloss(Y_test, pred),))
    if toWrite:
        f2 = open('model3/model.pkl', 'w')
        pickle.dump(model, f2)
        f2.close()
def fit_model_2(self, lol=.07, toWrite=False):
    model = LogisticRegression(C=lol, penalty='l1', tol=1e-6)
    for data in self.cv_data:
        X_train, X_test, Y_train, Y_test = data
        X_train, Y_train = self.balance_data(X_train, Y_train)
        model.fit(X_train, Y_train)
        pred = model.predict_proba(X_test)[:, 1]
        print("Model 2 Score: %f" % (logloss(Y_test, pred),))
    if toWrite:
        f2 = open('model2/model.pkl', 'w')
        pickle.dump(model, f2)
        f2.close()
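# fit_model_2, fit_model_3 and fit_model_6 above call self.balance_data, which is not
# defined in this excerpt. A minimal sketch of such a helper, assuming it simply
# downsamples the majority class to a 1:1 ratio (an assumption, not the original code):
def balance_data(self, X, Y):
    pos = np.where(Y == 1)[0]
    neg = np.where(Y == 0)[0]
    # Keep every minority-class row plus an equal-sized random sample of the majority class
    if len(pos) <= len(neg):
        keep = np.concatenate((pos, np.random.choice(neg, len(pos), replace=False)))
    else:
        keep = np.concatenate((neg, np.random.choice(pos, len(neg), replace=False)))
    keep.sort()
    return X[keep, :], Y[keep]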
def grid_searcher(self):
    X_train, X_test, Y_train, Y_test = self.cv_data[-1]
    X_train = np.vstack((X_train, X_test))
    Y_train = np.concatenate((Y_train, Y_test))
    stratifiedCV = StratifiedKFold(Y_train, 10)
    ansDict = {}
    ansDict["train"] = {}
    ansDict["test"] = {}
    C_range = 10.0 ** np.arange(-4, 9)
    gamma_range = 10.0 ** np.arange(-5, 4)
    for ind, i in enumerate(C_range):
        for jnd, j in enumerate(gamma_range):
            # Cantor's pairs
            dictInd = ((ind + jnd + 2) ** 2 + (ind + 1) - (jnd + 1)) / 2
            ansDict["train"][dictInd] = []
            ansDict["test"][dictInd] = []
            for train, test in stratifiedCV:
                X_trainT, X_testT, Y_trainT, Y_testT = (
                    X_train[train, :],
                    X_train[test, :],
                    Y_train[train],
                    Y_train[test],
                )
                svc = SVC(kernel="rbf", C=i, gamma=j, probability=True, class_weight="auto")
                svc.fit(X_trainT, Y_trainT)
                # Score each (C, gamma) pair on the held-out fold
                ansDict["train"][dictInd].append(logloss(Y_testT, svc.predict_proba(X_testT)[:, 1]))
                ansDict["test"][dictInd].append(svc.predict_proba(self.testMat)[:, 1])
    # Average the fold scores for each parameter pair and keep the key of the best one
    paramKeys = []
    meanScores = []
    for i, j in ansDict["train"].items():
        paramKeys.append(i)
        wut = np.array(j)
        meanScores.append(wut.mean())
    meanScores = np.array(meanScores)
    meanScores[meanScores < 0] = 1.0
    print(meanScores.min())
    paramGood = paramKeys[np.where(meanScores == meanScores.min())[0][0]]
    testPred = ansDict["test"][paramGood]
    finalPred = np.vstack(testPred).mean(axis=0)

    def write_prediction(f):
        g = open("sc_prediction.csv", "w")
        for i in f:
            g.write(str(i) + "\n")
        g.close()

    write_prediction(finalPred)
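# The dictInd used in grid_searcher is a Cantor-style pairing: it folds the (C, gamma)
# grid position (ind, jnd) into a single unique integer so one flat dict can index the
# whole grid. A quick sketch of checking that no two grid cells collide:
#
#   seen = set()
#   for ind in range(13):          # len(C_range)
#       for jnd in range(9):       # len(gamma_range)
#           key = ((ind + jnd + 2) ** 2 + (ind + 1) - (jnd + 1)) / 2
#           assert key not in seen
#           seen.add(key)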
def blend_models(self):
    folders = ['model1', 'model2', 'model3', 'model4', 'model6', 'model7', 'model8',
               'model9', 'model10', 'model11', 'model12', 'model14', 'model15',
               'model16', 'model18', 'model19', 'model20', 'model21', 'model22']
    # Indices of the regression models, which have no predict_proba
    predict_insteads = (8, 9, 10, 11, 12, 13)
    models = []
    for folder in folders:
        model_hand = open(folder + '/' + 'model.pkl', 'r')
        models.append(pickle.load(model_hand))
        model_hand.close()
    for derp in self.cv_data:
        X_train, X_test, Y_train, Y_test = derp
        trainLen = Y_test.shape[0]
        modelLen = len(models)
        testLen = self.testMat.shape[0]
        trainBag = np.zeros([trainLen, modelLen], dtype=float)
        testBag = np.zeros([testLen, modelLen], dtype=float)
        for i in xrange(modelLen):
            model = models[i]
            if i in predict_insteads:
                model.predict_proba = model.predict
            trainPred = model.predict_proba(X_test)
            testPred = model.predict_proba(self.testMat)
            if len(trainPred.shape) > 1:
                trainPred = trainPred[:, 1]
                testPred = testPred[:, 1]
            trainBag[:, i] = trainPred
            testBag[:, i] = testPred
        rf = ExtraTreesClassifier(n_estimators=1000, n_jobs=self.cpus, oob_score=True,
                                  bootstrap=True, criterion='gini')
        rf.fit(trainBag, Y_test)
        print("Final score is %f" % (logloss(Y_test, rf.oob_decision_function_[:, 1])))
        test_final = rf.predict_proba(testBag)[:, 1]
        pred_hand = open('prediction.csv', 'w')
        for row in test_final:
            pred_hand.write(str(row) + '\n')
        pred_hand.close()
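# Usage sketch (assumption: these fit_model_* / blend_models methods live on one ensemble
# class, not shown here, that holds cv_data, testMat and cpus). blend_models expects each
# model<N>/model.pkl to already exist, so the individual fits must be run with toWrite=True
# first, e.g.:
#
#   ens = ModelEnsemble()            # hypothetical name for the surrounding class
#   ens.fit_model_1(toWrite=True)
#   ens.fit_model_2(toWrite=True)
#   # ... and so on for the remaining fit_model_* methods ...
#   ens.blend_models()               # prints the blended CV score and writes prediction.csv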
def modelg():
    # Some settings
    model = GradientBoostingClassifier(loss='deviance', subsample=.5, n_estimators=100000)
    # Import the data
    training = pandas.read_csv('train.csv')
    testing = pandas.read_csv('test.csv')
    # Sanity Check
    assert(np.all(training.columns[1:] == testing.columns))
    # Make it a ndarray and remove training labels
    trainingData = training.as_matrix()
    xTrain = trainingData[:, 1:]
    yTrain = trainingData[:, 0]
    testingData = testing.as_matrix()
    stratifiedCV = StratifiedKFold(yTrain, 10)
    scores = []
    pred = []
    for train, test in stratifiedCV:
        X_train, X_test, Y_train, Y_test = \
            xTrain[train, :], xTrain[test, :], yTrain[train], yTrain[test]
        model.fit(X_train, Y_train)
        accur = logloss(Y_test, model.predict_proba(X_test)[:, 1])
        scores.append(accur)
        print(accur)
        pred.append(model.predict_proba(testingData)[:, 1])
    meanScores = np.array(scores)
    print(meanScores.mean())
    finalPred = np.vstack(pred).mean(axis=0)

    def write_prediction(f):
        g = open('grad_prediction.csv', 'w')
        for i in f:
            g.write(str(i) + '\n')
        g.close()

    write_prediction(finalPred)
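# modelg() is a standalone entry point: it expects train.csv and test.csv in the working
# directory (first column of train.csv holding the 0/1 label) and writes grad_prediction.csv.
# A minimal driver sketch:
#
#   if __name__ == '__main__':
#       modelg()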
                          show_progress=False)
    preds = dict(zip(list(q.state_names['target']), list(q.values)))
    for key in preds.keys():
        samp_sub[key][i] = preds[key]
    samp_sub['sig_id'][i] = test_df['sig_id'][i]

train_targets = pd.read_csv('train_targets_scored.csv')
scores = []
for i in range(len(samp_sub)):
    pred = list(np.array(samp_sub.iloc[i][1:]))
    act = list(np.array(train_targets[train_targets['sig_id'] ==
                                      samp_sub['sig_id'].iloc[i]].drop('sig_id', axis=1))[0])
    scores.append(logloss(act, pred))
overall_logloss = np.mean(scores)

network = []
for node in nodes_added:
    if len(model.get_children(node)) != 0:
        for child in model.get_children(node):
            network.append((node, child))

# make first log
'''
data = {'Network': [network],
        'Nodes Count': [len(nodes_added)],
        'Edges Count': [len(network)],
        'Score': [overall_logloss]
        }
def model1():
    # Some settings
    cores = 8
    bags = 40
    nClassifiers = 80
    classifiers = [
        ExtraTreesClassifier(n_estimators=bags, n_jobs=cores, criterion="gini",
                             bootstrap=True, oob_score=True, max_depth=1),
        ExtraTreesClassifier(n_estimators=bags, n_jobs=cores, criterion="entropy",
                             bootstrap=True, oob_score=True, max_depth=1),
        RandomForestClassifier(n_estimators=bags, n_jobs=cores, criterion="gini",
                               bootstrap=True, oob_score=True, max_depth=1),
        RandomForestClassifier(n_estimators=bags, n_jobs=cores, criterion="entropy",
                               bootstrap=True, oob_score=True, max_depth=1),
    ]
    # Import the data
    training = pandas.read_csv("train.csv")
    testing = pandas.read_csv("test.csv")
    # Sanity Check
    assert np.all(training.columns[1:] == testing.columns)
    # Make it a ndarray and remove training labels
    trainingData = training.as_matrix()
    xTrain = trainingData[:, 1:]
    yTrain = trainingData[:, 0]
    testingData = testing.as_matrix()
    # Stores the outputs of our trees
    trainingBag = []
    testingBag = []
    for _ in xrange(nClassifiers):
        for c in classifiers:
            c.fit(xTrain, yTrain)
            decisionFunc = c.oob_decision_function_[:, 1]
            trainingBag.append(decisionFunc)
            testingBag.append(c.predict_proba(testingData)[:, 1])
    # Stacks the lists of arrays into matrices
    trainingBag = np.vstack(trainingBag).T
    testingBag = np.vstack(testingBag).T
    # Grid searching over alpha
    stratifiedCV = StratifiedKFold(yTrain, 10)
    ansDict = {}
    ansDict["train"] = {}
    ansDict["test"] = {}
    for ind, a in enumerate(np.arange(-10, 1, 1)):
        ansDict["train"][ind] = []
        ansDict["test"][ind] = []
        for train, test in stratifiedCV:
            X_train, X_test, Y_train, Y_test = (
                trainingBag[train, :],
                trainingBag[test, :],
                yTrain[train],
                yTrain[test],
            )
            sgd = SGDClassifier(loss="log", penalty="l1", n_iter=10000, alpha=10 ** a,
                                learning_rate="constant")
            sgd.fit(X_train, Y_train)
            tempPred = sgd.predict_proba(X_test)[:, 1]
            ansDict["train"][ind].append(logloss(Y_test, tempPred))
            ansDict["test"][ind].append(sgd.predict_proba(testingBag)[:, 1])
    meanScores = []
    for i, j in ansDict["train"].items():
        wut = np.array(j)
        meanScores.append(wut.mean())
    meanScores = np.array(meanScores)
    meanScores[meanScores < 0] = 1.0
    print(meanScores.min())
    paramGood = np.where(meanScores == meanScores.min())[0][0]
    testPred = ansDict["test"][paramGood]
    finalPred = np.vstack(testPred).mean(axis=0)

    def write_prediction(f):
        g = open("best_prediction.csv", "w")
        for i in f:
            g.write(str(i) + "\n")
        g.close()

    write_prediction(finalPred)
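# Every snippet above scores with a logloss() helper that is never defined in this excerpt.
# A minimal sketch of the binary log loss it appears to compute (assumption: y_true holds
# 0/1 labels and y_pred the predicted probability of the positive class):
def logloss(y_true, y_pred, eps=1e-15):
    y_true = np.asarray(y_true, dtype=float)
    # Clip predictions away from 0 and 1 so the logarithms stay finite
    y_pred = np.clip(np.asarray(y_pred, dtype=float), eps, 1 - eps)
    return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))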