def logistic_make_submission():
    """Fit a default ``lg`` classifier and write the submission CSV.

    The training frame comes from ``make_train_set`` over hard-coded date
    windows; the model is fitted on the 80% training part of a fixed-seed
    split.  Rows returned by ``make_test_set`` are then scored, and for every
    user with at least one row predicted as ``1`` the first ``sku_id`` is
    written to ``../sub/submissionLOG508.csv``.
    """
    # Date windows are hard-coded for this submission run.
    train_start, train_end = '2016-03-10', '2016-04-11'
    test_start, test_end = '2016-04-11', '2016-04-16'
    sub_start, sub_end = '2016-03-15', '2016-04-16'

    # The user index returned alongside the features is not needed here.
    _, features, labels = make_train_set(
        train_start, train_end, test_start, test_end)

    # Only the training part of the split is used; the held-out part is
    # discarded.
    X_fit, _, y_fit, _ = train_test_split(
        features.values, labels.values, test_size=0.2, random_state=0)
    y_fit = [int(value) for value in y_fit]

    clf = lg()  # use the class with all parameters left at their defaults
    clf.fit(X_fit, y_fit)

    submit_index, submit_features = make_test_set(sub_start, sub_end)
    submit_index['label'] = clf.predict(submit_features.values)

    # Keep the rows predicted as 1, then the first sku_id per user.
    is_positive = submit_index['label'] == 1
    picks = submit_index.loc[is_positive, ['user_id', 'sku_id']]
    picks = picks.groupby('user_id').first().reset_index()
    picks['user_id'] = picks['user_id'].astype(int)
    picks.to_csv('../sub/submissionLOG508.csv', index=False, index_label=False)
def predict_user(id_one, id_two, embedding):
    """Return the screen name of the user an embedding is classified as.

    An ``lg`` classifier is fitted on the stored embeddings of both users
    (class 0 for the first user, class 1 for the second) and then asked to
    classify ``embedding``.

    Args:
        id_one: ``id`` value used to look up the first ``User``.
        id_two: ``id`` value used to look up the second ``User``.
        embedding: A single sample to classify; it is passed to the model as
            a one-row batch.

    Returns:
        ``screen_name`` of the user whose class the model predicts.

    Raises:
        ValueError: If either id does not match a ``User`` row.
    """
    user_one = User.query.filter_by(id=id_one).first()
    user_two = User.query.filter_by(id=id_two).first()

    # Report exactly which lookup failed instead of raising a bare ValueError.
    missing = [
        repr(user_id)
        for user_id, user in ((id_one, user_one), (id_two, user_two))
        if user is None
    ]
    if missing:
        raise ValueError(
            "no User found for id(s): {}".format(", ".join(missing)))

    user_one_embeddings = user_one.embeddings()
    user_two_embeddings = user_two.embeddings()

    # Stack both users' samples and label them 0 / 1 in the same order.
    embeddings = np.vstack([user_one_embeddings, user_two_embeddings])
    targets = np.concatenate([
        np.zeros(len(user_one_embeddings)),
        np.ones(len(user_two_embeddings)),
    ])
    result_key = {
        0: user_one.screen_name,
        1: user_two.screen_name,
    }

    model = lg().fit(embeddings, targets)
    return result_key[model.predict([embedding])[0]]
def train():
    """Fit an ``lg(C=1e-3)`` model on the one-hot data and persist it.

    Loads ``X, y`` from ``dp.onehot_process()``, splits them with a fixed
    seed (17% held out), saves the four split arrays as ``.npy`` files in the
    current directory, fits the model on the training part and dumps it to
    ``logistic_model.sav`` with joblib.
    """
    print("load data ...")
    X, y = dp.onehot_process()
    print("finish loading data")

    # train_test_split returns (X_train, X_test, y_train, y_test); persist
    # each part under the matching file name, in that order.
    split_files = ("X_train.npy", "X_test.npy", "y_train.npy", "y_test.npy")
    split_arrays = train_test_split(X, y, test_size=0.17, random_state=1)
    for filename, array in zip(split_files, split_arrays):
        np.save(filename, array)
    X_train, _, y_train, _ = split_arrays

    model = lg(C=1e-3)
    print("train model ...")
    model.fit(X_train, y_train)
    print("finish training")

    # store the model
    joblib.dump(model, 'logistic_model.sav')
# Convert the non-numeric Embarked codes of the test frame to numbers
# (S -> 0, C -> 1, Q -> 2).
titanic_test.loc[titanic_test["Embarked"] == "S", "Embarked"] = 0
titanic_test.loc[titanic_test["Embarked"] == "C", "Embarked"] = 1
titanic_test.loc[titanic_test["Embarked"] == "Q", "Embarked"] = 2

# On to machine learning (building our model)
from sklearn.linear_model import LogisticRegression as lg
# sklearn.cross_validation was removed in scikit-learn 0.20; cross_val_score
# is provided by sklearn.model_selection.
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score as acc
# import numpy as np

# The columns we'll use to predict the target
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# Initialize our algorithm class and fit it on the full training frame.
# fit() returns the estimator itself, so `model` and `alg` are the same object.
alg = lg()
model = alg.fit(titanic[predictors], titanic["Survived"])

train_predictors = titanic[predictors]
# The target we're using to train the algorithm.
train_target = titanic["Survived"]

# 10-fold cross-validation scores for the same estimator configuration.
scores = cross_val_score(model, train_predictors, train_target, cv=10)
# print() with a single argument behaves the same on Python 2 and Python 3.
print(scores)
print(scores.mean())

predictions = alg.predict(titanic_test[predictors])
print(predictions)

# Create a new dataframe with only the columns Kaggle wants from the dataset.
# Everything this section needs from scikit-learn, imported up front.
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression as lg
from sklearn.metrics import confusion_matrix

# Hold out a quarter of the samples as the test set (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0)

# Feature scaling: the scaler is fitted on the training split only and the
# same transform is then applied to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# Logistic regression fitted on the scaled training split.
classifier = lg(random_state=0).fit(X_train, y_train)

# Test-set predictions and their confusion matrix.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# 10-fold cross-validation on the training split: report mean and spread.
accuracies = cross_val_score(classifier, X_train, y_train, cv=10)
print("mean accuracy is", accuracies.mean())
print(accuracies.std())
# NOTE(review): this fragment arrived with its line breaks and indentation
# collapsed onto a single line, so the nesting of the statements below cannot
# be recovered from here (in particular whether `acc[j, 3] = ...` belongs to
# the `if do_pca:` body). Restore the layout from the original file before
# running; the code text itself is left untouched.
#
# What the visible statements do, in order:
#   * split X, Y into train / validation / test parts via tvt();
#   * when do_pca is set, fit PCA(n_components=n_pca) on the training part and
#     apply that same fitted transform to the validation and test parts;
#   * store the size of trn_x along axis 1 in acc[j, 3];
#   * create a BeginClass object, call lst() on it, and for each of the 50
#     evenly spaced C values in [0.0001, 5] pass an lg(penalty='l1', C=c)
#     model plus the train and validation parts to its appen() method;
#   * call locate() and read lgc.param[lgc.plac] back into c.
# The `# sr0 ...` / `# a ...` lines are disabled bookkeeping for an MSE figure.
# NOTE(review): if `lg` is scikit-learn's LogisticRegression, penalty='l1'
# needs solver='liblinear' or 'saga' on scikit-learn >= 0.22 — confirm.
trn_x, trn_y, val_x, val_y, tst_x, tst_y = tvt(X, Y) if do_pca: r = PCA(n_components=n_pca) trn_x = r.fit_transform(trn_x) val_x = r.transform(val_x) tst_x = r.transform(tst_x) acc[j, 3] = np.size(trn_x, axis=1) lgc = BeginClass() lgc.lst() # sr0 = np.zeros((10, 4)) # a = 0 for c in np.linspace(.0001, 5, 50): lgr = lg(penalty='l1', C=c) # lgr = lg(C=c, kernel='linear') # for c in [1000]: # lgr = ADA(n_estimators=c) lgc.appen(model=lgr, param=c, trnx=trn_x, trny=trn_y, valx=val_x, valy=val_y) ############ # # Use for printing MSE figure for CV # lgr.fit(trn_x, trn_y) # sr0[a, 1] = modResid(lgr, trn_x, trn_y)[1] #returns the MSE # sr0[a, 2] = modResid(lgr, val_x, val_y)[1] # sr0[a, 3] = modResid(lgr, tst_x, tst_y)[1] # sr0[a, 0] = c # a += 1 lgc.locate() c = lgc.param[lgc.plac]
# Features: every column except the first and the last.
X = df.iloc[:, 1:-1].values
# Target: the last column. The `-1:` slice keeps it two-dimensional, i.e. a
# column vector of shape (n_samples, 1).
y = df.iloc[:, -1:].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0)

# Feature Scaling (scaler statistics come from the training split only)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fitting logistic regression to the Training set
from sklearn.linear_model import LogisticRegression as lg
classifier = lg(random_state=0)
# scikit-learn estimators expect a 1-D target; flattening the column vector
# here avoids the DataConversionWarning emitted for (n_samples, 1) targets
# while leaving the module-level y / y_train arrays unchanged.
classifier.fit(X_train, y_train.ravel())

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Applying k-Fold Cross Validation (same 1-D target as for the fit above,
# otherwise the warning would repeat for every fold)
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(
    estimator=classifier, X=X_train, y=y_train.ravel(), cv=10)
print("mean accuracy is", accuracies.mean())
print(accuracies.std())
# Imports for the two classifiers introduced in this section.
from sklearn.linear_model import LogisticRegression as lg
from sklearn.svm import SVC

# --- RFC: 10 estimators, entropy criterion, fixed seed ---
clfrfc = RFC(n_estimators=10, criterion='entropy', random_state=0)
clfrfc.fit(features_train, labels_train)
predrfc = clfrfc.predict(features_test)
Scorerfc = clfrfc.score(features_test, labels_test)
cmrfc = confusion_matrix(labels_test, predrfc)

# --- Logistic regression fitted on the same training set ---
clflg = lg(random_state=0).fit(features_train, labels_train)
predlg = clflg.predict(features_test)
Scorelg = clflg.score(features_test, labels_test)
cmlg = confusion_matrix(labels_test, predlg)

# --- SVM with an RBF kernel ---
clfsvc = SVC(kernel='rbf', random_state=0).fit(features_train, labels_train)
labels_pred = clfsvc.predict(features_test)
# NOTE(review): this fragment arrived with its line breaks and indentation
# collapsed onto a single line and it stops inside a run of commented-out
# lines, so the nesting of the statements below cannot be recovered from here
# (in particular whether `acc[j, 3] = ...` belongs to the `if do_pca:` body).
# Restore the layout from the original file before running; the code text
# itself is left untouched.
#
# What the visible statements do, in order:
#   * split X, Y into train / validation / test parts via tvt();
#   * when do_pca is set, fit PCA(n_components=n_pca) on the training part and
#     apply that same fitted transform to the validation and test parts;
#   * store the size of trn_x along axis 1 in acc[j, 3];
#   * create a BeginClass object, call lst() on it, and for each of the 50
#     evenly spaced C values in [0.0001, 5] pass an lg(penalty='l1', C=c)
#     model plus the train and validation parts to its appen() method.
# The `# sr0 ...` / `# a ...` lines are disabled bookkeeping for an MSE figure.
# NOTE(review): if `lg` is scikit-learn's LogisticRegression, penalty='l1'
# needs solver='liblinear' or 'saga' on scikit-learn >= 0.22 — confirm.
trn_x, trn_y, val_x, val_y, tst_x, tst_y = tvt(X, Y) if do_pca: r = PCA(n_components=n_pca) trn_x = r.fit_transform(trn_x) val_x = r.transform(val_x) tst_x = r.transform(tst_x) acc[j, 3] = np.size(trn_x, axis=1) lgc = BeginClass() lgc.lst() # sr0 = np.zeros((10, 4)) # a = 0 for c in np.linspace(.0001, 5, 50): lgr = lg(penalty='l1', C=c) # lgr = lg(C=c, kernel='linear') # for c in [1000]: # lgr = ADA(n_estimators=c) lgc.appen(model=lgr, param=c, trnx=trn_x, trny=trn_y, valx=val_x, valy=val_y) ############ # # Use for printing MSE figure for CV # lgr.fit(trn_x, trn_y) # sr0[a, 1] = modResid(lgr, trn_x, trn_y)[1] #returns the MSE # sr0[a, 2] = modResid(lgr, val_x, val_y)[1] # sr0[a, 3] = modResid(lgr, tst_x, tst_y)[1]
# ## Using train_test_split procedure

# ### Accuracy measure

# In[20]:

# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# is provided by sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn import metrics

# 70/30 split; no random_state is set, so the split differs between runs.
xtr, xtest, ytr, ytest = train_test_split(X, y, test_size=0.3)

# ### Logistic Regression

# In[21]:

# LogisticRegression
from sklearn.linear_model import LogisticRegression as lg

model = lg()
model.fit(xtr, ytr)
yPred = model.predict(xtest)
acc = metrics.accuracy_score(ytest, yPred)
# The original cell refitted the model on the same data a second time here;
# that repeat fit did redundant work and has been dropped.

# In[22]:

# print() with a single argument behaves the same on Python 2 and Python 3.
print(acc)

# ## Null accuracy
# * what is the percentage of maximum class

# In[23]:

# Share of the more frequent class in the test labels.
# NOTE(review): this formula assumes ytest holds 0/1 labels — confirm.
null_acc = max(ytest.mean(), 1 - ytest.mean())
#encode y to zeroes and ones y=le().fit_transform(y) """ """ #remove variable trap x=x[:,1:] """ """ #take some values as training and predict output of some test cases x_train , x_test , y_train , y_test = tts(x,y,test_size=0.2,random_state=0) """ print("x:\n" ,x) #to make linear regression lin_reg=lg() lin_reg.fit(x,y) #to make polynomial regression poly_reg=pf(degree=7) x_poly=poly_reg.fit_transform(x) lin_reg_2=lg() lin_reg_2.fit(x_poly,y) plt.scatter(x,y,color='red') plt.plot(x,lin_reg.predict(x),color ='black') x_grid=np.arange(min(x),max(x),0.01) x_grid=x_grid.reshape(len(x_grid),1) plt.scatter(x,y,color='red')
# Disabled preprocessing step (was wrapped in a string literal): fill NaN
# values with the column mean.
# x[: , 1: ]= sip(missing_values=np.nan,strategy='mean').fit_transform(x[: , 1: ])

#check dataset
print("dataset:\n" , dataset)

# Disabled preprocessing steps (were wrapped in a string literal): one-hot
# encode x and label-encode y.
# x = ct([('Country', ohe(), [0])], remainder = 'passthrough').fit_transform(x)
# y=le().fit_transform(y)

#take some values as training and predict output of some test cases
x_train, x_test, y_train, y_test = tts(x, y, test_size=0.2, random_state=0)

LinReg = lg()
LinReg.fit(x_train, y_train)
y_predict = LinReg.predict(x_test)
y_predict_train = LinReg.predict(x_train)

# Figure 1: training observations in red with the fitted line in blue.
# The original drew this same line twice, recomputing the predictions for the
# second call; one plot call over the stored predictions is equivalent.
plt.scatter(x_train, y_train, color='red')
plt.plot(x_train, y_predict_train, color='blue')
plt.show()

# Figure 2: test observations in black against the same fitted line.
# The original first drew the line in orange and then drew the identical line
# in blue on top of it, so only the blue one was ever visible.
plt.scatter(x_test, y_test, color='black')
plt.plot(x_train, y_predict_train, color='blue')
plt.show()
#         bin_set[1].append(train_set[1][x])

# sort out the 0s and 1s of test set
# The filtering loop below is disabled, so bin_test_set is simply another
# name for test_set (same object, no copy).
bin_test_set = test_set
# for x in range(10000):
#     if(test_set[1][x] <=1):
#         bin_test_set [0].append(test_set[0][x])
#         bin_test_set [1].append(test_set[1][x])

# show the image and label
# for x in range(0,10):
#     print(bin_set[1][x])
    #plt.imshow(bin_set[0][x].reshape((28, 28)), cmap=cm.Greys_r)
    #plt.show()

# Build the classifier and fit it in one expression on the samples
# (bin_set[0]) and their labels (bin_set[1]).
# print(len(bin_test_set[0]))
# print("part 1")
logReg = lg(solver="lbfgs", multi_class="auto", max_iter=50000).fit(
    bin_set[0], bin_set[1])
# print("part 2")

# # Predict one image
# for x in range(2115):
#     pred = logReg.predict(bin_test_set[0][x].reshape(1,-1))
#     if(pred != bin_test_set[1][x]):
#         print("prediction: {}".format(pred))
#         print("actual: {}".format(bin_test_set[1][x]))