Example #1
# make_train_set / make_test_set below are project-specific feature builders
from sklearn.linear_model import LogisticRegression as lg
from sklearn.model_selection import train_test_split


def logistic_make_submission():
    train_start_date = '2016-03-10'
    train_end_date = '2016-04-11'
    test_start_date = '2016-04-11'
    test_end_date = '2016-04-16'

    sub_start_date = '2016-03-15'
    sub_end_date = '2016-04-16'

    user_index, training_data, label = make_train_set(train_start_date,
                                                      train_end_date,
                                                      test_start_date,
                                                      test_end_date)
    X_train, X_test, y_train, y_test = train_test_split(training_data.values,
                                                        label.values,
                                                        test_size=0.2,
                                                        random_state=0)

    y_train = list(map(int, y_train))
    # print(np.any(np.isnan(X_train)))
    # print(np.all(np.isfinite(X_train)))
    clf = lg()  # instantiate the classifier with all default parameters
    clf.fit(X_train, y_train)

    sub_user_index, sub_training_data = make_test_set(sub_start_date,
                                                      sub_end_date)

    y_hat = clf.predict(sub_training_data.values)
    sub_user_index['label'] = y_hat
    pred = sub_user_index[sub_user_index['label'] == 1]
    pred = pred[['user_id', 'sku_id']]
    pred = pred.groupby('user_id').first().reset_index()
    pred['user_id'] = pred['user_id'].astype(int)
    pred.to_csv('../sub/submissionLOG508.csv', index=False, index_label=False)
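# make_train_set / make_test_set are not shown anywhere on this page, so the
# function above is not runnable on its own. A minimal self-contained sketch of
# the same fit-then-submit pattern, with synthetic stand-ins for those helpers
# (all names below are illustrative, not from the original project):
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 5))                # stand-in engineered features
y = (X[:, 0] + X[:, 1] > 0).astype(int)       # stand-in purchase labels

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
clf = LogisticRegression().fit(X_tr, y_tr)

sub = pd.DataFrame({"user_id": np.arange(len(X_te)), "label": clf.predict(X_te)})
sub = sub[sub["label"] == 1][["user_id"]]     # keep only predicted buyers
sub.to_csv("submission_sketch.csv", index=False)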
Example #2
def predict_user(id_one, id_two, embedding):
    user_one = User.query.filter_by(id=id_one).first()
    user_two = User.query.filter_by(id=id_two).first()
    if user_one is None or user_two is None:
        raise ValueError("user not found")
    user_one_embeddings = user_one.embeddings()
    user_two_embeddings = user_two.embeddings()
    embeddings = np.vstack([user_one_embeddings, user_two_embeddings])
    targets = np.concatenate([
        np.zeros(len(user_one_embeddings)), 
        np.ones(len(user_two_embeddings))
    ])
    result_key = {
        0: user_one.screen_name,
        1: user_two.screen_name
    }
    # fit a binary classifier that separates the two users' embeddings, then
    # report which user the new embedding most resembles
    model = lg().fit(embeddings, targets)
    return result_key[model.predict([embedding])[0]]
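# A hypothetical call site (`app` and `tweet_embedding` are assumptions, not
# from the original): the User queries need a live Flask-SQLAlchemy context,
# and the embedding must match the dimensionality of the stored ones.
with app.app_context():
    name = predict_user(1, 2, tweet_embedding)
    print("more likely author:", name)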
Example #3
# `dp` is a project-specific preprocessing module providing onehot_process()
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression as lg


def train():
    print("load data ...")
    X, y = dp.onehot_process()
    print("finish loading data")

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.17,
                                                        random_state=1)

    np.save("X_train.npy", X_train)
    np.save("X_test.npy", X_test)
    np.save("y_train.npy", y_train)
    np.save("y_test.npy", y_test)

    model = lg(C=1e-3)
    print("train model ...")
    model.fit(X_train, y_train)
    print("finish training")

    # store the model
    joblib.dump(model, 'logistic_model.sav')
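# Restoring the persisted model later is the mirror image of the dump above;
# joblib.load returns the fitted estimator, ready to score or predict:
X_test = np.load("X_test.npy")
y_test = np.load("y_test.npy")
model = joblib.load('logistic_model.sav')
print("held-out accuracy:", model.score(X_test, y_test))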
Example #4
# Converting non-numeric values of Embarked to numeric codes
titanic_test.loc[titanic_test["Embarked"] == "S", "Embarked"] = 0
titanic_test.loc[titanic_test["Embarked"] == "C", "Embarked"] = 1
titanic_test.loc[titanic_test["Embarked"] == "Q", "Embarked"] = 2

# On to machine learning (building our model)
from sklearn.linear_model import LogisticRegression as lg
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score as acc
# import numpy as np

# The columns we'll use to predict the target
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# Initialize our algorithm class
alg = lg()
model = alg.fit(titanic[predictors], titanic["Survived"])

train_predictors = titanic[predictors]

# The target we're using to train the algorithm.
train_target = titanic["Survived"]

scores = cross_val_score(model, train_predictors, train_target, cv=10)
print(scores)
print(scores.mean())

predictions = alg.predict(titanic_test[predictors])
print(predictions)

# Create a new dataframe with only the columns Kaggle wants from the dataset.
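# The snippet ends before that dataframe is built. A minimal completion,
# assuming the usual Titanic columns (PassengerId on the test set, Survived
# as the prediction column Kaggle expects):
import pandas as pd

submission = pd.DataFrame({
    "PassengerId": titanic_test["PassengerId"],
    "Survived": predictions.astype(int),
})
submission.to_csv("titanic_submission.csv", index=False)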
Example #5
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fitting logistic regression to the Training set
from sklearn.linear_model import LogisticRegression as lg
classifier = lg(random_state=0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
print("mean accuracy is", accuracies.mean())
print(accuracies.std())
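# One caveat in the block above: the scaler was fit on all of X_train before
# cross_val_score re-splits it, so each fold's validation rows leak into the
# scaling statistics. A leak-free variant wraps scaling and the classifier in
# a Pipeline, so the scaler is refit inside every fold (ideally fed the
# pre-scaling training split; variable names reused from above):
from sklearn.pipeline import make_pipeline

pipe = make_pipeline(StandardScaler(), lg(random_state=0))
accuracies = cross_val_score(estimator=pipe, X=X_train, y=y_train, cv=10)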
Example #6
    # tvt: project-specific train/validation/test splitter
    trn_x, trn_y, val_x, val_y, tst_x, tst_y = tvt(X, Y)

    if do_pca:
        r = PCA(n_components=n_pca)
        trn_x = r.fit_transform(trn_x)
        val_x = r.transform(val_x)
        tst_x = r.transform(tst_x)
        acc[j, 3] = np.size(trn_x, axis=1)

    lgc = BeginClass()
    lgc.lst()

    # sr0 = np.zeros((10, 4))
    # a = 0
    for c in np.linspace(.0001, 5, 50):
        lgr = lg(penalty='l1', C=c)  # note: recent scikit-learn needs solver='liblinear' or 'saga' for l1
    #     lgr = lg(C=c, kernel='linear')
    # for c in [1000]:
    #     lgr = ADA(n_estimators=c)
        lgc.appen(model=lgr, param=c, trnx=trn_x, trny=trn_y, valx=val_x, valy=val_y)
    ############
        # # Use for printing MSE figure for CV
        # lgr.fit(trn_x, trn_y)
        # sr0[a, 1] = modResid(lgr, trn_x, trn_y)[1] #returns the MSE
        # sr0[a, 2] = modResid(lgr, val_x, val_y)[1]
        # sr0[a, 3] = modResid(lgr, tst_x, tst_y)[1]
        # sr0[a, 0] = c
        # a += 1

    lgc.locate()
    c = lgc.param[lgc.plac]
Example #7
X = df.iloc[: , 1:-1].values
y = df.iloc[: , -1:].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fitting logistic regression to the Training set
from sklearn.linear_model import LogisticRegression as lg
classifier = lg(random_state = 0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
print ("mean accuracy is",accuracies.mean())
print (accuracies.std())
Example #8
# random forest (RFC is presumably sklearn's RandomForestClassifier, imported
# in the truncated part of this snippet)
clfrfc = RFC(n_estimators=10, criterion='entropy', random_state=0)
clfrfc.fit(features_train, labels_train)

predrfc = clfrfc.predict(features_test)

Scorerfc = clfrfc.score(features_test, labels_test)
cmrfc = confusion_matrix(labels_test, predrfc)


# Fitting logistic regression to the Training set
from sklearn.linear_model import LogisticRegression as lg
clflg = lg(random_state=0)
clflg.fit(features_train, labels_train)

predlg = clflg.predict(features_test)

Scorelg = clflg.score(features_test, labels_test)
cmlg = confusion_matrix(labels_test, predlg)

# SVM
from sklearn.svm import SVC
clfsvc = SVC(kernel='rbf', random_state=0)
clfsvc.fit(features_train, labels_train)

labels_pred = clfsvc.predict(features_test)
Example #9
    # tvt: project-specific train/validation/test splitter
    trn_x, trn_y, val_x, val_y, tst_x, tst_y = tvt(X, Y)

    if do_pca:
        r = PCA(n_components=n_pca)
        trn_x = r.fit_transform(trn_x)
        val_x = r.transform(val_x)
        tst_x = r.transform(tst_x)
        acc[j, 3] = np.size(trn_x, axis=1)

    lgc = BeginClass()
    lgc.lst()

    # sr0 = np.zeros((10, 4))
    # a = 0
    for c in np.linspace(.0001, 5, 50):
        lgr = lg(penalty='l1', C=c)  # note: recent scikit-learn needs solver='liblinear' or 'saga' for l1
        #     lgr = lg(C=c, kernel='linear')
        # for c in [1000]:
        #     lgr = ADA(n_estimators=c)
        lgc.appen(model=lgr,
                  param=c,
                  trnx=trn_x,
                  trny=trn_y,
                  valx=val_x,
                  valy=val_y)
    ############
    # # Use for printing MSE figure for CV
    # lgr.fit(trn_x, trn_y)
    # sr0[a, 1] = modResid(lgr, trn_x, trn_y)[1] #returns the MSE
    # sr0[a, 2] = modResid(lgr, val_x, val_y)[1]
    # sr0[a, 3] = modResid(lgr, tst_x, tst_y)[1]
Example #10
# ## Using train_test_split procedure
# ### Accuracy measure

# In[20]:

from sklearn.model_selection import train_test_split
from sklearn import metrics
xtr, xtest, ytr, ytest = train_test_split(X, y, test_size=0.3)

# ### Logistic Regression

# In[21]:

# LogisticRegression
from sklearn.linear_model import LogisticRegression as lg
model = lg()
model.fit(xtr, ytr)
yPred = model.predict(xtest)
acc = metrics.accuracy_score(ytest, yPred)

# In[22]:

print(acc)

# ## Null accuracy
# * what is the percentage of maximum class

# In[23]:

null_acc = max(ytest.mean(), 1 - ytest.mean())
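# Null accuracy is what always predicting the majority class would score; the
# fitted model is only doing real work if acc clears this baseline:
print("model accuracy:", acc)
print("null accuracy: ", null_acc)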
Example #11
# NOTE: the aliases in this snippet evidently map to sklearn names
# (le: LabelEncoder, tts: train_test_split, lg: LinearRegression,
#  pf: PolynomialFeatures); the imports sit in the truncated part of the file.

# encode y to zeroes and ones (disabled)
# y = le().fit_transform(y)

# remove the dummy-variable trap (disabled)
# x = x[:, 1:]

# take some values as training and predict output of some test cases (disabled)
# x_train, x_test, y_train, y_test = tts(x, y, test_size=0.2, random_state=0)

print("x:\n", x)

# plain linear regression on the raw feature
lin_reg = lg()
lin_reg.fit(x, y)

# polynomial regression: expand x to degree-7 terms, then fit linearly
poly_reg = pf(degree=7)
x_poly = poly_reg.fit_transform(x)

lin_reg_2 = lg()
lin_reg_2.fit(x_poly, y)

plt.scatter(x, y, color='red')
plt.plot(x, lin_reg.predict(x), color='black')

x_grid = np.arange(min(x), max(x), 0.01)
x_grid = x_grid.reshape(len(x_grid), 1)
plt.scatter(x, y, color='red')
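# The snippet cuts off before the degree-7 curve is drawn; the usual
# completion evaluates lin_reg_2 on the poly-expanded grid:
plt.plot(x_grid, lin_reg_2.predict(poly_reg.transform(x_grid)), color='black')
plt.show()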
Example #12
# NOTE: as in the previous example, the aliases evidently map to sklearn names
# (sip: SimpleImputer, ct: ColumnTransformer, ohe: OneHotEncoder,
#  le: LabelEncoder, tts: train_test_split, lg: LinearRegression).

# fill NaN values with the column mean (disabled)
# x[:, 1:] = sip(missing_values=np.nan, strategy='mean').fit_transform(x[:, 1:])

# check dataset
print("dataset:\n", dataset)

# encode x to zeroes and ones (disabled)
# x = ct([('Country', ohe(), [0])], remainder='passthrough').fit_transform(x)

# encode y to zeroes and ones (disabled)
# y = le().fit_transform(y)

# take some values as training and predict output of some test cases
x_train, x_test, y_train, y_test = tts(x, y, test_size=0.2, random_state=0)
LinReg = lg()
LinReg.fit(x_train, y_train)

y_predict = LinReg.predict(x_test)
y_predict_train = LinReg.predict(x_train)

# training fit over the training scatter
plt.scatter(x_train, y_train, color='red')
plt.plot(x_train, y_predict_train, color='blue')
plt.show()

# same fitted line against the held-out test scatter
plt.scatter(x_test, y_test, color='black')
plt.plot(x_train, y_predict_train, color='orange')
plt.show()
Example #13
#         bin_set[1].append(train_set[1][x])

# sort out the 0s and 1s of test set
bin_test_set = test_set
# for x in range(10000):
#     if(test_set[1][x] <=1):
#         bin_test_set [0].append(test_set[0][x])
#         bin_test_set [1].append(test_set[1][x])

# show the image and label
#   for x in range(0,10):
#     print(bin_set[1][x])
#plt.imshow(bin_set[0][x].reshape((28, 28)), cmap=cm.Greys_r)
#plt.show()

# bin_set comes from the filtering code truncated above
logReg = lg(solver="lbfgs", multi_class="auto", max_iter=50000)  # lg: LogisticRegression

# print(len(bin_test_set[0]))

# print("part 1")

logReg.fit(bin_set[0], bin_set[1])

# print("part 2")

# # Predict one image
# for x in range(2115):
#     pred = logReg.predict(bin_test_set[0][x].reshape(1,-1))
#     if(pred != bin_test_set[1][x]):
#         print("prediction: {}".format(pred))
#         print("actual: {}".format(bin_test_set[1][x]))