# main.py
import numpy as np
import pandas
from sklearn.linear_model import LinearRegression, LogisticRegression
# sklearn.cross_validation was removed in scikit-learn 0.20; KFold and
# cross_val_score now live in sklearn.model_selection.
from sklearn.model_selection import KFold, cross_val_score
def clean_data(titanic, titanic_median_age):
    # Fill missing ages with the training-set median age passed in.
    titanic["Age"] = titanic["Age"].fillna(titanic_median_age)
    # Fill missing fares with this frame's own median fare.
    titanic["Fare"] = titanic["Fare"].fillna(titanic["Fare"].median())
    # Encode "Sex" numerically: male -> 0, female -> 1.
    titanic["Sex"] = titanic["Sex"].map({"male": 0, "female": 1})
    # "Embarked" has three ports (S, C, Q); fill the few missing values with
    # the most common port, "S", then encode numerically.
    titanic["Embarked"] = titanic["Embarked"].fillna("S").map({"S": 0, "C": 1, "Q": 2})
    return titanic
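# A quick illustrative check on hypothetical data (a sketch, not part of the
# pipeline and never called by it): after clean_data, every column is numeric,
# which the sklearn estimators below require.
def _clean_data_example():
    toy = pandas.DataFrame({
        "Age": [22.0, None],
        "Fare": [7.25, None],
        "Sex": ["male", "female"],
        "Embarked": ["S", None],
    })
    cleaned = clean_data(toy, 28.0)
    print(cleaned.dtypes)  # all columns numeric after imputation and encoding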
def linear_regression(predictors, titanic):
    # Initialize our algorithm class.
    alg = LinearRegression()
    # Generate cross-validation folds for the titanic dataset. Each split
    # yields the row indices of the train and test folds. Without shuffling,
    # the folds are contiguous and deterministic, so concatenating the fold
    # predictions preserves the original row order.
    kf = KFold(n_splits=3)
    predictions = []
    for train, test in kf.split(titanic):
        # The predictors we're using to train the algorithm. Note how we only
        # take the rows in the train folds.
        train_predictors = titanic[predictors].iloc[train, :]
        # The target we're using to train the algorithm.
        train_target = titanic["Survived"].iloc[train]
        # Train the algorithm using the predictors and target.
        alg.fit(train_predictors, train_target)
        # We can now make predictions on the test fold.
        test_predictions = alg.predict(titanic[predictors].iloc[test, :])
        predictions.append(test_predictions)
    # The predictions are in three separate numpy arrays. Concatenate them
    # into one on axis 0, as they only have one axis.
    predictions = np.concatenate(predictions, axis=0)
    # Map predictions to outcomes (the only possible outcomes are 1 and 0).
    predictions[predictions > .5] = 1
    predictions[predictions <= .5] = 0
    # Out-of-fold accuracy: the fraction of rows where the thresholded
    # prediction matches the true label.
    accuracy = (predictions == titanic["Survived"]).mean()
    return accuracy
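# A minimal alternative sketch of the same idea (assumption: scikit-learn
# provides sklearn.model_selection.cross_val_predict, available since 0.18):
# it returns one out-of-fold prediction per row, already aligned with the
# original row order, so no manual concatenation is needed.
def linear_regression_cvp(predictors, titanic):
    from sklearn.model_selection import cross_val_predict
    raw = cross_val_predict(LinearRegression(), titanic[predictors],
                            titanic["Survived"], cv=3)
    labels = (raw > .5).astype(int)
    return (labels == titanic["Survived"]).mean()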
def logistic_regression(predictors, titanic, titanic_test):
    # Initialize our algorithm. max_iter is raised so the default solver
    # converges on these unscaled features.
    alg = LogisticRegression(random_state=1, max_iter=1000)
    # Compute the accuracy score for all the cross-validation folds
    # (much simpler than what we did before!).
    scores = cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
    # Take the mean of the scores (because we have one for each fold).
    print(scores.mean())
    # Train the algorithm using all the training data.
    alg.fit(titanic[predictors], titanic["Survived"])
    # Make predictions using the test set.
    predictions = alg.predict(titanic_test[predictors])
    return predictions
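# An alternative sketch (assumption: sklearn.pipeline.make_pipeline and
# sklearn.preprocessing.StandardScaler, both standard scikit-learn APIs):
# wrapping the estimator in a pipeline with feature scaling usually lets
# LogisticRegression converge without raising max_iter, and the scaler is
# re-fit inside each cross-validation fold, avoiding leakage. Not called by
# the pipeline below.
def logistic_regression_scaled(predictors, titanic, titanic_test):
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    alg = make_pipeline(StandardScaler(), LogisticRegression(random_state=1))
    scores = cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
    print(scores.mean())
    alg.fit(titanic[predictors], titanic["Survived"])
    return alg.predict(titanic_test[predictors])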
def create_submission(titanic, predictions):
    # Create a new dataframe with only the columns Kaggle wants.
    submission = pandas.DataFrame({
        "PassengerId": titanic["PassengerId"],
        "Survived": predictions
    })
    submission.to_csv("data/kaggle.csv", index=False)
# ==================================================================================
titanic = pandas.read_csv("data/train.csv")
titanic_test = pandas.read_csv("data/test.csv")
titanic_median_age = titanic["Age"].median()
titanic = clean_data(titanic, titanic_median_age)
titanic_test = clean_data(titanic_test, titanic_median_age)
# The columns we'll use to predict the target
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
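# Optional baseline (a sketch): report the out-of-fold accuracy of the linear
# model before running the logistic regression.
print("Linear regression CV accuracy:", linear_regression(predictors, titanic))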
predictions = logistic_regression(predictors, titanic, titanic_test)
create_submission(titanic_test, predictions)