This would end up reducing my true positive rate also when looking at the ROC curve.
'''
'''Just an FYI that age is a vastly different scale than the rest of the variables.
I am showing the plot and considering scaling it.'''

# Histogram of the age column, to eyeball its range before scaling.
newdata.age.plot(kind='hist', alpha=.3)

# Scaling age and fare.

from sklearn.preprocessing import RobustScaler

# Fit and apply the scaler in one step, then wrap the result in a DataFrame
# that reuses X's index so the index-based joins below line up with X's rows.
# NOTE(review): the scaler is fit on every row of X before the train/test
# split further down, so held-out rows influence the scaling -- confirm this
# is acceptable.
X_scaled = pd.DataFrame(
    RobustScaler().fit_transform(X[['age', 'fare']]),
    columns=['age', 'fare'],
    index=X.index,
)

# Join with the rest of the data: `dummies`, then the unscaled `sibsp` and
# `parch` columns taken straight from X.
X_scaled = X_scaled.join(dummies).join(X[['sibsp', 'parch']])

# NOTE(review): this summarises the original X, not X_scaled -- confirm intent.
X.info()

# Train/test split on the scaled data: 25% held out, stratified on y, with a
# fixed seed so the same rows land in each partition on every run.

X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(
    X_scaled,
    y,
    stratify=y,
    test_size=0.25,
    random_state=31,
)

#Grid Search Logistic regression
from sklearn.cross_validation import StratifiedKFold
grid_lr_scaled = GridSearchCV(lr,
                              logreg_parameters,
                              cv=StratifiedKFold(y_train,
                                                 n_folds=5,
                                                 shuffle=True),
Exemple #2
0
       u'people', u'perfect', u'performance', u'performances', u'picture',
       u'place', u'played', u'plot', u'point', u'pretty', u'probably',
       u'quite', u'read', u'real', u'really', u'reason', u'right', u'role',
       u'said', u'saw', u'say', u'scene', u'scenes', u'score', u'screen',
       u'script', u'second', u'seeing', u'seen', u'sense', u'set',
       u'shows', u'simply', u'special', u'special effects', u'star',
       u'star wars', u'start', u'story', u'sure', u'takes', u'thats',
       u'theres', u'thing', u'things', u'think', u'thought', u'time',
       u'times', u'trilogy', u'true', u'truly', u'trying', u'understand',
       u'use', u'used', u'violence', u'want', u'war', u'wars', u'wasnt',
       u'watch', u'watched', u'watching', u'way', u'wife', u'wonderful',
       u'work', u'world', u'worth', 'year_tfidf', u'years', u'young']
# Pull out the columns to be scaled, run them through RobustScaler, and wrap
# the result in a DataFrame that keeps the original column names and index.
# NOTE(review): the scaler is fit on all rows before the train/test split
# below, so held-out rows influence the scaling -- confirm this is acceptable.
X_prescale = X[features_to_scale]
X_scaled = pd.DataFrame(
    RobustScaler().fit_transform(X_prescale),
    columns=features_to_scale,
    index=X_prescale.index,
)

# Re-attach the columns that were deliberately left unscaled (index join).
X_final_scaled = X_scaled.join(X[features_to_not_scale])

# Print summaries of the scaled frame and of the original for comparison.
X_final_scaled.info()
X.info()
# Train/test split of the scaled data: 20% held out, seeded for repeatability.

(X_train_scaled,
 X_test_scaled,
 y_train_scaled,
 y_test_scaled) = train_test_split(
    X_final_scaled,
    y,
    test_size=0.2,
    random_state=31,
)

# So what is the baseline prediction?
# Parenthesised single-argument print gives the same output under Python 2.
print(y.mean())
# Bare expression: the counts are only displayed in an interactive session;
# when run as a script the result is discarded.
y.value_counts()

# Fraction of targets that are not equal to 10. float() keeps the division
# from truncating to an integer under Python 2.
baseline_not10 = 1 - y[y == 10].count() / float(y.count())

'''There are at least two possibilities I can think of for testing with the Classifier:
'''

'''Just an FYI that age is a vastly different scale than the rest of the variables.
I am showing the plot and considering scaling it.'''

# Histogram of the age column, to see its range before deciding on scaling.
newdata.age.plot(kind='hist', alpha=.3)

# Scaling age and fare.

from sklearn.preprocessing import RobustScaler

# Scale the two columns and rebuild a DataFrame around the result, reusing
# X's index so the index-based joins below match up with X's rows.
# NOTE(review): the scaler sees every row of X, including rows that end up in
# the test partition after the split below -- confirm this is acceptable.
X_scaled = pd.DataFrame(
    RobustScaler().fit_transform(X[['age', 'fare']]),
    columns=['age', 'fare'],
    index=X.index,
)

# Join with the rest of the data: `dummies` first, then the unscaled `sibsp`
# and `parch` columns from X.
X_scaled = X_scaled.join(dummies).join(X[['sibsp', 'parch']])

# NOTE(review): this summarises the original X, not X_scaled -- confirm intent.
X.info()

# Train/test split on the scaled data: hold out a quarter of the rows,
# stratifying on y, with a fixed seed so the partition is repeatable.

X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.25,
    stratify=y,
    random_state=31,
)

# Grid search for logistic regression on the scaled training data.
from sklearn.cross_validation import StratifiedKFold

# The folds are shuffled, so seed them: without a random_state every run draws
# different CV splits and the selected parameters are not reproducible.
# 31 is the same seed the train/test split in this script uses.
grid_lr_scaled = GridSearchCV(
    lr,
    logreg_parameters,
    cv=StratifiedKFold(y_train, n_folds=5, shuffle=True, random_state=31),
    n_jobs=-1,  # run the candidate fits in parallel on all available cores
    verbose=1,
)
grid_lr_scaled.fit(X_train_scaled, y_train)

# Report the refit best model and the parameter combination that produced it.
# Parenthesised single-argument print works under both Python 2 and Python 3.
print(grid_lr_scaled.best_estimator_)
print(grid_lr_scaled.best_params_)