# NOTE(review): this chunk arrived truncated at both edges -- the opening
# ''' of the first docstring and the closing arguments of the final
# GridSearchCV call were missing.  Both were reconstructed from the
# duplicated, complete copy of this same code later in the file; confirm
# against the original notebook.
'''This would end up reducing my true positive rate also when looking at
the ROC curve.'''

'''Just an FYI that age is a vastly different scale than the rest of the
variables. I am showing the plot and considering scaling it.'''
newdata.age.plot(kind='hist', alpha=.3)

# Scale age and fare.  RobustScaler centers on the median and scales by the
# IQR, so it is less sensitive to outliers than StandardScaler.
from sklearn.preprocessing import RobustScaler

X_scaled = RobustScaler().fit_transform(X[['age', 'fare']])
# fit_transform returns a bare ndarray; rebuild a DataFrame so the scaled
# columns keep their names and line up with X's index for the joins below.
X_scaled = pd.DataFrame(X_scaled, columns=['age', 'fare'], index=X.index)

# Join with rest of the data (dummy-encoded categoricals + count columns).
X_scaled = X_scaled.join(dummies)
X_scaled = X_scaled.join(X[['sibsp', 'parch']])
# BUG FIX: the original called X.info() here, inspecting the *unscaled*
# frame; the frame just assembled is X_scaled.
X_scaled.info()

# Train/test split on the scaled data, stratified to preserve the class
# balance of y in both partitions.
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(
    X_scaled, y, test_size=.25, stratify=y, random_state=31)

# Grid-search the logistic regression over the scaled features.
# NOTE(review): sklearn.cross_validation is the pre-0.18 API; modern
# scikit-learn moved StratifiedKFold to sklearn.model_selection with a
# different signature (labels passed to split(), not the constructor).
# Left as-is to match the installed version this notebook ran against.
from sklearn.cross_validation import StratifiedKFold

grid_lr_scaled = GridSearchCV(
    lr, logreg_parameters,
    cv=StratifiedKFold(y_train, n_folds=5, shuffle=True),
    n_jobs=-1, verbose=1)
# NOTE(review): this chunk arrived truncated at the head -- the assignment
# target and opening bracket of this tf-idf feature-name list were cut off.
# The statements immediately below index X with `features_to_scale` and use
# it as the scaled columns, so that is the reconstructed target; confirm
# against the original notebook.
features_to_scale = [
    u'people', u'perfect', u'performance', u'performances', u'picture',
    u'place', u'played', u'plot', u'point', u'pretty', u'probably', u'quite',
    u'read', u'real', u'really', u'reason', u'right', u'role', u'said',
    u'saw', u'say', u'scene', u'scenes', u'score', u'screen', u'script',
    u'second', u'seeing', u'seen', u'sense', u'set', u'shows', u'simply',
    u'special', u'special effects', u'star', u'star wars', u'start',
    u'story', u'sure', u'takes', u'thats', u'theres', u'thing', u'things',
    u'think', u'thought', u'time', u'times', u'trilogy', u'true', u'truly',
    u'trying', u'understand', u'use', u'used', u'violence', u'want', u'war',
    u'wars', u'wasnt', u'watch', u'watched', u'watching', u'way', u'wife',
    u'wonderful', u'work', u'world', u'worth', 'year_tfidf', u'years',
    u'young']

# Robust-scale the continuous tf-idf columns, then rejoin the columns that
# should stay on their original scale.
X_prescale = X[features_to_scale]
X_scaled = RobustScaler().fit_transform(X_prescale)
# fit_transform drops the DataFrame metadata; restore column names and the
# original index so the join below aligns row-for-row.
X_scaled = pd.DataFrame(X_scaled, columns=features_to_scale,
                        index=X_prescale.index)
X_final_scaled = X_scaled.join(X[features_to_not_scale])
X_final_scaled.info()
X.info()

# Train/test split the scaled data.
X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled = \
    train_test_split(X_final_scaled, y, test_size=.2, random_state=31)

# So what is the baseline prediction?
# (prints use single-argument parenthesized form -- identical output under
# Python 2 and Python 3, unlike the original bare `print x` statements)
print(y.mean())
y.value_counts()
# Share of observations whose rating is not 10: majority-class baseline.
# float() guards against Python 2 integer division.
baseline_not10 = (1 - y[y == 10].count() / float(y.count()))

'''There are at least two possibilities I can think of for testing with the Classifier:
'''

'''Just an FYI that age is a vastly different scale than the rest of the
variables. I am showing the plot and considering scaling it.'''
newdata.age.plot(kind='hist', alpha=.3)

# Scale age and fare (see note above on RobustScaler's outlier resistance).
from sklearn.preprocessing import RobustScaler

X_scaled = RobustScaler().fit_transform(X[['age', 'fare']])
X_scaled = pd.DataFrame(X_scaled, columns=['age', 'fare'], index=X.index)

# Join with rest of the data.
X_scaled = X_scaled.join(dummies)
X_scaled = X_scaled.join(X[['sibsp', 'parch']])
# BUG FIX: the original called X.info() here, inspecting the *unscaled*
# frame; the frame just assembled is X_scaled.
X_scaled.info()

# Train/test split on the scaled data, stratified on y.
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(
    X_scaled, y, test_size=.25, stratify=y, random_state=31)

# Grid-search the logistic regression.
# NOTE(review): sklearn.cross_validation is the pre-0.18 API -- see the
# migration caveat; kept to match the environment this notebook ran in.
from sklearn.cross_validation import StratifiedKFold

grid_lr_scaled = GridSearchCV(
    lr, logreg_parameters,
    cv=StratifiedKFold(y_train, n_folds=5, shuffle=True),
    n_jobs=-1, verbose=1)
grid_lr_scaled.fit(X_train_scaled, y_train)
print(grid_lr_scaled.best_estimator_)
print(grid_lr_scaled.best_params_)