def bowFitAndPrediction(predictData, textSeries, outcome,typeModel='binary'):
    """Fit a bag-of-words model on ``predictData`` and return in-sample predictions.

    A vectorizer is obtained from ``getFeatures(textSeries)`` and used to
    transform ``predictData``; the resulting matrix is fitted against
    ``outcome`` and then scored with the same fitted model.

    Args:
        predictData: Text passed to ``vectorizer.transform``.
        textSeries: Text handed to ``getFeatures``; its ``name`` attribute is
            printed as a progress message.
        outcome: Target values used to fit the model.
        typeModel: ``'continuous'`` selects ``Ridge(alpha=0.001)``; any other
            value selects an L2-penalised ``LogisticRegression``.

    Returns:
        Tuple ``(yhat, vectorizer, bowModel)``. ``yhat`` is ``predict`` output
        for the continuous model, otherwise column 1 of ``predict_proba``.
    """
    # Function-call form parses under Python 2 and 3; the one-element tuple
    # keeps the format valid even when the series name is itself a tuple.
    print("Bag of words for %s" % (textSeries.name,))

    is_continuous = typeModel == 'continuous'
    if is_continuous:
        bowModel = Ridge(alpha = 0.001)
    else:
        bowModel = LogisticRegression(penalty='l2',dual=False,tol=0.0001,fit_intercept=True, C=1, intercept_scaling=1, class_weight=None, random_state=423)

    vectorizer = getFeatures(textSeries)
    X_train = vectorizer.transform(predictData)

    #Logistic regression, not sure if best
    bowModel.fit(X_train, outcome)

    #Comment out later, fitting on CV data
    # Predictions below are made on the same matrix the model was fitted on.
    if is_continuous:
        yhat = bowModel.predict(X_train)
    else:
        yhat = bowModel.predict_proba(X_train)[:,1]
    return (yhat, vectorizer, bowModel)
class AbstractLinearEstimator(BaseEstimator):
    """Linear-model estimator: ``LR`` for objective ``"clf"``, ``Ridge`` for ``"reg"``.

    ``self.config``, ``self.logger``, ``self.model_type``, ``self.objective``
    and ``self.models`` are read here but never assigned in this class;
    presumably ``BaseEstimator`` provides them -- TODO confirm.
    """

    def __init__(self, config, logger, model_type, version):
        super().__init__(config, logger, model_type, version)
        # Keyword arguments forwarded verbatim to the model constructor.
        self.params = self.config["parameter"]["linear"]

    def _new_model(self):
        """Build an unfitted model for ``self.objective``.

        Raises:
            ValueError: If the objective is neither ``"clf"`` nor ``"reg"``.
        """
        if self.objective == "clf":
            return LR(**self.params)
        if self.objective == "reg":
            return Ridge(**self.params)
        # Without this guard fit_ would go on to call .fit on a missing or
        # previously fitted model.
        raise ValueError(f"Unsupported objective: {self.objective!r}")

    def fit_(
        self,
        X_train: pd.DataFrame,
        y_train: np.ndarray,
        X_eval: pd.DataFrame = None,
        y_eval: np.ndarray = None,
    ) -> None:
        """Fit a fresh model on the training data and record it.

        Args:
            X_train: Training features; column names are stored in
                ``self.feature_names``.
            y_train: Training targets.
            X_eval: Accepted for interface compatibility; not used here.
            y_eval: Accepted for interface compatibility; not used here.

        Raises:
            ValueError: If ``self.objective`` is neither ``"clf"`` nor ``"reg"``.
        """
        self.logger.info(f"Model Type: {self.model_type}")
        self.logger.info(f"Linear Params: {self.params}")
        self.feature_names = X_train.columns.tolist()

        self.model = self._new_model()
        self.model.fit(X_train, y_train)
        self.models.append(self.model)

    def predict_(self, X: np.ndarray) -> np.ndarray:
        """Predict with the most recently fitted model.

        Returns:
            Column 1 of ``predict_proba`` for ``"clf"``, or the ``predict``
            output for ``"reg"``.

        Raises:
            ValueError: If ``self.objective`` is neither ``"clf"`` nor ``"reg"``
                (previously this fell through and returned ``None``).
        """
        if self.objective == "clf":
            return self.model.predict_proba(X)[:, 1]
        if self.objective == "reg":
            return self.model.predict(X)
        raise ValueError(f"Unsupported objective: {self.objective!r}")
def bagofwords(X_train, X_cross, X_test, X_predict, X_eval, Y_train, Y_cross, Y_predict,variable = 'test', typeModel='binary',name='test'):
    """Add a bag-of-words model prediction column to each of five data sets.

    Two independent pipelines are used:

    * a vectorizer from ``getFeatures(X_train[variable])`` and a model fitted
      on ``(X_train, Y_train)`` score ``X_train``, ``X_cross`` and ``X_test``;
    * a vectorizer from ``getFeatures(X_predict[variable])`` and a model fitted
      on ``(X_predict, Y_predict)`` score ``X_predict`` and ``X_eval``.

    Train and cross scores (r2 for Ridge, ROC AUC for logistic) are printed
    for the first model.

    Args:
        X_train, X_cross, X_test, X_predict, X_eval: Containers indexable by
            ``variable`` that accept assignment to ``name``; they are modified
            in place.
        Y_train, Y_cross, Y_predict: Targets for the matching data sets.
        variable: Key of the text column to vectorize.
        typeModel: ``'continuous'`` selects ``Ridge(alpha=0.001)``; any other
            value selects an L2-penalised ``LogisticRegression``.
        name: Key under which the predictions are stored.

    Returns:
        Tuple ``(X_train, X_cross, X_test, X_predict, X_eval)``, the same
        objects that were passed in, each with the ``name`` column filled.
    """
    X_train_text = X_train[variable]
    X_cross_text = X_cross[variable]
    X_test_text = X_test[variable]
    X_predict_text = X_predict[variable]
    X_eval_text = X_eval[variable]

    # Vectorizer built from the training text, shared by train/cross/test.
    train_vec = getFeatures(X_train_text)
    X_train_text = train_vec.transform(X_train_text)
    X_cross_text = train_vec.transform(X_cross_text)
    X_test_text = train_vec.transform(X_test_text)

    # Separate vectorizer built from the predict text, shared by predict/eval.
    predict_vec = getFeatures(X_predict_text)
    X_predict_text = predict_vec.transform(X_predict_text)
    X_eval_text = predict_vec.transform(X_eval_text)

    if typeModel == 'continuous':
        bowModel = Ridge(alpha = 0.001)
        bowModel2 = Ridge(alpha = 0.001)
        bowModel.fit(X_train_text,Y_train)
        bowModel2.fit(X_predict_text,Y_predict)

        # Predict once and reuse for both scoring and the output column.
        train_pred = bowModel.predict(X_train_text)
        cross_pred = bowModel.predict(X_cross_text)

        inScore = r2_score(Y_train, train_pred)
        print("Train Ridge: r2 score is %f" % (inScore))
        inScore = r2_score(Y_cross, cross_pred)
        print("Cross Ridge: r2 score is %f" % (inScore))

        X_train[name] = train_pred
        X_cross[name] = cross_pred
        X_test[name] = bowModel.predict(X_test_text)
        # Score X_predict on its own matrix, as the logistic branch does.
        # X_test_text holds X_test's rows and was built with train_vec, so it
        # does not belong to bowModel2's feature space.
        X_predict[name] = bowModel2.predict(X_predict_text)
        X_eval[name] = bowModel2.predict(X_eval_text)
    else:
        bowModel = LogisticRegression(penalty='l2',dual=False,tol=0.0001,fit_intercept=True, C=0.0005, intercept_scaling=1, class_weight=None, random_state=423)
        bowModel2 = LogisticRegression(penalty='l2',dual=False,tol=0.0001,fit_intercept=True, C=.0005, intercept_scaling=1, class_weight=None, random_state=423)
        bowModel.fit(X_train_text,Y_train)
        bowModel2.fit(X_predict_text,Y_predict)

        # Column 1 of predict_proba is used as the score throughout.
        train_pred = bowModel.predict_proba(X_train_text)[:,1]
        cross_pred = bowModel.predict_proba(X_cross_text)[:,1]

        inScore = roc_auc_score(Y_train, train_pred)
        print("Train Logistic: Area under auc curve is %f" % (inScore))
        inScore = roc_auc_score(Y_cross, cross_pred)
        print("Cross Logistic: Area under auc curve is %f" % (inScore))

        X_train[name] = train_pred
        X_cross[name] = cross_pred
        X_test[name] = bowModel.predict_proba(X_test_text)[:,1]
        X_predict[name] = bowModel2.predict_proba(X_predict_text)[:,1]
        X_eval[name] = bowModel2.predict_proba(X_eval_text)[:,1]

    return X_train, X_cross, X_test, X_predict, X_eval
# --- Ridge, second fold: fit on half 2, predict half 1 and the test matrix ---
model.fit(X_train_2, y_train_2)
print('[{}] Finished to train ridge (2)'.format(time.time() - start_time))

ridge_preds2 = model.predict(X_train_1)
ridge_preds2f = model.predict(sparse_merge_test)
print('[{}] Finished to predict ridge (2)'.format(time.time() - start_time))

# Out-of-fold vector: predictions for half 1 (from the model fitted on half 2)
# followed by predictions for half 2 (ridge_preds1, computed earlier).
ridge_preds_oof = np.concatenate([ridge_preds2, ridge_preds1])
# Test prediction is the plain average of the two fold models.
ridge_preds_test = (ridge_preds1f + ridge_preds2f) / 2.0

print('RMSLE OOF: {}'.format(rmse(ridge_preds_oof, y_train)))
if not SUBMIT_MODE:
    print('RMSLE TEST: {}'.format(rmse(ridge_preds_test, y_test)))

# --- MultinomialNB, first fold: the target is the boolean mask (y >= 4) ---
model = MultinomialNB(alpha=0.01)
model.fit(X_train_1, y_train_1 >= 4)
print('[{}] Finished to train MNB (1)'.format(time.time() - start_time))

# Column 1 of predict_proba is kept as the score.
mnb_preds1 = model.predict_proba(X_train_2)[:, 1]
mnb_preds1f = model.predict_proba(sparse_merge_test)[:, 1]
print('[{}] Finished to predict MNB (1)'.format(time.time() - start_time))

# --- MultinomialNB, second fold ---
model = MultinomialNB(alpha=0.01)
model.fit(X_train_2, y_train_2 >= 4)
print('[{}] Finished to train MNB (2)'.format(time.time() - start_time))

mnb_preds2 = model.predict_proba(X_train_1)[:, 1]
mnb_preds2f = model.predict_proba(sparse_merge_test)[:, 1]
print('[{}] Finished to predict MNB (2)'.format(time.time() - start_time))

# Same ordering and averaging as the ridge predictions above.
mnb_preds_oof = np.concatenate([mnb_preds2, mnb_preds1])
mnb_preds_test = (mnb_preds1f + mnb_preds2f) / 2.0

# Drop per-fold ridge intermediates that are no longer needed.
del ridge_preds1, ridge_preds1f, ridge_preds2
# Bare expressions below have no effect when run as a script; they look like
# leftovers from interactive inspection.
roc
test.shape
x_train.shape
# Extract features from all text articles in data
# NOTE(review): pt_test is built from p_test before p_test is reassigned
# further down; presumably p_test already exists at this point -- confirm.
pt_test=pd.DataFrame({'p_test':p_test})
pt_test.dtypes
pt_test.head(5)
roc_auc_score(y_test,pt_test)
#chk=pd.concat([ID,p_test],axis=1)
mydata=pd.DataFrame({'Complaint ID':ID, 'Consumer disputed?':pt_test['p_test']})
p_test=ridge.predict(test)
p=ridge.predict_proba(test)
del p
# Raw string: the backslashes are path separators, not escape sequences. The
# runtime value is unchanged; it just no longer relies on invalid escapes
# being passed through.
mydata.to_csv(r'D:\mohit gate\edvancer python\project 1\Lassoproject1prediction.csv')

# Candidate probability cutoffs between 0.35 and 0.50 (np.linspace default count).
cutoffs=np.linspace(0.35,0.50)
# predict_proba returns one column per class, which the [:,1] selection needs;
# predict() returns a 1-D array for a single target and cannot be indexed so.
train_score=ridge.predict_proba(x_test)[:,1]
real=y_test
KS_all=[]
for cutoff in cutoffs:
    # Binarise the scores at this cutoff, then count confusion-matrix cells.
    predicted=(train_score>cutoff).astype(int)
    TP=((predicted==1)&(real==1)).sum()
    TN=((predicted==0)&(real==0)).sum()
    FP=((predicted==1)&(real==0)).sum()