Beispiel #1
0
def bowFitAndPrediction(predictData, textSeries, outcome, typeModel='binary'):
    """Fit a bag-of-words linear model and return its in-sample predictions.

    A vectorizer is obtained from ``getFeatures(textSeries)``, used to
    transform ``predictData``, and a linear model is fitted on the
    transformed matrix against ``outcome``.

    Args:
        predictData: Documents handed to ``vectorizer.transform``; the model
            is both fitted on and scored against these rows.
        textSeries: Text handed to ``getFeatures`` to build the vectorizer.
            Its ``name`` attribute is printed for progress reporting.
        outcome: Targets used to fit the model.
        typeModel: ``'continuous'`` selects ``Ridge``; any other value
            selects ``LogisticRegression``.

    Returns:
        A tuple ``(yhat, vectorizer, bowModel)``. ``yhat`` is the output of
        ``predict`` for the continuous model, or column 1 of
        ``predict_proba`` otherwise.
    """
    # Parenthesized single-argument print emits the same text on Python 2
    # and Python 3 (the bare print statement is a SyntaxError on Python 3).
    print("Bag of words for %s" % (textSeries.name))

    if typeModel == 'continuous':
        bowModel = Ridge(alpha=0.001)
    else:
        bowModel = LogisticRegression(
            penalty='l2',
            dual=False,
            tol=0.0001,
            fit_intercept=True,
            C=1,
            intercept_scaling=1,
            class_weight=None,
            random_state=423,
        )

    vectorizer = getFeatures(textSeries)
    X_train = vectorizer.transform(predictData)

    # Outcomes
    Y_train = outcome

    bowModel.fit(X_train, Y_train)

    # NOTE: predictions are made on the same matrix the model was fitted on,
    # so they are in-sample (the original author flagged this as temporary).
    if typeModel == 'continuous':
        yhat = bowModel.predict(X_train)
    else:
        # Column 1 of predict_proba is kept as the score.
        yhat = bowModel.predict_proba(X_train)[:, 1]

    return (yhat, vectorizer, bowModel)
class AbstractLinearEstimator(BaseEstimator):
    """Estimator that fits ``LR`` or ``Ridge`` depending on ``self.objective``.

    NOTE(review): ``self.config``, ``self.logger``, ``self.model_type``,
    ``self.objective`` and ``self.models`` are read here but not assigned in
    this class; presumably ``BaseEstimator`` provides them -- confirm.
    """

    def __init__(self, config, logger, model_type, version):
        """Initialise the base estimator and read the linear-model parameters.

        The parameters are taken from ``config["parameter"]["linear"]`` and
        are later unpacked as keyword arguments into the model constructor.
        """
        super().__init__(config, logger, model_type, version)
        self.params = self.config["parameter"]["linear"]

    def fit_(
        self,
        X_train: pd.DataFrame,
        y_train: np.array,
        X_eval: pd.DataFrame = None,
        y_eval: np.array = None,
    ) -> None:
        """Build a fresh model for the current objective and fit it.

        Args:
            X_train: Training features; its column names are stored in
                ``self.feature_names``.
            y_train: Training targets.
            X_eval: Accepted for interface compatibility; not used here.
            y_eval: Accepted for interface compatibility; not used here.

        Raises:
            ValueError: If ``self.objective`` is neither ``"clf"`` nor
                ``"reg"``.
        """
        self.logger.info(f"Model Type: {self.model_type}")
        self.logger.info(f"Linear Params: {self.params}")
        self.feature_names = X_train.columns.tolist()
        if self.objective == "clf":
            self.model = LR(**self.params)
        elif self.objective == "reg":
            self.model = Ridge(**self.params)
        else:
            # Without this guard an unknown objective would either re-fit a
            # stale model from a previous call or fail on a missing attribute.
            raise ValueError(f"Unsupported objective: {self.objective!r}")
        self.model.fit(X_train, y_train)
        self.models.append(self.model)

    def predict_(self, X: np.array) -> np.array:
        """Return predictions from the most recently fitted model.

        For ``"clf"`` this is column 1 of ``predict_proba``; for ``"reg"``
        it is the output of ``predict``.

        Raises:
            ValueError: If ``self.objective`` is neither ``"clf"`` nor
                ``"reg"`` (previously this silently returned ``None``).
        """
        if self.objective == "clf":
            return self.model.predict_proba(X)[:, 1]
        if self.objective == "reg":
            return self.model.predict(X)
        raise ValueError(f"Unsupported objective: {self.objective!r}")
Beispiel #3
0
def bagofwords(X_train, X_cross, X_test, X_predict, X_eval, Y_train, Y_cross, Y_predict,
               variable='test', typeModel='binary', name='test'):
    """Add a bag-of-words model score column to each of the given frames.

    Two independent pipelines are used:

    * a vectorizer from ``getFeatures(X_train[variable])`` plus a model
      fitted on the train rows scores ``X_train``, ``X_cross`` and ``X_test``;
    * a vectorizer from ``getFeatures(X_predict[variable])`` plus a second
      model fitted on the predict rows scores ``X_predict`` and ``X_eval``.

    Train and cross scores (r2 for the continuous model, ROC AUC otherwise)
    are printed.

    Args:
        X_train, X_cross, X_test, X_predict, X_eval: Frames indexable by
            ``variable``; each one gets a new ``name`` column assigned in
            place.
        Y_train: Targets for the first model.
        Y_cross: Targets used only for the printed cross score.
        Y_predict: Targets for the second model.
        variable: Key of the text column to vectorize.
        typeModel: ``'continuous'`` selects ``Ridge``; any other value
            selects ``LogisticRegression``.
        name: Key under which the scores are stored.

    Returns:
        The tuple ``(X_train, X_cross, X_test, X_predict, X_eval)`` with the
        new column added to each element.
    """
    X_train_text = X_train[variable]
    X_cross_text = X_cross[variable]
    X_test_text = X_test[variable]
    X_predict_text = X_predict[variable]
    X_eval_text = X_eval[variable]

    # Features for the first model come from the training text only.
    train_vec = getFeatures(X_train_text)
    X_train_text = train_vec.transform(X_train_text)
    X_cross_text = train_vec.transform(X_cross_text)
    X_test_text = train_vec.transform(X_test_text)

    # The second model gets its own vectorizer built from the predict text.
    predict_vec = getFeatures(X_predict_text)
    X_predict_text = predict_vec.transform(X_predict_text)
    X_eval_text = predict_vec.transform(X_eval_text)

    if typeModel == 'continuous':
        bowModel = Ridge(alpha=0.001)
        bowModel2 = Ridge(alpha=0.001)
        bowModel.fit(X_train_text, Y_train)
        bowModel2.fit(X_predict_text, Y_predict)

        inScore = r2_score(Y_train, bowModel.predict(X_train_text))
        print("Train Ridge: r2 score is %f" % (inScore))

        inScore = r2_score(Y_cross, bowModel.predict(X_cross_text))
        print("Cross Ridge: r2 score is %f" % (inScore))

        X_train[name] = bowModel.predict(X_train_text)
        X_cross[name] = bowModel.predict(X_cross_text)
        X_test[name] = bowModel.predict(X_test_text)
        # Fixed: bowModel2 was previously applied to X_test_text, which was
        # vectorized with train_vec and has X_test's rows, not X_predict's.
        # This now matches the logistic branch below.
        X_predict[name] = bowModel2.predict(X_predict_text)
        X_eval[name] = bowModel2.predict(X_eval_text)

    else:
        bowModel = LogisticRegression(penalty='l2', dual=False, tol=0.0001, fit_intercept=True,
                                      C=0.0005, intercept_scaling=1, class_weight=None,
                                      random_state=423)
        bowModel2 = LogisticRegression(penalty='l2', dual=False, tol=0.0001, fit_intercept=True,
                                       C=.0005, intercept_scaling=1, class_weight=None,
                                       random_state=423)
        bowModel.fit(X_train_text, Y_train)
        bowModel2.fit(X_predict_text, Y_predict)

        inScore = roc_auc_score(Y_train, bowModel.predict_proba(X_train_text)[:, 1])
        print("Train Logistic: Area under auc curve is %f" % (inScore))

        inScore = roc_auc_score(Y_cross, bowModel.predict_proba(X_cross_text)[:, 1])
        print("Cross Logistic: Area under auc curve is %f" % (inScore))

        # Column 1 of predict_proba is stored as the score.
        X_train[name] = bowModel.predict_proba(X_train_text)[:, 1]
        X_cross[name] = bowModel.predict_proba(X_cross_text)[:, 1]
        X_test[name] = bowModel.predict_proba(X_test_text)[:, 1]
        X_predict[name] = bowModel2.predict_proba(X_predict_text)[:, 1]
        X_eval[name] = bowModel2.predict_proba(X_eval_text)[:, 1]

    return X_train, X_cross, X_test, X_predict, X_eval
Beispiel #4
0
# Ridge, second fold: fit on half 2, then predict half 1 and the test matrix.
model.fit(X_train_2, y_train_2)
print(f'[{time.time() - start_time}] Finished to train ridge (2)')
ridge_preds2 = model.predict(X_train_1)
ridge_preds2f = model.predict(sparse_merge_test)
print(f'[{time.time() - start_time}] Finished to predict ridge (2)')

# Out-of-fold predictions are stacked half-1 first, then half-2; the test
# prediction is the plain average of the two fold models.
ridge_preds_oof = np.concatenate([ridge_preds2, ridge_preds1], axis=0)
ridge_preds_test = (ridge_preds1f + ridge_preds2f) / 2.0
print(f'RMSLE OOF: {rmse(ridge_preds_oof, y_train)}')
if not SUBMIT_MODE:
    print(f'RMSLE TEST: {rmse(ridge_preds_test, y_test)}')


# Multinomial NB, first fold: the target is binarised as (y >= 4) and the
# column-1 probability is kept.
model = MultinomialNB(alpha=0.01)
model.fit(X_train_1, y_train_1 >= 4)
print(f'[{time.time() - start_time}] Finished to train MNB (1)')
mnb_preds1 = model.predict_proba(X_train_2)[:, 1]
mnb_preds1f = model.predict_proba(sparse_merge_test)[:, 1]
print(f'[{time.time() - start_time}] Finished to predict MNB (1)')

# Multinomial NB, second fold: same recipe with the halves swapped.
model = MultinomialNB(alpha=0.01)
model.fit(X_train_2, y_train_2 >= 4)
print(f'[{time.time() - start_time}] Finished to train MNB (2)')
mnb_preds2 = model.predict_proba(X_train_1)[:, 1]
mnb_preds2f = model.predict_proba(sparse_merge_test)[:, 1]
print(f'[{time.time() - start_time}] Finished to predict MNB (2)')
mnb_preds_oof = np.concatenate([mnb_preds2, mnb_preds1], axis=0)
mnb_preds_test = (mnb_preds1f + mnb_preds2f) / 2.0


# Drop the per-fold ridge arrays that are no longer needed.
del ridge_preds1, ridge_preds1f, ridge_preds2
Beispiel #5
0
# Notebook-style scratch cells. Statements appear in the order they were
# captured, which is not necessarily a valid top-to-bottom execution order:
# `p_test` is read when building `pt_test` but only assigned further down.
roc

test.shape
x_train.shape
# Extract features from all text articles in data

# Wrap the test predictions in a one-column frame and score them.
pt_test=pd.DataFrame({'p_test':p_test})
pt_test.dtypes
pt_test.head(5)
roc_auc_score(y_test,pt_test)

#chk=pd.concat([ID,p_test],axis=1)
# Submission frame: one prediction per complaint ID.
mydata=pd.DataFrame({'Complaint ID':ID,
                     'Consumer disputed?':pt_test['p_test']})
p_test=ridge.predict(test)
# NOTE(review): `predict_proba` exists only on classifier-style estimators;
# confirm what kind of model `ridge` actually is.
p=ridge.predict_proba(test)
# Fixed: this line had a stray leading space, an IndentationError at module level.
del p
# Raw string keeps the backslashes literal; the path value is unchanged.
mydata.to_csv(r'D:\mohit gate\edvancer python\project 1\Lassoproject1prediction.csv')


# Candidate probability cutoffs to sweep (np.linspace default sample count).
cutoffs=np.linspace(0.35,0.50)
# NOTE(review): `[:,1]` needs a 2-D result; if `predict` returns a 1-D array
# this raises, and `predict_proba` may have been intended -- confirm. The
# scores also come from `x_test` despite the `train_score` name.
train_score=ridge.predict(x_test)[:,1]
real=y_test

KS_all=[]
for cutoff in cutoffs:
    predicted=(train_score>cutoff).astype(int)
    
    TP=((predicted==1)&(real==1)).sum()
    TN=((predicted==0)&(real==0)).sum()
    FP=((predicted==1)&(real==0)).sum()