Example #1
def setTrainDataAndMakeModel(X_train,Y_train,X_test):
    clf = MultinomialNB(alpha=125535, class_prior=None, fit_prior=True)
    calibrated_clf = CalibratedClassifierCV(clf, method='isotonic', cv=5)
    calibrated_clf.fit(X_train, Y_train)
    ypreds = calibrated_clf.predict_proba(X_test)    
    return ypreds
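A minimal, hypothetical invocation of the helper above, using synthetic non-negative count features (MultinomialNB expects non-negative inputs); the imports are assumptions about what the original module already provides:

import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.calibration import CalibratedClassifierCV

rng = np.random.RandomState(0)
X_train = rng.randint(0, 5, size=(200, 20))   # synthetic count features
Y_train = rng.randint(0, 2, size=200)
X_test = rng.randint(0, 5, size=(10, 20))

probs = setTrainDataAndMakeModel(X_train, Y_train, X_test)
print(probs.shape)  # (10, n_classes)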
    
Example #2
    def move_bias(self, data_matrix, estimator=None, nu=.5, cv=2):
        '''
            Move the bias until a fraction nu of data_matrix falls in the negative class,
            then use scikit-learn's calibration to calibrate self.estimator around the input.
        '''
        #  move bias
        # l = [(estimator.decision_function(g)[0], g) for g in data_matrix]
        # l.sort(key=lambda x: x[0])
        # element = int(len(l) * nu)
        # estimator.intercept_ -= l[element][0]

        scores = [estimator.decision_function(sparse_vector)[0]
                  for sparse_vector in data_matrix]
        scores_sorted = sorted(scores)
        pivot = scores_sorted[int(len(scores_sorted) * self.nu)]
        estimator.intercept_ -= pivot

        # calibrate
        if self.move_bias_recalibrate:
            # data_matrix_binary = vstack([a[1] for a in l])
            # data_y = numpy.asarray([0] * element + [1] * (len(l) - element))
            data_y = numpy.asarray([1 if score >= pivot else -1 for score in scores])
            self.testimator = SGDClassifier(loss='log')
            self.testimator.fit(data_matrix, data_y)
            # estimator = CalibratedClassifierCV(estimator, cv=cv, method='sigmoid')
            estimator = CalibratedClassifierCV(self.testimator, cv=cv, method='sigmoid')
            estimator.fit(data_matrix, data_y)
        return estimator
Example #3
def simple_model(data, test):
    targets = data.target
    X, tX, y, ty = train_test_split(data.drop("target", axis=1), 
                                              targets, 
                                              test_size=0.2,
                                              random_state=2016)
                                              
    
    predictions = []
    
    print("\n\nTraining")
    # Sklearn RandomForest (calibrated)
    clf = RandomForestClassifier(n_estimators=2500,  
                                 max_depth=2,
                                 random_state=2015)
                                     
    cal = CalibratedClassifierCV(clf, cv=5, method="isotonic")
    cal.fit(X,y)
    
    pred = cal.predict_proba(tX)[:,1]
    print("\n\tValidation for Calibrated RFC")
    print("\t", log_loss(ty, pred))
    print("\t", roc_auc_score(ty, pred))
    
    # ens["gbm"] = pred
    predictions.append(cal.predict_proba(test)[:,1])
    
    predictions = sum(predictions)/len(predictions)
    
    return predictions
Example #4
def calibrate_probs(y_val, prob_val, prob_test, n_folds=2, method='isotonic', random_state=5968):
    """ Calling from R:

        suppressMessages(library("rPython")) # Load RPython
        python.load("path/to/util_rpython.py")

        data.pred.calib <- python.call('calibrate_probs',
                                   y_val=y_val, # Actual values from validation
                                   prob_val=pred_val, # Predicted values from validation
                                   prob_test=pred_test) # Predicted values from test

        # data.pred.calib will be a list, so to get the calibrated predictions for each value we do:
        calib_pred_val = data.pred.calib$val
        calib_pred_test = data.pred.calib$test

    """

    y_val = np.asarray(y_val, dtype=float)
    prob_val = np.asarray(prob_val, dtype=float).reshape((-1, 1))
    prob_test = np.asarray(prob_test, dtype=float).reshape((-1, 1))

    prob_clb_val = np.zeros(len(y_val))
    prob_clb_test = np.zeros(len(prob_test))

    kf_val_full = KFold(len(y_val), n_folds=n_folds, random_state=random_state)

    for ix_train, ix_test in kf_val_full:
        kf_val_inner = KFold(len(ix_train), n_folds=n_folds, random_state=random_state)
        clf = CalibratedClassifierCV(method=method, cv=kf_val_inner)
        clf.fit(prob_val[ix_train], y_val[ix_train])
        prob_clb_val[ix_test] = clf.predict_proba(prob_val[ix_test])[:, 1]
        prob_clb_test += clf.predict_proba(prob_test)[:, 1]/n_folds

    return {'val': list(prob_clb_val), 'test': list(prob_clb_test)}
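A direct Python-side call might look like the sketch below (hypothetical data; note the function relies on the legacy sklearn.cross_validation KFold signature used above):

import numpy as np

rng = np.random.RandomState(0)
y_val = rng.randint(0, 2, size=500).tolist()      # true validation labels
prob_val = rng.uniform(size=500).tolist()         # uncalibrated validation probabilities
prob_test = rng.uniform(size=300).tolist()        # uncalibrated test probabilities

out = calibrate_probs(y_val, prob_val, prob_test, n_folds=2, method='isotonic')
calibrated_val, calibrated_test = out['val'], out['test']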
Example #5
def setTrainTestDataAndCheckModel(X_train,Y_train,X_test,Y_test):
    model = RandomForestClassifier(125)
    model.fit(X_train,Y_train)
    '''
    clf = GridSearchCV(model,{'n_estimators':[100,125,150]},verbose=1)
    
    clf.fit(X_train,Y_train)
    print(clf.best_score_)
    print(clf.best_params_)    
    
    output = model.predict(X_test)
    print "-------------------RFC-----------------------"
    #print accuracy_score(Y_test,output)
    #print "%.2f" % log_loss(Y_test,output, eps=1e-15, normalize=True)
    
    ypreds = model.predict_proba(X_test)
    print "%.2f" % log_loss(Y_test,ypreds, eps=1e-15, normalize=True)

    
    clfbag = BaggingClassifier(model, n_estimators=5)
    clfbag.fit(X_train, Y_train)
    ypreds = clfbag.predict(X_test)    
    #print accuracy_score(Y_test,ypreds)    
    
    ypreds = clfbag.predict_proba(X_test)
    print "%.2f" % log_loss(Y_test,ypreds, eps=1e-15, normalize=True)
    '''
    calibrated_clf = CalibratedClassifierCV(model, method='isotonic', cv=5)
    calibrated_clf.fit(X_train, Y_train)
    #ypreds = calibrated_clf.predict(X_test)
    #print accuracy_score(Y_test,ypreds)
    
    ypreds = calibrated_clf.predict_proba(X_test)
    print("%.2f" % log_loss(Y_test, ypreds, eps=1e-15, normalize=True))
Example #6
def test_sample_weight_warning():
    n_samples = 100
    X, y = make_classification(n_samples=2 * n_samples, n_features=6,
                               random_state=42)

    sample_weight = np.random.RandomState(seed=42).uniform(size=len(y))
    X_train, y_train, sw_train = \
        X[:n_samples], y[:n_samples], sample_weight[:n_samples]
    X_test = X[n_samples:]

    for method in ['sigmoid', 'isotonic']:
        base_estimator = LinearSVC(random_state=42)
        calibrated_clf = CalibratedClassifierCV(base_estimator, method=method)
        # LinearSVC does not currently support sample weights but they
        # can still be used for the calibration step (with a warning)
        msg = "LinearSVC does not support sample_weight."
        assert_warns_message(
            UserWarning, msg,
            calibrated_clf.fit, X_train, y_train, sample_weight=sw_train)
        probs_with_sw = calibrated_clf.predict_proba(X_test)

        # As the weights are used for the calibration, they should still yield
        # different predictions
        calibrated_clf.fit(X_train, y_train)
        probs_without_sw = calibrated_clf.predict_proba(X_test)

        diff = np.linalg.norm(probs_with_sw - probs_without_sw)
        assert_greater(diff, 0.1)
Example #7
def svm_boost_calib_scale(x,y,x_test,seed):
    # normalize x+x_test
    x_rows = x.shape[0]
    X = preprocessing.scale(np.vstack((x,x_test)))
    x = X[:x_rows,:]
    x_test = X[x_rows:, :]
    print x.shape
    print x_test.shape

    model = SVC(probability=True, class_weight='auto', random_state=seed,
        C= 100,gamma=0.0)
    boosted = AdaBoostClassifier(model, random_state=seed)
    # avg CV AUC PLS
    cv = StratifiedKFold(y, n_folds=10, random_state=seed)
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    calib = CalibratedClassifierCV(boosted, cv=10, method='isotonic')
    for i, (train, test) in enumerate(cv):
        probas_ = calib.fit(x[train], y[train]).predict_proba(x[test])
        # Compute ROC curve and area the curve
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
    mean_tpr /= len(cv)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    print('Training set 10CV AUC:\n{}'.format(mean_auc))
    # return probs
    #model = SVC(probability=True, random_state=seed)
    #model = model.fit(x, y)
    probs = np.average([cls.predict_proba(x_test) for cls in calib.calibrated_classifiers_], axis=0)
    print probs.shape
    #print('Training set acc:\n{}'.format(model2.score(x, y)))
    #bids_test_probs = model2.predict_proba(x_test)
    return probs
Example #8
def calibrate(X_val, y_val, estimator):

    clf = CalibratedClassifierCV(base_estimator=estimator, 
                                method='isotonic', cv='prefit')

    clf.fit(X_val, y_val)
    return clf
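Typical usage of this prefit-style helper might look like the following sketch (data, estimator and imports are illustrative assumptions; the base estimator must already be fitted when cv='prefit' is used):

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=1000, random_state=0)
X_fit, X_val, y_fit, y_val = train_test_split(X, y, test_size=0.3, random_state=0)

rf = RandomForestClassifier(n_estimators=100, random_state=0)
rf.fit(X_fit, y_fit)                       # fit the base estimator first

calibrated = calibrate(X_val, y_val, rf)   # calibrate on the held-out set
probs = calibrated.predict_proba(X_val)[:, 1]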
Example #9
    def predict(self, X, thres=0.5, return_proba=True):
        """

        Predict class for X.
        The predicted class of an input sample is a vote by the trees in
        the forest, weighted by their probability estimates. That is,
        the predicted class is the one with highest mean probability
        estimate across the trees.

        """

        if self._model == 'svc_lin':
            from sklearn.base import clone
            from sklearn.calibration import CalibratedClassifierCV
            clf = CalibratedClassifierCV(clone(self._estimator).set_params(
                **self._estimator.get_params()))
            train_y = self._Xtrain[[self._rate_column]].values.ravel().tolist()
            self._estimator = clf.fit(self._Xtrain, train_y)

        proba = np.array(self._estimator.predict_proba(X))

        if proba.shape[1] > 2:
            pred = (proba > thres).astype(int)
        else:
            pred = (proba[:, 1] > thres).astype(int)

        if return_proba:
            return proba, pred

        return pred
Example #10
    def internal_processing(self, X, y, X_test):
        """
        """  
        Xs = np.hsplit(X, 5)
        Xts = np.hsplit(X_test, 5)
        Xts_cal = []
        
        for i in range(len(Xs)):           
            Xts_cal.append(calibrate(Xs[i], y, Xts[i]))
         
        XX_test = np.hstack(Xts_cal)   
        
        ec = EC(n_preds=5)
        ec.fit(X, y)
        y_ens = ec.predict_proba(XX_test)
#        y_pred = ec.predict_proba(X_test)
        
        #validation
        yv = ec.predict_proba(X)
        print 'Weights: %s' %(ec.w)
        print 'Validation log-loss: %s' %(logloss_mc(y, yv))
        
        cc = CalibratedClassifierCV(base_estimator=EC(n_preds=5), 
                                    method='isotonic', cv=10)
                                    
        cc.fit(X, y)
        y_cal = cc.predict_proba(XX_test)
        
        y_pred = (y_ens + y_cal)/2.
         
        return y_pred       
Example #11
def svm_calib(x, y, x_test, seed):
    model = SVC(probability=True, random_state=seed)
    # avg CV AUC PLS
    cv = StratifiedKFold(y, n_folds=10, random_state=seed)
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    calib = CalibratedClassifierCV(model, cv=10, method='isotonic')
    for i, (train, test) in enumerate(cv):
        probas_ = calib.fit(x[train], y[train]).predict_proba(x[test])
        # Compute ROC curve and area the curve
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
    mean_tpr /= len(cv)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    print('Training set 10CV AUC:\n{}'.format(mean_auc))
    # return probs
    #model = SVC(probability=True, random_state=seed)
    #model = model.fit(x, y)
    probs = np.average([cls.predict_proba(x_test) for cls in calib.calibrated_classifiers_], axis=0)
    print probs.shape
    #print('Training set acc:\n{}'.format(model2.score(x, y)))
    #bids_test_probs = model2.predict_proba(x_test)
    return probs
Example #12
    def get_score(self, params):
        params['n_estimators'] = int(params['n_estimators'])
        params['max_depth'] = int(params['max_depth'])
        params['min_samples_split'] = int(params['min_samples_split'])
        params['min_samples_leaf'] = int(params['min_samples_leaf'])
        params['n_estimators'] = int(params['n_estimators'])

        print('Training with params:')
        print(params)

        # cross validation here
        scores = []
        for train_ix, test_ix in makeKFold(5, self.y, 1):
            X_train, y_train = self.X[train_ix, :], self.y[train_ix]
            X_test, y_test = self.X[test_ix, :], self.y[test_ix]
            weight = y_train.shape[0] / (2 * np.bincount(y_train))
            sample_weight = np.array([weight[i] for i in y_train])

            clf = RandomForestClassifier(**params)
            cclf = CalibratedClassifierCV(base_estimator=clf,
                                          method='isotonic',
                                          cv=makeKFold(3, y_train, 1))
            cclf.fit(X_train, y_train, sample_weight)
            pred = cclf.predict(X_test)
            scores.append(f1_score(y_true=y_test, y_pred=pred))

        print(scores)
        score = np.mean(scores)

        print(score)
        return {'loss': -score, 'status': STATUS_OK}
Example #13
def train_model_rfc_calibrated(features, labels):
	# First, set aside some of the training set for calibration
	# Use stratified shuffle split so that class ratios are maintained after the split
	splitter = StratifiedShuffleSplit(labels, n_iter = 1, train_size = 0.7, random_state = 30)

	# Length is 1 in this case since we have a single fold for splitting
	print (len(splitter))

	for train_idx, calib_idx in splitter:
		features_train, features_calib = features[train_idx], features[calib_idx]
		labels_train, labels_calib = labels[train_idx], labels[calib_idx]

	print ("features_train shape: ", features_train.shape)
	print ("features_calib shape: ", features_calib.shape)
	print ("labels_train shape: ", labels_train.shape)
	print ("labels_calib shape: ", labels_calib.shape)
		
	print ("Performing Grid Search ...")
	# params_dict = {'criterion': ['entropy'], 'n_estimators':[30, 35, 40, 45], 'max_depth':[5, 6], 'min_samples_leaf': [1, 2, 5], 'min_samples_split': [2, 5, 10]}
	params_dict = {'criterion': ['entropy'], 'n_estimators':[60, 70, 80, 90], 'max_depth':[5, 6], 'min_samples_leaf': [1, 2, 5], 'min_samples_split': [2, 5, 10], 'max_features' : [6, 7, 8]}
	clf = GridSearchCV(rfc(random_state = 30, n_jobs = 4), params_dict, scoring = 'roc_auc', cv = 5)
	clf.fit(features_train, labels_train)

	print ("Best estimator: ", clf.best_estimator_)
	print ("Best score: %.4f" % (clf.best_score_))
	# print ("Best grid scores: ", clf.grid_scores_)

	# Perform calibration 
	# Use 'sigmoid' because sklearn cautions against using 'isotonic' with fewer than 1000 calibration samples, as it can result in overfitting
	print ("Performing Calibration now ...")
	sigmoid = CalibratedClassifierCV(clf, cv='prefit', method='sigmoid')
	sigmoid.fit(features_calib, labels_calib)
	return sigmoid
Example #14
def svc_test2():
    """
    Submission:
    E_val:
    E_in:
    E_out:
    """
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVC
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.calibration import CalibratedClassifierCV

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    svc = SVC(kernel='linear', class_weight='auto', cache_size=10240)
    svc.fit(X_scaled, y)

    isotonic = CalibratedClassifierCV(svc, cv=StratifiedKFold(y, 5),
                                      method='isotonic')
    isotonic.fit(X_scaled, y)

    logger.debug('Got best isotonic CalibratedClassifier.')
    logger.debug('E_in (isotonic): %f', Util.auc_score(isotonic, X_scaled, y))
Example #15
def train(model_id,train_x,train_y,valid_x,valid_y,test_x):
    train_x,train_y=shuffle(train_x,train_y)


    random_state=random.randint(0, 1000000)
    print('random state: {state}'.format(state=random_state))

    clf = RandomForestClassifier(bootstrap=False, class_weight=None,
            criterion='entropy', max_depth=29008, max_features=36,
            max_leaf_nodes=None, min_samples_leaf=5, min_samples_split=3,
            min_weight_fraction_leaf=0.0, n_estimators=4494, n_jobs=8,
            oob_score=False, random_state=979271, verbose=0,
            warm_start=False)

    clf.fit(train_x, train_y)

    ccv = CalibratedClassifierCV(base_estimator=clf,method="sigmoid",cv="prefit")
    ccv.fit(valid_x,valid_y)

    valid_predictions = ccv.predict_proba(valid_x)
    test_predictions= ccv.predict_proba(test_x)

    loss = test(valid_y,valid_predictions,True)
    if  loss<0.52:
        data.saveData(valid_predictions,"../valid_results/valid_"+str(model_id)+".csv")
        data.saveData(test_predictions,"../results/results_"+str(model_id)+".csv")
Example #16
def main():
    X, Y, encoder, scale = load_train_data('train.csv')
    estimators = 500
    X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, test_size=0.2, random_state=0)
    X_train_real, X_test_real, Y_train_real, Y_test_real = train_test_split(X_train, Y_train, test_size=0.2, random_state=42)
    log.info('Loaded training file')
    X_test, _ = load_csv_file('test.csv', cut_end=False)
    log.info('Loaded test file')

    #Classifier Setup
    tree_clf = ExtraTreesClassifier(n_estimators=estimators, n_jobs=-1,
                                    random_state=42, max_depth=55, min_samples_split=1)

    clf = make_pipeline(TfidfTransformer(), DenseTransformer(), tree_clf)
    log.info('Fitting ExtraTrees pipeline')
    clf.fit(X_train_real, Y_train_real)
    clf_probs = clf.predict_proba(X_test_real)
    score = log_loss(Y_test_real, clf_probs)
    log.info('Log Loss score un-trained = %f' % score)
    # Calibrate Classifier using ground truth in X,Y_valid
    sig_clf = CalibratedClassifierCV(clf, method="isotonic", cv="prefit")
    log.info('Fitting CalibratedClassifierCV')
    sig_clf.fit(X_valid, Y_valid)
    sig_clf_probs = sig_clf.predict_proba(X_test_real)
    sig_score = log_loss(Y_test_real, sig_clf_probs)
    log.info('Log loss score trained = %f' % sig_score)

    # Ok lets predict the test data with our funky new classifier
    sig_submission_probs = sig_clf.predict_proba(X_test)

    write_out_submission(sig_submission_probs, 'submission.csv')
Example #17
 def prepare_model(self, obj_fn=None, num_steps=None, model_params=None, batch_size: int = None):
     model = CalibratedClassifierCV(KNeighborsClassifier(**model_params), method="sigmoid")
     model_clf = model.fit(self.ds[self.data_groups["data_train_group"]].to_ndarray(),
                           self.ds[self.data_groups["target_train_group"]].to_ndarray())
     cal_model = CalibratedClassifierCV(model_clf, method="sigmoid", cv="prefit")
     cal_model.fit(self.ds[self.data_groups["data_validation_group"]].to_ndarray(),
                   self.ds[self.data_groups["target_validation_group"]].to_ndarray())
     return self.ml_model(cal_model)
Example #18
    def train_test(self, X, y, X_test):
        """
        """
        sss = StratifiedShuffleSplit(y, 1, test_size=0.5)    
        for train_id, valid_id in sss:
            X0, X1 = X[train_id], X[valid_id]
            y0, y1 = y[train_id], y[valid_id]  
            
        #First half
        
        w0 = np.zeros(len(y0))
        for i in range(len(w0)):
            w0[i] = self.w[int(y0[i])]
        xg0_train = DMatrix(X0, label=y0, weight=w0)  
        xg0_test = DMatrix(X1, label=y1)   
        xgt_test = DMatrix(X_test)
        bst0 = my_train_xgboost(self.param, xg0_train, self.num_round)
        y0_pred = bst0.predict(xg0_test).reshape(X1.shape[0], 9)
        yt_pred = bst0.predict(xgt_test).reshape(X_test.shape[0], 9)
        
        #Calibrated RF
        rf = RandomForestClassifier(n_estimators=600, criterion='gini', 
                class_weight='auto', max_features='auto')
        cal = CalibratedClassifierCV(rf, method='isotonic', cv=3)
        cal.fit(X0, y0)
        y0_cal = cal.predict_proba(X1)
        yt_cal = cal.predict_proba(X_test)
        
        #Second half
        ss = StandardScaler()
        y0_pred = ss.fit_transform(y0_pred)
        yt_pred = ss.fit_transform(yt_pred)
        y0_cal = ss.fit_transform(y0_cal)
        yt_cal = ss.fit_transform(yt_cal)
        X1 = np.hstack((X1, y0_pred, y0_cal))
        X_test = np.hstack((X_test, yt_pred, yt_cal))  
        w1 = np.zeros(len(y1))
        
#        self.param['eta'] = 0.01
        self.num_round = 450

        for i in range(len(w1)):
            w1[i] = self.w[int(y1[i])]
        xg1_train = DMatrix(X1, label=y1, weight=w1)    
        xg_test= DMatrix(X_test)
        bst1 = my_train_xgboost(self.param, xg1_train, self.num_round)
        y_pred = bst1.predict(xg_test).reshape(X_test.shape[0], 9)
        
        return y_pred
Example #19
def get_model(params, X, y):
    clf = RandomForestClassifier(**params)
    cclf = CalibratedClassifierCV(base_estimator=clf,
                                  method='isotonic',
                                  cv=makeKFold(3, y, 1))
    weight = y.shape[0] / (2 * np.bincount(y))
    sample_weight = np.array([weight[i] for i in y])
    cclf.fit(X, y, sample_weight)
    return cclf
Example #20
    def train_validate(self, X_train, y_train, X_valid, y_valid):
        """
        """
        sss = StratifiedShuffleSplit(y_train, 1, test_size=0.5)    
        for train_id, valid_id in sss:
            X0_train, X1_train = X_train[train_id], X_train[valid_id]
            y0_train, y1_train = y_train[train_id], y_train[valid_id]  
            
        #First half
       
        w0_train = np.zeros(len(y0_train))
        for i in range(len(w0_train)):
            w0_train[i] = self.w[int(y0_train[i])]
        xg0_train = DMatrix(X0_train, label=y0_train, weight=w0_train)  
        xg0_valid = DMatrix(X1_train, label=y1_train)   
        xgv_valid = DMatrix(X_valid, label=y_valid)
        watchlist = [(xg0_train,'train'), (xg0_valid, 'validation0')]
        
#        bst0 = train(self.param, xg0_train, self.num_round, watchlist)
        bst0 = my_train_xgboost(self.param, xg0_train, self.num_round, watchlist)
        y0_pred = bst0.predict(xg0_valid).reshape(X1_train.shape[0], 9)
        yv_pred = bst0.predict(xgv_valid).reshape(X_valid.shape[0], 9)
        
        #Calibrated RF
        rf = RandomForestClassifier(n_estimators=600, criterion='gini', 
                                    class_weight='auto', max_features='auto')
        cal = CalibratedClassifierCV(rf, method='isotonic', cv=3)        
        cal.fit(X0_train, y0_train)
        y0_cal = cal.predict_proba(X1_train)
        yv_cal = cal.predict_proba(X_valid)
        
        #Second half
        ss = StandardScaler()
        y0_pred = ss.fit_transform(y0_pred)
        yv_pred = ss.fit_transform(yv_pred)
        y0_cal = ss.fit_transform(y0_cal)
        yv_cal = ss.fit_transform(yv_cal)
        X1_train = np.hstack((X1_train, y0_pred, y0_cal))
        X_valid = np.hstack((X_valid, yv_pred, yv_cal))        
        w1_train = np.zeros(len(y1_train))
        
#        self.param['eta'] = 0.05
        self.num_round = 450

        for i in range(len(w1_train)):
            w1_train[i] = self.w[int(y1_train[i])]
        xg1_train = DMatrix(X1_train, label=y1_train, weight=w1_train)    
        xg_valid = DMatrix(X_valid, label=y_valid)
        watchlist = [(xg1_train,'train'), (xg_valid, 'validation')]
        
#        bst1 = train(self.param, xg1_train, self.num_round, watchlist)
        bst1 = my_train_xgboost(self.param, xg1_train, self.num_round, watchlist)
        y_pred = bst1.predict(xg_valid).reshape(X_valid.shape[0], 9)

#        pdb.set_trace()
        return y_pred
Example #21
 def prepare_model(self, obj_fn=None, num_steps=None, model_params=None, batch_size: int = None):
     if model_params is None:
         model_params = dict(n_estimators=25, min_samples_split=2)
     model = CalibratedClassifierCV(RandomForestClassifier(**model_params), method="sigmoid")
     model_clf = model.fit(self.ds[self.data_groups["data_train_group"]].to_ndarray(),
                           self.ds[self.data_groups["target_train_group"]].to_ndarray())
     cal_model = CalibratedClassifierCV(model_clf, method="sigmoid", cv="prefit")
     cal_model.fit(self.ds[self.data_groups["data_validation_group"]].to_ndarray(),
                   self.ds[self.data_groups["target_validation_group"]].to_ndarray())
     return self.ml_model(cal_model)
Example #22
def test_calibration_prob_sum():
    # Test that sum of probabilities is 1. A non-regression test for
    # issue #7796
    num_classes = 2
    X, y = make_classification(n_samples=10, n_features=5,
                               n_classes=num_classes)
    clf = LinearSVC(C=1.0)
    clf_prob = CalibratedClassifierCV(clf, method="sigmoid", cv=LeaveOneOut())
    clf_prob.fit(X, y)

    probs = clf_prob.predict_proba(X)
    assert_array_almost_equal(probs.sum(axis=1), np.ones(probs.shape[0]))
Example #23
def test_calibration_nan_imputer():
    """Test that calibration can accept nan"""
    X, y = make_classification(n_samples=10, n_features=2,
                               n_informative=2, n_redundant=0,
                               random_state=42)
    X[0, 0] = np.nan
    clf = Pipeline(
        [('imputer', SimpleImputer()),
         ('rf', RandomForestClassifier(n_estimators=1))])
    clf_c = CalibratedClassifierCV(clf, cv=2, method='isotonic')
    clf_c.fit(X, y)
    clf_c.predict(X)
Example #24
def trainrf(model_id,train_x,train_y,valid_x,valid_y,test_x):
    train_x,train_y=shuffle(train_x,train_y)


    random_state=random.randint(0, 1000000)
    print('random state: {state}'.format(state=random_state))

    clf = RandomForestClassifier(n_estimators=random.randint(50,5000),
                                 criterion='gini',
                                 max_depth=random.randint(10,1000),
                                 min_samples_split=random.randint(2,50),
                                 min_samples_leaf=random.randint(1,10),
                                 min_weight_fraction_leaf=random.uniform(0.0,0.5),
                                 max_features=random.uniform(0.1,1.0),
                                 max_leaf_nodes=random.randint(2,10),
                                 bootstrap=False,
                                 oob_score=False,
                                 n_jobs=30,
                                 random_state=random_state,
                                 verbose=0,
                                 warm_start=True,
                                 class_weight=None
                )

    clf.fit(train_x, train_y)

    valid_predictions1 = clf.predict_proba(valid_x)
    test_predictions1= clf.predict_proba(test_x)

    t1 = test(valid_y,valid_predictions1)

    ccv = CalibratedClassifierCV(base_estimator=clf,method="sigmoid",cv='prefit')
    ccv.fit(valid_x,valid_y)

    valid_predictions2 = ccv.predict_proba(valid_x)
    test_predictions2= ccv.predict_proba(test_x)

    t2 = test(valid_y,valid_predictions2)

    if t2<t1:
        valid_predictions=valid_predictions2
        test_predictions=test_predictions2
        t=t2
    else:
        valid_predictions=valid_predictions1
        test_predictions=test_predictions1
        t=t1

    if t < 0.450:
        data.saveData(valid_predictions,"../valid_results/valid_"+str(model_id)+".csv")
        data.saveData(test_predictions,"../results/results_"+str(model_id)+".csv")
Example #25
def hold_out_evaluation(classifier, x, y, test_size=0.2, calibrate=False):
    x_train, y_train, x_valid, y_valid = stratified_split(x, y, test_size)

    # Train
    if calibrate:
        # Make training and calibration
        calibrated_classifier = CalibratedClassifierCV(classifier, method='isotonic', cv=get_cv(y_train))
        fitted_classifier = calibrated_classifier.fit(x_train, y_train)
    else:
        fitted_classifier = classifier.fit(x_train, y_train)
    # Evaluate
    score = log_loss(y_valid, fitted_classifier.predict_proba(x_valid))

    return score
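stratified_split and get_cv are project helpers that are not shown here; a plausible minimal sketch of equivalents (assumptions, not the original project's code) could be:

from sklearn.model_selection import StratifiedKFold, train_test_split

def stratified_split(x, y, test_size=0.2, random_state=0):
    # note the return order expected above: x_train, y_train, x_valid, y_valid
    x_train, x_valid, y_train, y_valid = train_test_split(
        x, y, test_size=test_size, stratify=y, random_state=random_state)
    return x_train, y_train, x_valid, y_valid

def get_cv(y, n_splits=5):
    # any CV splitter accepted by CalibratedClassifierCV's cv argument works here
    return StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)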
Example #26
def get_model(params, X, y_array, y_ix, reps):
    y = y_array[:, y_ix]
    params['bootstrap'] = False
    params['oob_score'] = False
    params['n_jobs'] = -1

    clf = RandomForestClassifier(**params)
    cclf = CalibratedClassifierCV(base_estimator=clf,
                                  method='isotonic',
                                  cv=makeKFold(3, y, reps))
    weight = y.shape[0] / (2 * np.bincount(y))
    sample_weight = np.array([weight[i] for i in y])
    cclf.fit(X, y, sample_weight)
    return cclf
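The weight expression above is the usual inverse-class-frequency balancing, n_samples / (2 * class_count); a quick worked check with illustrative values:

import numpy as np

y = np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1])    # 8 negatives, 2 positives
weight = y.shape[0] / (2 * np.bincount(y))       # array([0.625, 2.5])
sample_weight = weight[y]                        # 0.625 for every 0, 2.5 for every 1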
Example #27
def setTrainDataAndMakeModel(X_train,Y_train,X_test):
    model =  RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=27, max_features='log2', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=5,
            min_weight_fraction_leaf=0.0, n_estimators=25, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)       

    model.fit(X_train,Y_train)
    calibrated_clf = CalibratedClassifierCV(model, method='isotonic', cv=5)
    calibrated_clf.fit(X_train, Y_train)
    ypreds = calibrated_clf.predict_proba(X_test)
    return ypreds
    
Example #28
def gb_calib_scale(x,y,x_test,seed):
    # normalize x+x_test
    x_rows = x.shape[0]
    X = preprocessing.scale(np.vstack((x,x_test)))
    x = X[:x_rows,:]
    x_test = X[x_rows:, :]
    print x.shape
    print x_test.shape
    model0 = SVC(probability=True, class_weight='auto', random_state=seed,
        C=1,gamma=0.1)
    # work around sklearn API quirks with a thin wrapper
    class WrapClassifier:
        def __init__(self, est):
            self.est = est
        def predict(self, X):
            return self.est.predict_proba(X)[:,1][:,np.newaxis]
        def fit(self, X, y, sample_weight):
            self.est.fit(X, y, sample_weight)
    model01 = WrapClassifier(model0)            
    model1 = GradientBoostingClassifier(max_depth=3, random_state=seed, learning_rate=0.1, n_estimators=100, max_features=400)
    model = BaggingClassifier(model1, n_jobs=-1, random_state=seed)
    # avg CV AUC PLS
    n_folds = 10
    cv = StratifiedKFold(y, n_folds=n_folds, random_state=seed)
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    probs_list = []
    calib = CalibratedClassifierCV(model, cv=n_folds, method='sigmoid')
    for i, (train, test) in enumerate(cv):
        probas_ = calib.fit(x[train], y[train]).predict_proba(x[test])
        #probas_ = model.fit(x[train], y[train]).predict_proba(x[test])
        #probs_list.append(model.predict_proba(x_test))
        # Compute ROC curve and area the curve
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
    mean_tpr /= len(cv)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    print('Training set 10CV AUC:\n{}'.format(mean_auc))
    # return probs
    #model = SVC(probability=True, random_state=seed)
    #model = model.fit(x, y)
    #probs = np.average(probs_list, axis=0)
    probs = np.average([cls.predict_proba(x_test) for cls in calib.calibrated_classifiers_], axis=0)
    print probs.shape
    #print('Training set acc:\n{}'.format(model2.score(x, y)))
    #bids_test_probs = model2.predict_proba(x_test)
    return probs
Example #29
def svm_boost_isotonic_scale_train(x, y, x_test, seed):
    x_rows = x.shape[0]
    X = preprocessing.scale(np.vstack((x,x_test)))
    x = X[:x_rows,:]
    x_test = X[x_rows:, :]

    #x_tr, x_val, y_tr, y_val = train_test_split(
    #    x, y, test_size=0.1, random_state=seed)

    model = SVC(probability=True, class_weight='auto', random_state=seed)
    boosted = AdaBoostClassifier(model, random_state=seed)
    calib = CalibratedClassifierCV(boosted, cv=2, method='isotonic')
    calib.fit(x, y)
    probs = calib.predict_proba(x_test)
    return probs
Example #30
def predict_proba(clfs,X,y,X_test,weights,calibartion=False):
    skf = StratifiedKFold(y, n_folds=5,random_state=571)
    n = len(clfs)
    preds = []
    for clf in clfs:
        if calibartion == True:
            clf = CalibratedClassifierCV(clf,method="isotonic",cv=skf)
        clf.fit(X,y)
        y_pred = clf.predict_proba(X_test)
        preds.append(y_pred)
        
    final_pred = preds.pop(0)
    for pred,weight in zip(preds,weights):
        final_pred += weight * pred
    final_pred = final_pred/np.array(weights).sum()
    return final_pred
Example #31
File: run.py Project: k-ivey/FastSK
    args.t,
    args.approx,
    args.I,
    args.delta,
)
skip_variance = args.skip_variance

### Read the data
reader = FastaUtility()
Xtrain, Ytrain = reader.read_data(train_file)
Xtest, Ytest = reader.read_data(test_file)
Ytest = np.array(Ytest).reshape(-1, 1)

### Compute the fastsk kernel
start = time.time()
fastsk = FastSK(
    g=g, m=m, t=t, approx=approx, max_iters=I, delta=d, skip_variance=skip_variance
)

fastsk.compute_kernel(Xtrain, Xtest)
end = time.time()
print("Kernel computation time: ", end - start)
Xtrain = fastsk.get_train_kernel()
Xtest = fastsk.get_test_kernel()

### Use linear SVM
svm = LinearSVC(C=C)
clf = CalibratedClassifierCV(svm, cv=5).fit(Xtrain, Ytrain)
acc, auc = evaluate_clf(clf, Xtest, Ytest)
print("Linear SVM:\n\tAcc = {}, AUC = {}".format(acc, auc))
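evaluate_clf is defined elsewhere in the FastSK project; a plausible minimal equivalent (an assumption, not the project's actual code) would derive accuracy and ROC-AUC from the calibrated classifier:

import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score

def evaluate_clf(clf, Xtest, Ytest):
    Ytest = np.asarray(Ytest).ravel()
    preds = clf.predict(Xtest)
    probs = clf.predict_proba(Xtest)[:, 1]
    return accuracy_score(Ytest, preds), roc_auc_score(Ytest, probs)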
Example #32
def expressionTest():
	
	trainX,trainY = pickle.load(open('MultiPieTrainExpression_XY.p','rb'))
	testX,testY = pickle.load(open('MultiPieValidationExpression_XY.p','rb'))
	
	
	f1 = open('RandomForestCompareResExpression.txt','w+')		
	

	clf_uncalibrated = RandomForestClassifier(n_estimators=1000,random_state=15325)
	clf_uncalibrated = clf_uncalibrated.fit(trainX,trainY)
	
	clf = CalibratedClassifierCV(clf_uncalibrated, cv=3, method='sigmoid')
	clf.fit(trainX, trainY)
	pickle.dump(clf,open('expression_multipie_rf_calibrated.p','wb'))
	
	
	#clf=pickle.load(open('gender_randomForest.p', 'rb'))
	
	
	tn = time.time()
	probX = clf.predict_proba(testX)
	preY = clf.predict(testX)
	et = time.time() - tn
	print probX
	
	#testY, probX[:,1]
	
	#storing the images along with probability values in the validation dataset.
	imageNumbers = range(0,len(testY))
	print imageNumbers
	with open('rf_expression_imageProbabilities.csv', 'wb') as f:
		writer = csv.writer(f)	
		rows = zip(imageNumbers,probX[:,1])
		for row in rows:
			writer.writerow(row)
	
	#calculation and plot of roc_auc for male
	totalMale = testY.count(1)
	totalNotMale = testY.count(0)
	print totalMale
	print totalNotMale
	#totalMale = sum(testY==1)*1.0
	#totalNotMale = sum(testY==0)*1.0
	roc_auc = dict()
	fpr, tpr, thresholds = roc_curve(testY, probX[:,1], pos_label=1)
	roc_auc = auc(fpr, tpr)	
	print>>f1,roc_auc
	print>>f1,fpr
	print>>f1,tpr
	print>>f1,'thresholds'
	print>>f1,thresholds
	
	
	#storing the threshold value along with the tpr and fpr values.
	with open('rf_expression_threhsolds.csv', 'wb') as f:
		writer = csv.writer(f)	
		rows = zip(thresholds,tpr,fpr)
		for row in rows:
			writer.writerow(row)
	   
	#pickle.dump([fpr,tpr,thresholds],open('rf_threshold.p','wb'))
	'''
	print>>f1,'total detection'
	print>>f1,fpr+tpr
	'''
	
	plt.title('Receiver Operating Characteristic')
	plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
	plt.legend(loc = 'lower right')
	plt.plot([0, 1], [0, 1],'r--')
	plt.xlim([0, 1])
	plt.ylim([0, 1])
	plt.ylabel('True Positive Rate')
	plt.xlabel('False Positive Rate')
	plt.savefig('rocExpressionRF.png')	
	plt.show()
	
	
	#choosing the best threshold on roc curve	
	mindist = 100;
	minI=0;
	for i in range(len(fpr)):
		a = np.array((fpr[i],tpr[i]))
		b = np.array((0,1))
		dist_a_b = distance.euclidean(a,b)
		if dist_a_b < mindist:
			mindist = dist_a_b
			minI =i
			minX = fpr[minI]
			minY = tpr[minI]
			threshold = thresholds[minI]
			
	print>>f1, 'minX :%f,  minY: %f, mindist:%f, Threshold = %f, minI =%d , fpr[min]=%f, tpr[min]=%f, false detection value= %f, true detection value =%f '%(minX,minY,mindist,threshold,minI, fpr[minI],tpr[minI], fpr[minI]*totalNotMale,tpr[minI]*totalMale)

	#storing the selectivity and accuracy values 
	
	prec = sum(preY == testY)*1.0/len(preY)
	select_1 = sum(preY==1)*1.0/len(preY) #Male
	select_2 = sum(preY==0)*1.0/len(preY) #Female
	f1.write('Gender time: %f acc. %f select_1 %f preY:%f testY:%f testX:%f select_2 %f\n'%\
	(et/len(preY),prec,select_1,len(preY),len(testY),len(testX),select_2))
	list1, list2 = (list(x) for x in zip(*sorted(zip(probX[:,1], testY), key=lambda pair: pair[0])))
	
	print list1
	print list2
	# lower correct is for low threshold
	lowerCorrect = (list2.index(1))
	higherCorrect = ((list2[::-1].index(0) + 1) -1)
	yesAccuracy = float(higherCorrect)/ len(list2)
	noAccuracy = float(lowerCorrect) / len(list2)
	
	print 'higherCorrect : %d, yesAccuracy : %f,  lowerCorrect: %d, noAccuracy: %f'%(higherCorrect,yesAccuracy,lowerCorrect,noAccuracy)  
	print 'lower threshold: %f ,   upperThreshold: %f'%(list1[lowerCorrect],list1[len(list1)-higherCorrect])
	print>>f1,'higherCorrect : %d, yesAccuracy : %f,  lowerCorrect: %d, noAccuracy: %f'%(higherCorrect,yesAccuracy,lowerCorrect,noAccuracy)  
	print>>f1,'lower threshold: %f ,   upperThreshold: %f'%(list1[lowerCorrect],list1[len(list1)-higherCorrect])
	
	for prob in (probX):
		print>>f1, prob
	print>>f1, 'predicted value'
	for pred in preY :
		print>>f1, pred
	print>>f1, 'True value'
	for truth in testY :
		print>>f1, truth
		
	for sortedProb in list1 :
		print>>f1, sortedProb
	for sortedTruth in list2 :
		print>>f1, sortedTruth
	
	f1.flush()
Example #33
def get_hyperparameters(model):
    """
    Generates the models with different hyperparameters to be trained and evaluated
    using spatial cross validation.
    
    Args:
        model (str) : A string indicating the model or classifier to fetch hyperparameters
                      for. Supported models include 'logistic_regression', 'random_forest',
                      and 'linear_svc'.
        
    Returns:
        models (list) : A list of models, where each model is instantiated using different
                        hyperparameter settings.
        labels (list) : A list of labels indicating the corresponding model hyperparameters
                        in string format. The labels are used for plotting charts and file
                        naming schemes.
    """

    if model == 'logistic_regression':
        param_grid = {'penalty': ['l2', 'l1'], 'C': [0.001, 0.01, 0.1, 1]}
        params = list(
            itertools.product(*[param_grid[param] for param in param_grid]))
        models, labels = [], []
        for param in params:
            models.append(LogisticRegression(penalty=param[0], C=param[1]))
            labels.append('penalty={}, C={:.3f}'.format(param[0], param[1]))

        return models, labels

    if model == 'linear_svc':
        param_grid = {
            'C': [0.001, 0.01, 0.1, 1],
        }
        params = list(
            itertools.product(*[param_grid[param] for param in param_grid]))

        models, labels = [], []
        for param in params:
            models.append(
                CalibratedClassifierCV(LinearSVC(C=param[0],
                                                 random_state=SEED)))
            labels.append('C={:.3f}'.format(param[0]))

        return models, labels

    if model == 'random_forest':
        param_grid = {
            'n_estimators': [100, 300, 500, 800, 1200],
            'max_depth': [5, 8, 12],
            'min_samples_split': [2, 5, 10, 15],
            'min_samples_leaf': [1, 2, 5, 10]
        }
        params = list(
            itertools.product(*[param_grid[param] for param in param_grid]))

        # Randomly sample 5 parameter settings due to
        # the large number of combinations
        random.seed(SEED)
        params = random.sample(params, 5)

        models, labels = [], []
        for param in params:
            models.append(
                RandomForestClassifier(n_estimators=param[0],
                                       max_depth=param[1],
                                       min_samples_split=param[2],
                                       min_samples_leaf=param[3],
                                       random_state=SEED))
            labels.append(
                'n_estimators={}, max_depth={}, min_samples_split={}, min_samples_leaf={}'
                .format(param[0], param[1], param[2], param[3]))

        return models, labels
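A sketch of how the returned (models, labels) pairs might be consumed; the original project evaluates them with spatial cross validation, while this simplified example (data and scoring are assumptions, and the module-level imports plus the SEED constant used above are presumed available) uses plain k-fold scoring:

from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score

X, y = make_classification(n_samples=500, random_state=0)
models, labels = get_hyperparameters('linear_svc')
for model, label in zip(models, labels):
    scores = cross_val_score(model, X, y, cv=5, scoring='roc_auc')
    print('{}: mean AUC = {:.3f}'.format(label, scores.mean()))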
Example #34
def main(configuration_path, signal_path, background_path, predictions_path,
         model_path, verbose):
    '''
    Train a classifier on signal and background monte carlo data and write the model
    to MODEL_PATH in pmml or pickle format.

    CONFIGURATION_PATH: Path to the config yaml file

    SIGNAL_PATH: Path to the signal data

    BACKGROUND_PATH: Path to the background data

    PREDICTIONS_PATH : path to the file where the mc predictions are stored.

    MODEL_PATH: Path to save the model to. Allowed extensions are .pkl and .pmml.
        If extension is .pmml, then both pmml and pkl file will be saved
    '''
    log = setup_logging(verbose=verbose)

    check_extension(predictions_path)
    check_extension(model_path, allowed_extensions=['.pmml', '.pkl', '.onnx'])

    config = AICTConfig.from_yaml(configuration_path)
    model_config = config.separator
    label_text = model_config.output_name

    log.info('Loading signal data')
    df_signal = read_telescope_data(
        signal_path,
        config,
        model_config.columns_to_read_train,
        feature_generation_config=model_config.feature_generation,
        n_sample=model_config.n_signal)
    df_signal['label_text'] = 'signal'
    df_signal['label'] = 1

    log.info('Loading background data')
    df_background = read_telescope_data(
        background_path,
        config,
        model_config.columns_to_read_train,
        feature_generation_config=model_config.feature_generation,
        n_sample=model_config.n_background)
    df_background['label_text'] = 'background'
    df_background['label'] = 0

    df = pd.concat([df_background, df_signal], ignore_index=True)

    df_train = convert_to_float32(df[model_config.features])
    log.debug('Total training events: {}'.format(len(df_train)))

    df_train.dropna(how='any', inplace=True)
    log.debug('Training events after dropping nans: {}'.format(len(df_train)))

    label = df.loc[df_train.index, 'label']

    # load optional columns if available to be able to make performance plots
    # vs true energy / size
    if config.true_energy_column is not None:
        true_energy = df.loc[df_train.index,
                             config.true_energy_column].to_numpy()
    if config.size_column is not None:
        size = df.loc[df_train.index, config.size_column].to_numpy()

    n_gammas = len(label[label == 1])
    n_protons = len(label[label == 0])
    log.info(
        'Training classifier with {} background and {} signal events'.format(
            n_protons, n_gammas))
    log.debug(model_config.features)

    # save prediction_path for each cv iteration
    cv_predictions = []

    # iterate over test and training sets
    X = df_train.values
    y = label.values
    n_cross_validations = model_config.n_cross_validations
    classifier = model_config.model

    log.info(
        'Starting {} fold cross validation... '.format(n_cross_validations))

    stratified_kfold = model_selection.StratifiedKFold(
        n_splits=n_cross_validations, shuffle=True, random_state=config.seed)

    aucs = []
    cv_it = stratified_kfold.split(X, y)
    for fold, (train, test) in enumerate(tqdm(cv_it,
                                              total=n_cross_validations)):
        # select data
        xtrain, xtest = X[train], X[test]
        ytrain, ytest = y[train], y[test]

        # fit and predict
        classifier.fit(xtrain, ytrain)

        y_probas = classifier.predict_proba(xtest)[:, 1]

        cv_df = pd.DataFrame({
            'label': ytest,
            model_config.output_name: y_probas,
            'cv_fold': fold,
        })
        if config.true_energy_column is not None:
            cv_df[config.true_energy_column] = true_energy[test]
        if config.size_column is not None:
            cv_df[config.size_column] = size[test]
        cv_predictions.append(cv_df)
        aucs.append(metrics.roc_auc_score(ytest, y_probas))

    aucs = np.array(aucs)
    log.info('Cross-validation ROC-AUCs: {}'.format(aucs))
    log.info('Mean AUC ROC : {:.3f} ± {:.3f}'.format(aucs.mean(), aucs.std()))

    predictions_df = pd.concat(cv_predictions, ignore_index=True)
    log.info('Writing predictions from cross validation')
    write_data(predictions_df, predictions_path, mode='w')

    # set random seed again to make sure different settings
    # for n_cross_validations don't change the final model
    np.random.seed(config.seed)
    classifier.random_state = config.seed

    if model_config.calibrate_classifier:
        log.info('Training calibrated classifier')
        classifier = CalibratedClassifierCV(classifier, cv=2, method='sigmoid')
        classifier.fit(X, y)
    else:
        log.info('Training model on complete dataset')
        classifier.fit(X, y)

    log.info('Saving model to {} ...'.format(model_path))
    save_model(classifier,
               model_path=model_path,
               label_text=label_text,
               feature_names=list(df_train.columns))
Example #35
y[:n_samples // 2] = 0
y[n_samples // 2:] = 1
sample_weight = np.random.RandomState(42).rand(y.shape[0])

# split train, test for calibration
X_train, X_test, y_train, y_test, sw_train, sw_test = \
    train_test_split(X, y, sample_weight, test_size=0.9, random_state=42)

# Gaussian Naive-Bayes with no calibration
clf = GaussianNB()
clf.fit(X_train, y_train)  # GaussianNB itself does not support sample-weights
prob_pos_clf = clf.predict_proba(X_test)[:, 1]

# Gaussian Naive-Bayes with isotonic calibration
clf_isotonic = CalibratedClassifierCV(clf, cv=2, method='isotonic')
clf_isotonic.fit(X_train, y_train, sw_train)
prob_pos_isotonic = clf_isotonic.predict_proba(X_test)[:, 1]

# Gaussian Naive-Bayes with sigmoid calibration
clf_sigmoid = CalibratedClassifierCV(clf, cv=2, method='sigmoid')
clf_sigmoid.fit(X_train, y_train, sw_train)
prob_pos_sigmoid = clf_sigmoid.predict_proba(X_test)[:, 1]

print("Brier scores: (the smaller the better)")

clf_score = brier_score_loss(y_test, prob_pos_clf, sw_test)
print("No calibration: %1.3f" % clf_score)

clf_isotonic_score = brier_score_loss(y_test, prob_pos_isotonic, sw_test)
print("With isotonic calibration: %1.3f" % clf_isotonic_score)
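The excerpt appears to end here; presumably it would go on to report the sigmoid-calibrated Brier score in the same way (a hedged reconstruction, not necessarily the original continuation):

clf_sigmoid_score = brier_score_loss(y_test, prob_pos_sigmoid, sw_test)
print("With sigmoid calibration: %1.3f" % clf_sigmoid_score)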
Example #36
 def _svc(self):
     # self.model = SVC(kernel='linear', C=1000)
     self.model = LinearSVC(C=200)
     self.clf = CalibratedClassifierCV(self.model, method='sigmoid')
     self.clf.fit(self.features, self.dialects)
     self.model.fit(self.features, self.dialects)
Example #37
def majority_vote_cl(v: int,
                     n: int,
                     delta: float,
                     train_file: str,
                     test_file: str,
                     min_token_freq: int = 1,
                     max_token_freq: float = 1.0):
    """
    Entry point of program.
    :param max_token_freq: ignore terms that have a document frequency strictly higher than the given proportion.
    :param min_token_freq: ignore terms that have a document frequency strictly lower than the given threshold.
    :param v: Vocabulary choice
    :param n: ngram choice
    :param delta: Smoothing choice
    :param train_file: Path to training data
    :param test_file: Path to testing data
    :return: void
    """
    validate_params(v, n, delta, train_file, test_file)

    # Process data
    train_data = pd.read_csv(
        train_file,
        delimiter='\t',
        names=[DF_COLUMN_ID, DF_COLUMN_NAME, DF_COLUMN_LANG, DF_COLUMN_TWEET])
    test_data = pd.read_csv(
        test_file,
        delimiter='\t',
        names=[DF_COLUMN_ID, DF_COLUMN_NAME, DF_COLUMN_LANG, DF_COLUMN_TWEET])

    lang_mapping, inv_lang_mapping = encode_class_labels(
        train_data[DF_COLUMN_LANG])
    train_data[DF_COLUMN_LANG] = train_data[DF_COLUMN_LANG].map(lang_mapping)
    custom_transform_to_vocab(train_data, v)
    custom_transform_to_vocab(test_data, v)

    # Prepare features (Ngrams and their weights)
    tfidf = TfidfVectorizer(analyzer='char_wb',
                            lowercase=False,
                            ngram_range=(n, n),
                            min_df=min_token_freq,
                            max_df=max_token_freq)
    features = tfidf.fit_transform(train_data[DF_COLUMN_TWEET]).toarray()
    labels = train_data[DF_COLUMN_LANG]

    # Define Estimators
    svc = LinearSVC()
    svc_calibrated = CalibratedClassifierCV(svc)
    lr = LogisticRegression(multi_class='multinomial', max_iter=500)
    estimators = [('lr', lr), ('svc_calibrated', svc_calibrated)]

    # Train model
    voting_classifier = VotingClassifier(estimators=estimators,
                                         voting='soft',
                                         n_jobs=-1)
    voting_classifier.fit(features, labels)

    # Calculate scores
    features_test = tfidf.transform(test_data[DF_COLUMN_TWEET])
    guess = voting_classifier.predict(features_test)
    scores = voting_classifier.predict_proba(features_test)

    # Finalize results
    results = prepare_result_df(test_data)
    results[DF_COLUMN_SCORE] = scores
    results[DF_COLUMN_GUESS] = guess
    results[DF_COLUMN_GUESS] = results[DF_COLUMN_GUESS].map(inv_lang_mapping)
    results = finalize_result_df(results)
    generate_trace_file(v, n, delta, results)

    # Evaluation stats
    print(
        "\nEvaluating Majority Vote classifier with parameters: [vocabulary = {}, ngram size = {}, delta = {}]"
        .format(v, n, delta))
    evaluate_results(results, v, n, delta)
    return results
Example #38
def trainAndSaveMetaClassifier(X_metaTraining, y_meta, classifierType):
    '''
	Trains a meta classifier from a training feature vector, built from data dedicated to the meta classifier training, called X_metaTraining.
	@param X_metaTraining: Feature vector for the meta classifier training. Predictions from low layers will be done from this vector.
	@param y_meta: Truth vector regarding the X_metaTraining vector
	@param classifierType: The type of classifier to train. Possible values are 'color-histogram', 'face-detection', 'lbp', 'object-detection'.
    '''
    input_meta_image = getInputMetaImageFromLowClassifiers(
        X_metaTraining, classifierType)

    # Training the meta classifier from the outputs of low classifier for the current feature
    from sklearn.model_selection import cross_val_score
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.calibration import CalibratedClassifierCV
    from sklearn.svm import LinearSVC

    print('Training meta classifier for ' + classifierType +
          ' -- This may take some time..')

    classifiers = [
        MultinomialNB(),
        RandomForestClassifier(random_state=0,
                               n_estimators=20,
                               n_jobs=-1,
                               max_features=None),
        DecisionTreeClassifier(random_state=0, max_features=None),
        LinearSVC(random_state=0)
    ]

    best_classifier = None
    best_accuracy = 0
    best_stdDev = 0

    for clf in classifiers:
        current_scores = cross_val_score(clf,
                                         input_meta_image,
                                         y_meta,
                                         cv=20,
                                         scoring='accuracy',
                                         n_jobs=2,
                                         verbose=1)
        if current_scores.mean() > best_accuracy:
            best_accuracy = current_scores.mean()
            best_classifier = clf
            best_stdDev = current_scores.std()

    # Training the classifier on the whole data
    if isinstance(best_classifier, LinearSVC):
        best_classifier = CalibratedClassifierCV(best_classifier)
    best_classifier.fit(input_meta_image, y_meta)

    # Saving the classifier
    import pickle
    pickle.dump(
        best_classifier,
        open(
            'trained-classifiers/low-classifiers/' + classifierType +
            '/meta/' + classifierType + '-meta.p', "wb"))

    print('Best classifier saved for meta image:', best_classifier)
    print('Best accuracy:', best_accuracy)
    print('Standard deviation', best_stdDev)
Example #39
class Classifiers:
    """
    Usage:
           1. Generating models:
           After creating an instance of this class, invoke the 'training'
           method with the path to the training set as its parameter; the
           resulting model is then available through the 'model' attribute

           2. Test model:
           Invoke the 'testing' method with the path to the test set as its
           parameter; it returns the probability matrix, fused labels and
           raw predictions
    """
    def __init__(self, n_value, intercept, mode='character'):
        self.n_value = n_value
        self.mode = mode
        if self.mode != 'character' and self.mode != 'word':
            raise ValueError('the mode has to be either character or word')
        self.features = None
        self.dialects = list()
        self.features_names = list()
        self.length = 0
        self.width = 0
        self.model = None
        self.clf = None
        self.intercept = intercept
        self.test_dialects = None

    def _char_n_grams(self, sentence):
        return [
            sentence[i:i + self.n_value]
            for i in range(len(sentence) - self.n_value + 1)
        ]

    def _word_n_grams(self, sentence):
        ngram = list()
        sentence = sentence.strip().split(" ")
        for i in range(len(sentence) - self.n_value + 1):
            gram = ""
            for j in range(self.n_value):
                gram = gram + sentence[i + j]
            ngram.append(gram)
        return ngram

    def training(self, training_set_path):

        sentences = list()

        with open(training_set_path, 'r', encoding='utf8') as training_file:
            for line in training_file:
                the_sentence, the_dialect = line.strip().split('\t')
                the_sentence = the_sentence.strip()
                sentences.append('#' + the_sentence + '#')
                self.dialects.append(the_dialect)

        if self.mode == 'character':
            tfidf = TfidfVectorizer(ngram_range=(self.n_value, self.n_value),
                                    analyzer='char')
            self.features = tfidf.fit_transform(sentences).toarray()
            self.features_names = tfidf.get_feature_names()

        elif self.mode == 'word':
            tfidf = TfidfVectorizer(ngram_range=(self.n_value, self.n_value),
                                    analyzer='word')
            self.features = tfidf.fit_transform(sentences).toarray()
            self.features_names = tfidf.get_feature_names()

        self.length = len(sentences)
        self.width = self.features.shape[1]

        self._svc()

    def _svc(self):
        # self.model = SVC(kernel='linear', C=1000)
        self.model = LinearSVC(C=200)
        self.clf = CalibratedClassifierCV(self.model, method='sigmoid')
        self.clf.fit(self.features, self.dialects)
        self.model.fit(self.features, self.dialects)

    def testing(self, testing_set_path):
        test_sentences = []
        #self.test_dialects = []
        with open(testing_set_path, 'r', encoding='utf8') as test_file:
            for line in test_file:
                #s, label = line.strip().split('\t')
                #s = s.strip()
                s = line.strip()
                test_sentences.append('#' + s + '#')
                #self.test_dialects.append(label)

        s_feat = []

        if self.mode == 'character':
            for s in test_sentences:
                ngram = self._char_n_grams(s)
                s_feat.append(set(ngram))
        elif self.mode == 'word':
            for s in test_sentences:
                ngram = self._word_n_grams(s)
                s_feat.append(set(ngram))

        test_features = np.zeros((self.length, self.width), dtype=np.int8)

        for i, s in enumerate(s_feat):
            for j, ngram in enumerate(self.features_names):
                if ngram in s:
                    test_features[i, j] += 1

        #for i in range(self.length-len(test_sentences)):
        #   self.test_dialects.append(self.intercept)

        result = self.model.predict(X=test_features)
        #f1_score = sklearn.metrics.f1_score(self.test_dialects, result[:len(self.test_dialects)], average='macro')

        probability_matrix, label = fusion_methods.mean_probability_rule(
            test_features, self.clf)

        # score = self.model.score(test_features, test_dialects)
        # accuracy = ((3000 * score - 2500) / 2000) * 100

        return probability_matrix, label, result

    def get_test_dialects(self):
        return self.test_dialects
#   calibration (see :ref:`User Guide <calibration>`)
#
# Calibration curves for all 4 conditions are plotted below, with the average
# predicted probability for each bin on the x-axis and the fraction of positive
# classes in each bin on the y-axis.

import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec

from sklearn.calibration import CalibratedClassifierCV, CalibrationDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

lr = LogisticRegression(C=1.0)
gnb = GaussianNB()
gnb_isotonic = CalibratedClassifierCV(gnb, cv=2, method="isotonic")
gnb_sigmoid = CalibratedClassifierCV(gnb, cv=2, method="sigmoid")

clf_list = [
    (lr, "Logistic"),
    (gnb, "Naive Bayes"),
    (gnb_isotonic, "Naive Bayes + Isotonic"),
    (gnb_sigmoid, "Naive Bayes + Sigmoid"),
]

# %%
fig = plt.figure(figsize=(10, 10))
gs = GridSpec(4, 2)
colors = plt.cm.get_cmap("Dark2")

ax_calibration_curve = fig.add_subplot(gs[:2, :2])
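# %%
# The plotting loop is truncated here; what follows is a hedged sketch of how
# it presumably continues, fitting each classifier in ``clf_list`` and drawing
# its calibration curve with ``CalibrationDisplay``. The names ``X_train``,
# ``X_test``, ``y_train`` and ``y_test`` are assumed to come from an earlier
# train/test split that is not shown in this snippet.
calibration_displays = {}
for i, (clf, name) in enumerate(clf_list):
    clf.fit(X_train, y_train)
    display = CalibrationDisplay.from_estimator(
        clf,
        X_test,
        y_test,
        n_bins=10,
        name=name,
        ax=ax_calibration_curve,
        color=colors(i),
    )
    calibration_displays[name] = display

ax_calibration_curve.grid()
ax_calibration_curve.set_title("Calibration plots (Naive Bayes)")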
Example #41
0
def fscore(params_org):
    #print(params_org)
    parambk = copy.deepcopy(params_org)
    ifError = 0
    global best, HPOalg, params_best, errorcount
    params = params_org['classifier']
    classifier = params.pop('name')
    p_random_state = params.pop('random_state')
    
    if (classifier == 'SVM'):  
        param_value= params.pop('gamma_value')
        if(params['gamma'] == "value"):
            params['gamma'] = param_value
        else:
            pass   
        clf = SVC(max_iter = 10000, cache_size= 700, random_state = p_random_state,**params)
        #max_iter=10000 and cache_size= 700 https://github.com/EpistasisLab/pennai/issues/223
        #maxvalue https://github.com/hyperopt/hyperopt-sklearn/blob/fd718c44fc440bd6e2718ec1442b1af58cafcb18/hpsklearn/components.py#L262
    elif(classifier == 'RF'):        
        clf = RandomForestClassifier(random_state = p_random_state, **params)
    elif(classifier == 'KNN'):
        p_value = params.pop('p')
        if(p_value==0):
            params['metric'] = "chebyshev"
        elif(p_value==1):
            params['metric'] = "manhattan"
        elif(p_value==2):
            params['metric'] = "euclidean"
        else:
            params['metric'] = "minkowski"
            params['p'] = p_value
        #https://github.com/hyperopt/hyperopt-sklearn/blob/fd718c44fc440bd6e2718ec1442b1af58cafcb18/hpsklearn/components.py#L302
        clf = KNeighborsClassifier(**params)
    elif(classifier == 'DTC'):        
        clf = DecisionTreeClassifier(random_state = p_random_state, **params)
    elif(classifier == 'LR'):        
        penalty_solver = params.pop('penalty_solver')
        params['penalty'] = penalty_solver.split("+")[0]
        params['solver'] = penalty_solver.split("+")[1]
        clf = LogisticRegression(random_state = p_random_state, **params)
    #resampling parameter
    p_sub_params= params_org.pop('sub')
    p_sub_type = p_sub_params.pop('type')
    sampler = p_sub_params.pop('smo_grp')
    gmean = []
    if (p_sub_type == 'SMOTE'):
        smo = SMOTE(**p_sub_params)
    elif (p_sub_type == 'ADASYN'):
        smo = ADASYN(**p_sub_params)
    elif (p_sub_type == 'BorderlineSMOTE'):
        smo = BorderlineSMOTE(**p_sub_params)
    elif (p_sub_type == 'SVMSMOTE'):
        smo = SVMSMOTE(**p_sub_params)
    elif (p_sub_type == 'SMOTENC'):
        smo = SMOTENC(**p_sub_params)
    elif (p_sub_type == 'KMeansSMOTE'):
        smo = KMeansSMOTE(**p_sub_params)
    elif (p_sub_type == 'RandomOverSampler'):
        smo = RandomOverSampler(**p_sub_params)
#Undersampling
    elif (p_sub_type == 'TomekLinks'):
        smo = TomekLinks(**p_sub_params)
    elif (p_sub_type == 'ClusterCentroids'):
        if(p_sub_params['estimator']=='KMeans'):
            p_sub_params['estimator']= KMeans(random_state = p_random_state)
        elif(p_sub_params['estimator']=='MiniBatchKMeans'):
            p_sub_params['estimator']= MiniBatchKMeans(random_state = p_random_state)
        smo = ClusterCentroids(**p_sub_params) 
    elif (p_sub_type == 'RandomUnderSampler'):
        smo = RandomUnderSampler(**p_sub_params)
    elif (p_sub_type == 'NearMiss'):
        smo = NearMiss(**p_sub_params)
    elif (p_sub_type == 'InstanceHardnessThreshold'):
        if(p_sub_params['estimator']=='knn'):
            p_sub_params['estimator']= KNeighborsClassifier()
        elif(p_sub_params['estimator']=='decision-tree'):
            p_sub_params['estimator']=DecisionTreeClassifier()
        elif(p_sub_params['estimator']=='adaboost'):
            p_sub_params['estimator']=AdaBoostClassifier()
        elif(p_sub_params['estimator']=='gradient-boosting'):
            p_sub_params['estimator']=GradientBoostingClassifier()
        elif(p_sub_params['estimator']=='linear-svm'):
            p_sub_params['estimator']=CalibratedClassifierCV(LinearSVC())
        elif(p_sub_params['estimator']=='random-forest'):
            p_sub_params['estimator']=RandomForestClassifier(n_estimators=100)
        smo = InstanceHardnessThreshold(**p_sub_params) 
    elif (p_sub_type == 'CondensedNearestNeighbour'):
        smo = CondensedNearestNeighbour(**p_sub_params)
    elif (p_sub_type == 'EditedNearestNeighbours'):
        smo = EditedNearestNeighbours(**p_sub_params)
    elif (p_sub_type == 'RepeatedEditedNearestNeighbours'):
        smo = RepeatedEditedNearestNeighbours(**p_sub_params) 
    elif (p_sub_type == 'AllKNN'):
        smo = AllKNN(**p_sub_params)
    elif (p_sub_type == 'NeighbourhoodCleaningRule'):
        smo = NeighbourhoodCleaningRule(**p_sub_params) 
    elif (p_sub_type == 'OneSidedSelection'):
        smo = OneSidedSelection(**p_sub_params)
#Combine
    elif (p_sub_type == 'SMOTEENN'):
        smo = SMOTEENN(**p_sub_params)
    elif (p_sub_type == 'SMOTETomek'):
        smo = SMOTETomek(**p_sub_params)
    e=''
    try:        
        for train, test in cv.split(X, y):
            if(p_sub_type=='NO'):
                X_smo_train, y_smo_train = X[train], y[train]
            else:
                X_smo_train, y_smo_train = smo.fit_sample(X[train], y[train])
            y_test_pred = clf.fit(X_smo_train, y_smo_train).predict(X[test])
            gm = geometric_mean_score(y[test], y_test_pred, average='binary')
            gmean.append(gm)
        mean_g=np.mean(gmean)
    except Exception as eec:
        e=eec
        mean_g = 0
        ifError =1 
        errorcount = errorcount+1
    gm_loss = 1 - mean_g
    abc=time.time()-starttime
    if mean_g > best:
        best = mean_g
        params_best = copy.deepcopy(parambk)
    return {'loss': gm_loss,
            'mean': mean_g,
            'status': STATUS_OK,         
            # -- store other results like this
            'run_time': abc,
            'iter': iid,
            'current_best': best,
            'eval_time': time.time(),            
            'SamplingGrp': sampler,
            'SamplingType': p_sub_type,
            'ifError': ifError,
            'Error': e,
            'params' : parambk,
            'attachments':
                {'time_module': pickle.dumps(time.time)}
           }   
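The dict returned above follows hyperopt's convention (STATUS_OK plus extra bookkeeping fields), so fscore is presumably minimized with hyperopt's fmin. A minimal, hedged usage sketch; search_space is an assumed hyperopt space (hp.choice over the 'classifier' and 'sub' sub-dictionaries whose keys fscore pops above), and max_evals is arbitrary:

from hyperopt import Trials, fmin, tpe

trials = Trials()
best_trial = fmin(fn=fscore,           # the objective defined above
                  space=search_space,  # assumed hyperopt search space
                  algo=tpe.suggest,    # Tree-structured Parzen Estimator
                  max_evals=100,
                  trials=trials)
print(best_trial, params_best)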
Example #42
0
    #clf = ensemble.RandomForestClassifier(**paramsRF)

    clf = linear_model.LogisticRegressionCV(Cs=4,
                                            solver='liblinear',
                                            max_iter=1000,
                                            tol=1e-5,
                                            scoring='neg_log_loss')
    #clf = GaussianNB()
    #clf = linear_model.ElasticNetCV(l1_ratio=0)
    #clf = GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True)
    #clf=linear_model.LassoLarsIC(criterion='aic')

    clf.fit(X_train, y_train)

    sig_clf = CalibratedClassifierCV(clf, method="sigmoid", cv='prefit')
    sig_clf.fit(X_valid, y_valid)

    df_out.loc[test, 'prob1'] = sig_clf.predict_proba(X_test)[:, 0]
    df_out.loc[test, 'prob2'] = sig_clf.predict_proba(X_test)[:, 1]

df_out.loc[:, 'log_loss1'] = df_out.loc[:, 'result'] * np.log(df_out.loc[:, 'prob1']) \
                             + (1 - df_out.loc[:, 'result']) * np.log((1 - df_out.loc[:, 'prob1']))

df_out.loc[:, 'log_loss2'] = df_out.loc[:, 'result'] * np.log(df_out.loc[:, 'prob2']) \
                             + (1 - df_out.loc[:, 'result']) * np.log((1 - df_out.loc[:, 'prob2']))

log_loss_1 = -df_out.loc[:, 'log_loss1'].sum() / len(df_out)
log_loss_2 = -df_out.loc[:, 'log_loss2'].sum() / len(df_out)

print(log_loss_1, log_loss_2)
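The hand-rolled, column-wise log loss above can be cross-checked with sklearn.metrics.log_loss; a small hedged sketch, assuming the 'result' column holds the 0/1 outcomes and 'prob2' is the predicted probability of the positive class:

from sklearn.metrics import log_loss

# should agree (up to probability clipping) with log_loss_2 computed above
print(log_loss(df_out['result'], df_out['prob2']))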
Example #43
0
def genTrainData(rule='部门名称',
                 mode='thulac',
                 role='AGENT',
                 feature_type='TFIDF',
                 ngram_range=(1, 3),
                 _min=2,
                 _max=0.9,
                 _range=(0.4, 0.9),
                 max_features=10000):
    print('This is the classification task on {}'.format(rule))
    data = fenci(rule, mode)
    if role not in 'AGENT USER':
        data['sentenceList'] = data['sentenceList'].apply(str).apply(eval) \
            .apply(lambda x: ' '.join([i['content'] for i in x]))
    else:
        data['sentenceList'] = data['sentenceList'].apply(str).apply(eval)\
        .apply(lambda x: ' '.join([i['content'] for i in x if i['role'] == role]))
    data['label'] = data['label'].apply(str).apply(eval)
    # data.columns = ['UUID','sentenceList','label']
    # test_data = pd.read_csv('{}/{}_test.csv'.format(sample_path, rule))
    test_data = data.tail(int(0.2 * len(data.index)))
    _test = data.set_index('UUID').loc[test_data['UUID']]
    assert _test.shape[0] == test_data.shape[0]
    _train = data.set_index('UUID').drop(test_data['UUID'])
    assert _train.shape[0] + _test.shape[0] == data.shape[0]
    del (data, test_data)
    print('train/test:{}/{}'.format(_train.shape[0], _test.shape[0]))
    BDC_DF = select_Feature(_train['sentenceList'],
                            _train['label'],
                            _min=_min,
                            ngram_range=ngram_range,
                            _max=_max,
                            _range=_range,
                            max_features=max_features)
    _vocab = {j: i for i, j in enumerate(BDC_DF.index)}
    del (BDC_DF)
    if _test.shape[0] == 0:
        return
    # _vec = TfidfVectorizer(ngram_range=(1, 3), max_df=0.8, min_df=3, max_features=int(0.6*len(_vocab.keys())))
    # _vec.fit(_train['sentenceList'])
    # _vocab = {j:i for i,j in enumerate(set(_vocab.keys()).union(set(_vec.vocabulary_)))}
    _vec = TfidfVectorizer(vocabulary=_vocab)
    _vec.fit(_train['sentenceList'])
    assert _vec.vocabulary_ == _vocab
    print('Feature dimension:', len(_vocab.keys()))
    train_csr = _vec.transform(_train['sentenceList'])
    test_csr = _vec.transform(_test['sentenceList'])
    _train.drop('sentenceList', axis=1, inplace=True)
    _test.drop('sentenceList', axis=1, inplace=True)
    _train = _train.reset_index().drop('index', axis=1)
    # model = CalibratedClassifierCV(svm.LinearSVC(random_state=2018))
    model = CalibratedClassifierCV(
        lgb.LGBMClassifier(metric='auc', learning_rate=0.02))
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2018)
    models = []
    y_preds, y_trues = [], []
    for train, test in skf.split(range(len(_train.index)), _train['label']):
        y_pred = 0
        # clone the calibrated model so each fold keeps its own fitted copy;
        # appending the same object would leave `models` holding five
        # references to the last fold's fit (needs `from sklearn.base import clone`)
        fold_model = clone(model)
        fold_model.fit(train_csr[train], _train.iloc[train]['label'])
        models.append(fold_model)
        y_pred += np.array(fold_model.predict_proba(train_csr[test]))[:, 1]
        y_preds.extend(np.array(y_pred).round())
        y_trues.extend(_train.iloc[test]['label'])
    print('5-fold cross-validation results:')
    printMark(y_trues, y_preds)
    del (y_trues, y_preds, train, test)
    y_pred = 0
    for i in range(len(models)):
        y_pred += np.array(models[i].predict_proba(test_csr))[:,
                                                              1] / len(models)
    print('Test-set results:')
    save_errorcase(_test.index, _test['label'], y_pred, rule)
    printMark(_test['label'], y_pred.round())
Example #44
0
def test_calibration():
    """Test calibration objects with isotonic and sigmoid"""
    n_samples = 100
    X, y = make_classification(n_samples=2 * n_samples, n_features=6,
                               random_state=42)
    sample_weight = np.random.RandomState(seed=42).uniform(size=y.size)

    X -= X.min()  # MultinomialNB only allows positive X

    # split train and test
    X_train, y_train, sw_train = \
        X[:n_samples], y[:n_samples], sample_weight[:n_samples]
    X_test, y_test = X[n_samples:], y[n_samples:]

    # Naive-Bayes
    clf = MultinomialNB().fit(X_train, y_train, sample_weight=sw_train)
    prob_pos_clf = clf.predict_proba(X_test)[:, 1]

    pc_clf = CalibratedClassifierCV(clf, cv=y.size + 1)
    assert_raises(ValueError, pc_clf.fit, X, y)

    # Naive Bayes with calibration
    for this_X_train, this_X_test in [(X_train, X_test),
                                      (sparse.csr_matrix(X_train),
                                       sparse.csr_matrix(X_test))]:
        for method in ['isotonic', 'sigmoid']:
            pc_clf = CalibratedClassifierCV(clf, method=method, cv=2)
            # Note that this fit overwrites the fit on the entire training
            # set
            pc_clf.fit(this_X_train, y_train, sample_weight=sw_train)
            prob_pos_pc_clf = pc_clf.predict_proba(this_X_test)[:, 1]

            # Check that brier score has improved after calibration
            assert_greater(brier_score_loss(y_test, prob_pos_clf),
                           brier_score_loss(y_test, prob_pos_pc_clf))

            # Check invariance against relabeling [0, 1] -> [1, 2]
            pc_clf.fit(this_X_train, y_train + 1, sample_weight=sw_train)
            prob_pos_pc_clf_relabeled = pc_clf.predict_proba(this_X_test)[:, 1]
            assert_array_almost_equal(prob_pos_pc_clf,
                                      prob_pos_pc_clf_relabeled)

            # Check invariance against relabeling [0, 1] -> [-1, 1]
            pc_clf.fit(this_X_train, 2 * y_train - 1, sample_weight=sw_train)
            prob_pos_pc_clf_relabeled = pc_clf.predict_proba(this_X_test)[:, 1]
            assert_array_almost_equal(prob_pos_pc_clf,
                                      prob_pos_pc_clf_relabeled)

            # Check invariance against relabeling [0, 1] -> [1, 0]
            pc_clf.fit(this_X_train, (y_train + 1) % 2,
                       sample_weight=sw_train)
            prob_pos_pc_clf_relabeled = \
                pc_clf.predict_proba(this_X_test)[:, 1]
            if method == "sigmoid":
                assert_array_almost_equal(prob_pos_pc_clf,
                                          1 - prob_pos_pc_clf_relabeled)
            else:
                # Isotonic calibration is not invariant against relabeling
                # but should improve in both cases
                assert_greater(brier_score_loss(y_test, prob_pos_clf),
                               brier_score_loss((y_test + 1) % 2,
                                                prob_pos_pc_clf_relabeled))

        # check that calibration can also deal with regressors that have
        # a decision_function
        clf_base_regressor = CalibratedClassifierCV(Ridge())
        clf_base_regressor.fit(X_train, y_train)
        clf_base_regressor.predict(X_test)

        # Check failure cases:
        # only "isotonic" and "sigmoid" should be accepted as methods
        clf_invalid_method = CalibratedClassifierCV(clf, method="foo")
        assert_raises(ValueError, clf_invalid_method.fit, X_train, y_train)

        # base-estimators should provide either decision_function or
        # predict_proba (most regressors, for instance, should fail)
        clf_base_regressor = \
            CalibratedClassifierCV(RandomForestRegressor(), method="sigmoid")
        assert_raises(RuntimeError, clf_base_regressor.fit, X_train, y_train)
clf = ExtraTreesClassifier(random_state=1729,
                           bootstrap=True,
                           class_weight="balanced")
selector = clf.fit(normalize(X), y)
# clf.feature_importances_
fs = SelectFromModel(selector, prefit=True)

X = fs.transform(X)
test = fs.transform(test)

print(X.shape, test.shape)

#m2_xgb = xgb.XGBClassifier(n_estimators=110, nthread=-1, max_depth = 4, \
#seed=1729)
m2_xgb = xgb.XGBClassifier(missing=np.nan,
                           max_depth=6,
                           n_estimators=350,
                           learning_rate=0.025,
                           nthread=4,
                           subsample=0.95,
                           colsample_bytree=0.85,
                           seed=4242)
metLearn = CalibratedClassifierCV(m2_xgb, method='isotonic', cv=10)
metLearn.fit(X, y)

## # Submission
probs = metLearn.predict_proba(test)

submission = pd.DataFrame({"ID": test_id, "TARGET": probs[:, 1]})
submission.to_csv("submission.csv", index=False)
# Generate the train set with the rest of the data.
train_data = data[test_cutoff:]
train_label = labels[test_cutoff:]

#KNN Classifier
neigh = KNeighborsClassifier(n_neighbors=5,
                             algorithm='auto',
                             metric='minkowski',
                             p=1)
neigh.fit(train_data, train_label)
predictions_knn = neigh.predict(test_data)

#SVM Classifier
svc = svm.LinearSVC(random_state=0)
svc = OneVsRestClassifier(svc)
clf = CalibratedClassifierCV(svc, cv=10)
clf.fit(train_data, train_label)
predictions_svm = clf.predict(test_data)

#Decision Tree Classifier
clf = tree.DecisionTreeClassifier()
clf = clf.fit(train_data, train_label)
predictions_decision = clf.predict(test_data)

#Neural Network Classifier
clf = MLPClassifier(solver='lbfgs',
                    alpha=1e-5,
                    hidden_layer_sizes=(100, ),
                    random_state=1,
                    activation='tanh')
clf.fit(train_data, train_label)
Example #47
0
score = metrics.log_loss(my_test["OutcomeType"], probs)
print(score)

alg10 = GaussianNB()
alg10.fit(train[features], train["OutcomeType"])
probs = alg10.predict_proba(my_test[features])
score = metrics.log_loss(my_test["OutcomeType"], probs)
print(score)

alg2 = RandomForestClassifier()
alg2.fit(train[features], train["OutcomeType"])
probs = alg2.predict_proba(my_test[features])
score = metrics.log_loss(my_test["OutcomeType"], probs)
print(score)

alg7 = CalibratedClassifierCV()  # no base estimator given, so this calibrates the default LinearSVC
alg7.fit(train[features], train["OutcomeType"])
probs = alg7.predict_proba(my_test[features])
score = metrics.log_loss(my_test["OutcomeType"], probs)
print(score)

alg6 = DecisionTreeClassifier()
alg6.fit(train[features], train["OutcomeType"])
probs = alg6.predict_proba(my_test[features])
score = metrics.log_loss(my_test["OutcomeType"], probs)
print(score)

alg4 = AdaBoostClassifier()
alg4.fit(train[features], train["OutcomeType"])
probs = alg4.predict_proba(my_test[features])
score = metrics.log_loss(my_test["OutcomeType"], probs)
Example #48
0
def test_calibration_multiclass(method, ensemble, seed):
    def multiclass_brier(y_true, proba_pred, n_classes):
        Y_onehot = np.eye(n_classes)[y_true]
        return np.sum((Y_onehot - proba_pred)**2) / Y_onehot.shape[0]

    # Test calibration for multiclass with classifier that implements
    # only decision function.
    clf = LinearSVC(random_state=7)
    X, y = make_blobs(n_samples=500,
                      n_features=100,
                      random_state=seed,
                      centers=10,
                      cluster_std=15.0)

    # Use an unbalanced dataset by collapsing 8 clusters into one class
    # to make the naive calibration based on a softmax more unlikely
    # to work.
    y[y > 2] = 2
    n_classes = np.unique(y).shape[0]
    X_train, y_train = X[::2], y[::2]
    X_test, y_test = X[1::2], y[1::2]

    clf.fit(X_train, y_train)

    cal_clf = CalibratedClassifierCV(clf,
                                     method=method,
                                     cv=5,
                                     ensemble=ensemble)
    cal_clf.fit(X_train, y_train)
    probas = cal_clf.predict_proba(X_test)
    # Check probabilities sum to 1
    assert_allclose(np.sum(probas, axis=1), np.ones(len(X_test)))

    # Check that the dataset is not too trivial, otherwise it's hard
    # to get interesting calibration data during the internal
    # cross-validation loop.
    assert 0.65 < clf.score(X_test, y_test) < 0.95

    # Check that the accuracy of the calibrated model is never degraded
    # too much compared to the original classifier.
    assert cal_clf.score(X_test, y_test) > 0.95 * clf.score(X_test, y_test)

    # Check that Brier loss of calibrated classifier is smaller than
    # loss obtained by naively turning OvR decision function to
    # probabilities via a softmax
    uncalibrated_brier = \
        multiclass_brier(y_test, softmax(clf.decision_function(X_test)),
                         n_classes=n_classes)
    calibrated_brier = multiclass_brier(y_test, probas, n_classes=n_classes)

    assert calibrated_brier < 1.1 * uncalibrated_brier

    # Test that calibration of a multiclass classifier decreases log-loss
    # for RandomForestClassifier
    clf = RandomForestClassifier(n_estimators=30, random_state=42)
    clf.fit(X_train, y_train)
    clf_probs = clf.predict_proba(X_test)
    uncalibrated_brier = multiclass_brier(y_test,
                                          clf_probs,
                                          n_classes=n_classes)

    cal_clf = CalibratedClassifierCV(clf,
                                     method=method,
                                     cv=5,
                                     ensemble=ensemble)
    cal_clf.fit(X_train, y_train)
    cal_clf_probs = cal_clf.predict_proba(X_test)
    calibrated_brier = multiclass_brier(y_test,
                                        cal_clf_probs,
                                        n_classes=n_classes)
    assert calibrated_brier < 1.1 * uncalibrated_brier
Example #49
0
    vec = TfidfVectorizer(ngram_range=(1, wins), min_df=3, max_df=0.9,
                          use_idf=1, smooth_idf=1, sublinear_tf=1)
    
    kfold_x_train = train[column][train_index]
    kfold_x_valid = train[column][test_index]
    k_y_train = (train['class']-1).astype(int)[train_index]
    k_y_valid = (train['class']-1).astype(int)[test_index]

    print('Extracting TF-IDF features')
    k_trn_term_doc = vec.fit_transform(kfold_x_train)
    k_test_term_doc = vec.transform(kfold_x_valid)
    test_term_doc = vec.transform(test[column])

    # Fit the model
    print('Fitting the model')
    lin_clf = svm.LinearSVC()
    lin_clf = CalibratedClassifierCV(lin_clf)
    lin_clf.fit(k_trn_term_doc, k_y_train)

    # Predict
    print('Predicting')
    oof_predict[test_index] += lin_clf.predict_proba(k_test_term_doc) / n_folds
    predict += lin_clf.predict_proba(test_term_doc) / n_folds

    # Compute accuracy
    p_l = []
    for row in oof_predict[test_index]:
        # each row is already an array of class probabilities, so take the
        # argmax directly (eval() on an ndarray would raise a TypeError)
        p_l.append(np.argmax(row))

    accuracy = accuracy_score(p_l, k_y_valid.values)
    # multiclass labels need an explicit averaging scheme for F1
    f1 = f1_score(p_l, k_y_valid.values, average='macro')
    
import pickle
from sklearn.preprocessing import normalize
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
import numpy as np

iter = 5

with open('ListOfBestParamsRS.pkl', 'rb') as f:
    best_params = pickle.load(f)

path = "C://Users//Arushi//PycharmProjects//ThesisChap2//ClusteringBuckets//"

for i in range(iter):
    X_train = np.load(path + 'final_train_binarydata_' + str(i) + '.npy')
    Y_train = np.load(path + 'final_train_labels_' + str(i) + '.npy')
    bp = best_params[i]

    X_train = X_train.astype('float')
    X_train = normalize(X_train)
    Y_train = Y_train.astype('float')
    Y_train = Y_train.astype(int)

    clf = LinearSVC(C=bp['C'], max_iter=10000, tol=1e-4)
    clf_sigmoid = CalibratedClassifierCV(clf, cv=4, method='sigmoid').fit(
        X_train, Y_train.ravel())

    with open('Model_ism_linear' + str(i) + '.pkl', 'wb') as f:
        pickle.dump(clf_sigmoid, f)
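A hedged follow-up sketch showing how one of the pickled calibrated models could be reused later; the file name follows the pattern written above, and X_new is a purely hypothetical matrix of new samples that would need the same preprocessing as the training data:

with open('Model_ism_linear0.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)

# X_new is hypothetical; normalize it exactly as the training folds were
probs = loaded_clf.predict_proba(normalize(X_new.astype('float')))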
Example #51
0
        'ngram_range': (1, 2)
    }, {
        'max_features': 3200,
        'ngram_range': (1, 1)
    }, {
        'max_features': 3200,
        'ngram_range': (1, 1)
    }, {
        'max_features': 3200,
        'ngram_range': (1, 1)
    }]

    models = [
        LogisticRegression(**model_params[0]),
        RandomForestClassifier(**model_params[1]),
        CalibratedClassifierCV(
            base_estimator=SGDClassifier(**model_params[2], max_iter=250)),
        MultinomialNB(**model_params[3])
    ]

    dataframe = pd.read_table(file)
    col = ['tweetID', 'text', 'relevant']
    df = dataframe[col]
    print(df.info())
    x = df.text
    y = df.relevant

    X_train, X_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.1,
                                                        random_state=42,
                                                        stratify=y)
kf = KFold(n_splits=5)
fold_count = 0
for train_index, test_index in kf.split(one_hot_encoding_df):
    print("Training Data: %d, Testing Data: %d" % (len(train_index), len(test_index)))
    train_X = one_hot_encoding_df.iloc[train_index, one_hot_encoding_df.columns != 'Attrition']
    train_y = one_hot_encoding_df.iloc[train_index]["Attrition"]
    test_X = one_hot_encoding_df.iloc[test_index, one_hot_encoding_df.columns != 'Attrition']
    test_y = one_hot_encoding_df.iloc[test_index]["Attrition"]


    ## model
    #clf = GaussianNB()
    
    clf = DecisionTreeClassifier(max_depth=2, min_samples_split=20, min_samples_leaf=20)
    model_isotonic = CalibratedClassifierCV(clf, cv=10, method='isotonic')
    #model_sigmoid = CalibratedClassifierCV(clf, cv=4, method='sigmoid')
    model = model_isotonic.fit(train_X, train_y)

    test_predict = model.predict(test_X)
    #avg_feature_importance.append(model.feature_importances_)
    
    acc, precision, recall, f1, matrix = evaluation(test_y, test_predict)
    
    print("Fold: %d, Accuracy: %f, Precision: %f, Recall: %f, F1: %f" % (fold_count + 1, round(acc, 3), round(precision, 3), round(recall, 3), round(f1, 3)))
    avg_acc += acc
    avg_precision += precision
    avg_recall += recall
    avg_f1 += f1
    avg_confusion_matrix.append(matrix)
    fold_count += 1
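The totals accumulated above are presumably averaged once the loop finishes; a minimal sketch using the accumulators and fold_count defined in this snippet:

print("Average over %d folds -- accuracy: %.3f, precision: %.3f, recall: %.3f, F1: %.3f"
      % (fold_count, avg_acc / fold_count, avg_precision / fold_count,
         avg_recall / fold_count, avg_f1 / fold_count))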
# default parameters
# SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=None, tol=None,
# shuffle=True, verbose=0, epsilon=0.1, n_jobs=1, random_state=None, learning_rate='optimal', eta0=0.0, power_t=0.5,
# class_weight=None, warm_start=False, average=False, n_iter=None)

# some of methods
# fit(X, y[, coef_init, intercept_init, …])	Fit linear model with Stochastic Gradient Descent.
# predict(X)	Predict class labels for samples in X.

###############################################################################
log_error_array = []
for i in alpha:
    clf = SGDClassifier(alpha=i, penalty='l1', loss='hinge', random_state=42)
    clf.fit(X_train, y_train)
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(X_train, y_train)
    predict_y = sig_clf.predict_proba(X_test)
    log_error_array.append(
        log_loss(y_test, predict_y, labels=clf.classes_, eps=1e-15))
    print('For values of alpha = ', i, "The log loss is:",
          log_loss(y_test, predict_y, labels=clf.classes_, eps=1e-15))

###############################################################################
fig, ax = plt.subplots()
ax.plot(alpha, log_error_array, c='g')
for i, txt in enumerate(np.round(log_error_array, 3)):
    ax.annotate((alpha[i], np.round(txt, 3)), (alpha[i], log_error_array[i]))

plt.grid()
plt.title("Cross Validation Error for each alpha")
Example #54
0
roc_accuracy_8 = 0
p1_8 = 0
p0_8 = 0
accuracy_9per = 0
apc_9per = 0
roc_accuracy_9per = 0
p1_9per = 0
p0_9per = 0
accuracy_9link = 0
apc_9link = 0
roc_accuracy_9link = 0
p1_9link = 0
p0_9link = 0
lin_clf = svm.LinearSVC(class_weight='balanced', dual=False, max_iter=100,
                        random_state=42949694)
mci = 0
clf = CalibratedClassifierCV(lin_clf)
for i in range(0,10):
    for j in range(0,5):
        train = pd.read_csv('fb'+str(i)+'train'+str(j)+'class_persistency.csv',delimiter='\t')
        test = pd.read_csv('fb'+str(i)+'test'+str(j)+'class_persistency.csv',delimiter='\t')
        Y_tr = train['status_class'].values
        del train['user1']
        del train['user2']
        del train['time']
        del train['status']
##        del train['class']
        del train['status_class']
        X_tr = train.values;
        Y_te_st = test['status_class'].values;
        Y_te_per = test['class_per'].values
        Y_te_li = test['class'].values
LR1 = LogisticRegression(penalty='l1', tol=0.01)
LR2 = LogisticRegression(penalty='l2', tol=0.01)
DT = DecisionTreeClassifier(random_state=0, max_depth=15, min_samples_leaf=2)
RF = RandomForestClassifier(max_depth=10, min_samples_split=2, n_estimators=100, random_state=1, verbose=True)
NN40 = MLPClassifier(solver='adam', alpha=1e-4, hidden_layer_sizes=(40,), random_state=1, activation='relu',
                   verbose=True, max_iter=20)

NN1600 = MLPClassifier(solver='adam', alpha=1e-4, hidden_layer_sizes=(1600,), random_state=1, activation='relu',
                   verbose=True, max_iter=20)

MLPclf = MLPClassifier(activation='relu', learning_rate='constant',
                       alpha=1e-4, hidden_layer_sizes=(80, 40), random_state=1, batch_size=1, verbose=False,
                       max_iter=20, warm_start=True)

clf = xgb.XGBClassifier()
metLearn = CalibratedClassifierCV(clf, method='isotonic', cv=2)

leanerSVML1 = LinearSVC(penalty='l1', loss='squared_hinge', dual=False,
                        random_state=0)
leanerSVML2 = LinearSVC(penalty='l2', loss='hinge', dual=True, random_state=0)

clf = svm.SVC(probability=True, verbose=True)

eclf1 = VotingClassifier(estimators=[('lr2', LR2), ('leanerSVML2', leanerSVML2), ('DT', DT)], voting='hard')

kf = KFold(n_splits=10, random_state=None, shuffle=False)


X = x_train.values
y = y_train.values
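The snippet stops right after assembling the data arrays; a hedged sketch of how the 10-fold split defined above might evaluate one of these models (here the calibrated XGBoost metLearn; the accuracy metric is an assumption):

from sklearn.metrics import accuracy_score

fold_scores = []
for train_index, test_index in kf.split(X):
    metLearn.fit(X[train_index], y[train_index])
    fold_preds = metLearn.predict(X[test_index])
    fold_scores.append(accuracy_score(y[test_index], fold_preds))
print("Mean 10-fold accuracy:", sum(fold_scores) / len(fold_scores))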
Example #56
0
def test_calibration(data, method, ensemble):
    # Test calibration objects with isotonic and sigmoid
    n_samples = 100
    X, y = data
    sample_weight = np.random.RandomState(seed=42).uniform(size=y.size)

    X -= X.min()  # MultinomialNB only allows positive X

    # split train and test
    X_train, y_train, sw_train = \
        X[:n_samples], y[:n_samples], sample_weight[:n_samples]
    X_test, y_test = X[n_samples:], y[n_samples:]

    # Naive-Bayes
    clf = MultinomialNB().fit(X_train, y_train, sample_weight=sw_train)
    prob_pos_clf = clf.predict_proba(X_test)[:, 1]

    cal_clf = CalibratedClassifierCV(clf, cv=y.size + 1, ensemble=ensemble)
    with pytest.raises(ValueError):
        cal_clf.fit(X, y)

    # Naive Bayes with calibration
    for this_X_train, this_X_test in [(X_train, X_test),
                                      (sparse.csr_matrix(X_train),
                                       sparse.csr_matrix(X_test))]:
        cal_clf = CalibratedClassifierCV(clf,
                                         method=method,
                                         cv=5,
                                         ensemble=ensemble)
        # Note that this fit overwrites the fit on the entire training
        # set
        cal_clf.fit(this_X_train, y_train, sample_weight=sw_train)
        prob_pos_cal_clf = cal_clf.predict_proba(this_X_test)[:, 1]

        # Check that brier score has improved after calibration
        assert (brier_score_loss(y_test, prob_pos_clf) > brier_score_loss(
            y_test, prob_pos_cal_clf))

        # Check invariance against relabeling [0, 1] -> [1, 2]
        cal_clf.fit(this_X_train, y_train + 1, sample_weight=sw_train)
        prob_pos_cal_clf_relabeled = cal_clf.predict_proba(this_X_test)[:, 1]
        assert_array_almost_equal(prob_pos_cal_clf, prob_pos_cal_clf_relabeled)

        # Check invariance against relabeling [0, 1] -> [-1, 1]
        cal_clf.fit(this_X_train, 2 * y_train - 1, sample_weight=sw_train)
        prob_pos_cal_clf_relabeled = cal_clf.predict_proba(this_X_test)[:, 1]
        assert_array_almost_equal(prob_pos_cal_clf, prob_pos_cal_clf_relabeled)

        # Check invariance against relabeling [0, 1] -> [1, 0]
        cal_clf.fit(this_X_train, (y_train + 1) % 2, sample_weight=sw_train)
        prob_pos_cal_clf_relabeled = cal_clf.predict_proba(this_X_test)[:, 1]
        if method == "sigmoid":
            assert_array_almost_equal(prob_pos_cal_clf,
                                      1 - prob_pos_cal_clf_relabeled)
        else:
            # Isotonic calibration is not invariant against relabeling
            # but should improve in both cases
            assert (brier_score_loss(y_test, prob_pos_clf) > brier_score_loss(
                (y_test + 1) % 2, prob_pos_cal_clf_relabeled))
        dim=obd.dim_context,
        random_state=random_state,
    )
    if counterfactual_policy != "logistic_ts":
        kwargs["epsilon"] = epsilon
    policy = counterfactual_policy_dict[counterfactual_policy](**kwargs)
    policy_name = f"{policy.policy_name}_{context_set}"

    # obtain batch logged bandit feedback generated by behavior policy
    bandit_feedback = obd.obtain_batch_bandit_feedback()
    # ground-truth policy value of the random policy
    # , which is the empirical mean of the factual (observed) rewards (on-policy estimation)
    ground_truth = bandit_feedback["reward"].mean()

    # a base ML model for regression model used in Direct Method and Doubly Robust
    base_model = CalibratedClassifierCV(
        HistGradientBoostingClassifier(**hyperparams))
    # run a counterfactual bandit algorithm on logged bandit feedback data
    selected_actions = run_bandit_simulation(bandit_feedback=bandit_feedback,
                                             policy=policy)
    # estimate the policy value of a given counterfactual algorithm by the three OPE estimators.
    ope = OffPolicyEvaluation(
        bandit_feedback=bandit_feedback,
        regression_model=RegressionModel(base_model=base_model),
        action_context=obd.action_context,
        ope_estimators=[
            InverseProbabilityWeighting(),
            DirectMethod(),
            DoublyRobust()
        ],
    )
    estimated_policy_value, estimated_interval = ope.summarize_off_policy_estimates(
Example #58
0
clf_probs = clf.predict_proba(X_test)
score = log_loss(y_test, clf_probs)

clf = DecisionTreeClassifier(
    criterion='entropy',
    min_samples_split=5,
    max_depth=40,
    max_features=30,
    random_state=2602,
)

# Train the decision tree classifier, calibrate it with a sigmoid (cv="prefit")
# and evaluate on test data
clf.fit(X_train, y_train)
clf_probs = clf.predict_proba(X_test)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid", cv="prefit")
sig_clf.fit(train, target)
sig_clf_probs = sig_clf.predict_proba(X_test)
sig_score = log_loss(y_test, sig_clf_probs)

print('\n-----------------------')
print('  logloss train: %.5f' % score)
print('  logloss valid: %.5f' % sig_score)
print('-----------------------')

# param_grid = {
#     'n_estimators': [10],
#     'max_features': ['auto', 2, 30],
#     'min_samples_leaf': [2, 8],
#     'max_leaf_nodes': [2, 8],
#     'min_samples_split': [2, 5],
X_train_valid, y_train_valid = X[:800], y[:800]
X_test, y_test = X[800:], y[800:]

# Train uncalibrated random forest classifier on whole train and validation
# data and evaluate on test data
clf = RandomForestClassifier(n_estimators=25)
clf.fit(X_train_valid, y_train_valid)
clf_probs = clf.predict_proba(X_test)
score = log_loss(y_test, clf_probs)

# Train random forest classifier, calibrate on validation data and evaluate
# on test data
clf = RandomForestClassifier(n_estimators=25)
clf.fit(X_train, y_train)
clf_probs = clf.predict_proba(X_test)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid", cv="prefit")
sig_clf.fit(X_valid, y_valid)
sig_clf_probs = sig_clf.predict_proba(X_test)
sig_score = log_loss(y_test, sig_clf_probs)

# Plot changes in predicted probabilities via arrows
plt.figure()
colors = ["r", "g", "b"]
for i in range(clf_probs.shape[0]):
    plt.arrow(clf_probs[i, 0],
              clf_probs[i, 1],
              sig_clf_probs[i, 0] - clf_probs[i, 0],
              sig_clf_probs[i, 1] - clf_probs[i, 1],
              color=colors[y_test[i]],
              head_width=1e-2)
Example #60
0
	def BuildModel_Apply_Performance(df, clf, cv_num, df_notSel, apply_unk,
		df_unknowns, test_df, classes, POS, NEG, j, ALG, THRSHD_test, save):
		from sklearn.model_selection import cross_val_predict

		# Data from balanced dataframe
		y = df['Class']
		X = df.drop(['Class'], axis=1)

		# For LinearSVM need to have calibrated classifier to get probability
		# scores, but not for importance scores
		if ALG.lower() == 'svm':
			from sklearn.calibration import CalibratedClassifierCV
			clf2 = clf
			clf2.fit(X,y)
			# adds the probability output to linearSVC
			clf = CalibratedClassifierCV(clf, cv=3)
		else:
			clf2 = 'pass'

		# Obtain the predictions using 10 fold cross validation
		# (uses KFold cv by default):
		cv_proba = cross_val_predict(estimator=clf, X=X, y=y, cv=int(cv_num),
			method='predict_proba')
		cv_pred = cross_val_predict(estimator=clf, X=X, y=y, cv=cv_num)

		# Fit a model using all data and apply to
		# (1) instances that were not selected using cl_train
		# (2) instances with unknown class
		# (3) test instances
		clf.fit(X,y)

		# Save model for future persistence
		print(f'\nSaving model as {save+".joblib"}\n')
		dump(clf, save+'.joblib')

		notSel_proba = clf.predict_proba(df_notSel.drop(['Class'], axis=1))
		if apply_unk == True:
			unk_proba = clf.predict_proba(df_unknowns.drop(['Class'], axis=1))
		if not isinstance(test_df, str):
			test_proba = clf.predict_proba(test_df.drop(['Class'], axis=1))
			test_pred = clf.predict(test_df.drop(['Class'], axis=1))

		# Evaluate performance
		if len(classes) == 2:
			i = 0
			for clss in classes:
				if clss == POS:
					POS_IND = i
					break
				i += 1
			scores = cv_proba[:, POS_IND]

			# Generate run statistics from balanced dataset scores
			result = fun.Performance(y, cv_pred, scores, clf, clf2, classes,
				POS, POS_IND, NEG, ALG, THRSHD_test)

			#Generate data frame with all scores
			score_columns=["score_%s"%(j)]
			df_sel_scores = pd.DataFrame(data=cv_proba[:, POS_IND],
				index=df.index, columns=score_columns)
			df_notSel_scores = pd.DataFrame(data=notSel_proba[:,POS_IND],
				index=df_notSel.index, columns=score_columns)
			current_scores = pd.concat([df_sel_scores, df_notSel_scores],
				axis=0)
			if apply_unk == True:
				df_unk_scores = pd.DataFrame(data=unk_proba[:, POS_IND],
					index=df_unknowns.index, columns=score_columns)
				current_scores =  pd.concat([current_scores,df_unk_scores],
					axis=0)
			if not isinstance(test_df, str):
				df_test_scores = pd.DataFrame(data=test_proba[:,POS_IND],
					index=test_df.index, columns=score_columns)
				current_scores =  pd.concat([current_scores, df_test_scores],
					axis=0)
				scores_test = test_proba[:,POS_IND]
				result_test = fun.Performance(test_df['Class'], test_pred,
					scores_test, clf, clf2, classes, POS, POS_IND, NEG, ALG,
					THRSHD_test)

		else:
			# Generate run statistics from balanced dataset scores
			result = fun.Performance_MC(y, cv_pred, classes)

			#Generate data frame with all scores
			score_columns = []
			for clss in classes:
				score_columns.append("%s_score_%s"%(clss, j))

			df_sel_scores = pd.DataFrame(data=cv_proba, index=df.index,
				columns=score_columns)
			df_notSel_scores = pd.DataFrame(data=notSel_proba,
				index=df_notSel.index, columns=score_columns)
			current_scores = pd.concat([df_sel_scores, df_notSel_scores],
				axis=0)
			if apply_unk:
				df_unk_scores = pd.DataFrame(data=unk_proba,
					index=df_unknowns.index, columns=score_columns)
				current_scores =  pd.concat([current_scores, df_unk_scores],
					axis=0)
			if not isinstance(test_df, str):
				df_test_scores = pd.DataFrame(data=test_proba,
					index=test_df.index, columns=score_columns)
				current_scores = pd.concat([current_scores, df_test_scores],
					axis=0)
				result_test = fun.Performance_MC(test_df['Class'], test_pred,
					classes)

		if not isinstance(test_df, str):
			return result,current_scores,result_test
		else:
			return result,current_scores