Example #1
    def KFold_method(self):
        
        kf = KFold(n_splits=10)
        for train_index, test_index in kf.split(self.FeatureSet):
            X_train = []
            X_test = []
            y_train = []
            y_test = []
            for trainid in train_index.tolist():
                X_train.append(self.FeatureSet[trainid])
                y_train.append(self.Label[trainid])

            for testid in test_index.tolist():
                X_test.append(self.FeatureSet[testid])
                y_test.append(self.Label[testid])
            #clf = tree.DecisionTreeClassifier()        
            #clf = clf.fit(X_train, y_train)
            #pre_labels = clf.predict(X_test)
            clf = AdaBoostClassifier(n_estimators=100)
            clf = clf.fit(X_train, y_train)
            pre_labels = clf.predict(X_test)
            # Model Evaluation
            ACC = metrics.accuracy_score(y_test, pre_labels)
            MCC = metrics.matthews_corrcoef(y_test, pre_labels)
            SN = self.performance(y_test, pre_labels)
            print(ACC, SN)
def calculate_val(thresholds, embeddings1, embeddings2, actual_issame, far_target, nrof_folds=10):
    assert(embeddings1.shape[0] == embeddings2.shape[0])
    assert(embeddings1.shape[1] == embeddings2.shape[1])
    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
    nrof_thresholds = len(thresholds)
    k_fold = KFold(n_splits=nrof_folds, shuffle=False)
    
    val = np.zeros(nrof_folds)
    far = np.zeros(nrof_folds)
    
    diff = np.subtract(embeddings1, embeddings2)
    dist = np.sum(np.square(diff),1)
    indices = np.arange(nrof_pairs)
    
    for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):
      
        # Find the threshold that gives FAR = far_target
        far_train = np.zeros(nrof_thresholds)
        for threshold_idx, threshold in enumerate(thresholds):
            _, far_train[threshold_idx] = calculate_val_far(threshold, dist[train_set], actual_issame[train_set])
        if np.max(far_train)>=far_target:
            f = interpolate.interp1d(far_train, thresholds, kind='slinear')
            threshold = f(far_target)
        else:
            threshold = 0.0
    
        val[fold_idx], far[fold_idx] = calculate_val_far(threshold, dist[test_set], actual_issame[test_set])
  
    val_mean = np.mean(val)
    far_mean = np.mean(far)
    val_std = np.std(val)
    return val_mean, val_std, far_mean
def test_cross_val_predict_with_method():
    iris = load_iris()
    X, y = iris.data, iris.target
    X, y = shuffle(X, y, random_state=0)
    classes = len(set(y))

    kfold = KFold(len(iris.target))

    methods = ['decision_function', 'predict_proba', 'predict_log_proba']
    for method in methods:
        est = LogisticRegression()

        predictions = cross_val_predict(est, X, y, method=method)
        assert_equal(len(predictions), len(y))

        expected_predictions = np.zeros([len(y), classes])
        func = getattr(est, method)

        # Naive loop (should be same as cross_val_predict):
        for train, test in kfold.split(X, y):
            est.fit(X[train], y[train])
            expected_predictions[test] = func(X[test])

        predictions = cross_val_predict(est, X, y, method=method,
                                        cv=kfold)
        assert_array_almost_equal(expected_predictions, predictions)
Example #4
    def cross_validate(self, values_labels, folds=10, processes=1):
        """
        Trains and tests the model against folds of labeled data.

        :Parameters:
            values_labels : [( `<feature_values>`, `<label>` )]
                an iterable of labeled data where `<feature_values>` is an
                ordered collection of predictive values that correspond to the
                `Feature` s provided to the constructor
            folds : `int`
                the number of folds to split the labeled data into
            processes : `int`
                When set to 1, cross-validation will run in the parent thread.
                When set to 2 or greater, a :class:`multiprocessing.Pool` will
                be created.
        """
        folds_i = KFold(n_splits=folds, shuffle=True,
                        random_state=0)
        if processes == 1:
            mapper = map
        else:
            pool = Pool(processes=processes or cpu_count())
            mapper = pool.map
        results = mapper(self._cross_score,
                         ((i, [values_labels[i] for i in train_i],
                           [values_labels[i] for i in test_i])
                          for i, (train_i, test_i) in enumerate(
                              folds_i.split(values_labels))))
        agg_score_labels = []
        for score_labels in results:
            agg_score_labels.extend(score_labels)

        self.info['statistics'].fit(agg_score_labels)

        return self.info['statistics']
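
A hypothetical call sketch for the method above; the scoring-model class is not shown in this example, so `model`, the placeholder observations, and the printed result are assumptions rather than code from the source project.

model = ...  # placeholder: an instance of the scoring-model class defined above
values_labels = [([0.2, 1.0], True), ([0.9, 0.1], False)] * 50   # (feature_values, label) pairs
statistics = model.cross_validate(values_labels, folds=10, processes=2)
print(statistics)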
def calculate_roc(thresholds, embeddings1, embeddings2, actual_issame, nrof_folds=10):
    assert(embeddings1.shape[0] == embeddings2.shape[0])
    assert(embeddings1.shape[1] == embeddings2.shape[1])
    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
    nrof_thresholds = len(thresholds)
    k_fold = KFold(n_splits=nrof_folds, shuffle=False)
    
    tprs = np.zeros((nrof_folds,nrof_thresholds))
    fprs = np.zeros((nrof_folds,nrof_thresholds))
    accuracy = np.zeros((nrof_folds))
    
    diff = np.subtract(embeddings1, embeddings2)
    dist = np.sum(np.square(diff),1)
    indices = np.arange(nrof_pairs)
    
    for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):
        
        # Find the best threshold for the fold
        acc_train = np.zeros((nrof_thresholds))
        for threshold_idx, threshold in enumerate(thresholds):
            _, _, acc_train[threshold_idx] = calculate_accuracy(threshold, dist[train_set], actual_issame[train_set])
        best_threshold_index = np.argmax(acc_train)
        for threshold_idx, threshold in enumerate(thresholds):
            tprs[fold_idx,threshold_idx], fprs[fold_idx,threshold_idx], _ = calculate_accuracy(threshold, dist[test_set], actual_issame[test_set])
        _, _, accuracy[fold_idx] = calculate_accuracy(thresholds[best_threshold_index], dist[test_set], actual_issame[test_set])
          
    tpr = np.mean(tprs,0)
    fpr = np.mean(fprs,0)
    return tpr, fpr, accuracy
Example #6
def CV_mean(X_slct, y, test_slct, model_name='RandomForest',
            model_obj=sk_ens.RandomForestRegressor, model_params=rf_params, 
            eval_func=r2_score, nFolds=5, gen_rand_func=gen_rand):
    k_fold = KFold(n_splits=nFolds, shuffle=True, random_state=gen_rand_func())
    cv_scores = []
    model_li = []
    preds = []
    for train_index, test_index in k_fold.split(X_slct, y):
        X_train, X_test = X_slct[train_index,:], X_slct[test_index,:]
        y_train, y_test = y[train_index], y[test_index]
        if 'random_state' in model_params:
            model_params['random_state'] = gen_rand_func()
        elif 'seed' in model_params:
            model_params['seed'] = gen_rand_func()
        model = model_obj(**model_params)
        model.fit(X_train, y_train)
        scr = eval_func(y_test, model.predict(X_test))
        print('Score of ' + model_name + ':', scr)
        model_li.append(model)
        cv_scores.append(scr)
        pred = model.predict(test_slct)
        preds.append(pred)
    plt.plot(cv_scores); plt.show()
    winner_pred = preds[cv_scores.index(max(cv_scores))]
    print('CV_mean ' + model_name + ':', np.mean(cv_scores))
    return np.mean(cv_scores), winner_pred
def calculate_val(thresholds, embeddings1, embeddings2, actual_issame, far_target, nrof_folds=10, distance_metric=0, subtract_mean=False):
    assert(embeddings1.shape[0] == embeddings2.shape[0])
    assert(embeddings1.shape[1] == embeddings2.shape[1])
    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
    nrof_thresholds = len(thresholds)
    k_fold = KFold(n_splits=nrof_folds, shuffle=False)
    
    val = np.zeros(nrof_folds)
    far = np.zeros(nrof_folds)
    
    indices = np.arange(nrof_pairs)
    
    for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):
        if subtract_mean:
            mean = np.mean(np.concatenate([embeddings1[train_set], embeddings2[train_set]]), axis=0)
        else:
            mean = 0.0
        dist = distance(embeddings1-mean, embeddings2-mean, distance_metric)
      
        # Find the threshold that gives FAR = far_target
        far_train = np.zeros(nrof_thresholds)
        for threshold_idx, threshold in enumerate(thresholds):
            _, far_train[threshold_idx] = calculate_val_far(threshold, dist[train_set], actual_issame[train_set])
        if np.max(far_train)>=far_target:
            f = interpolate.interp1d(far_train, thresholds, kind='slinear')
            threshold = f(far_target)
        else:
            threshold = 0.0
    
        val[fold_idx], far[fold_idx] = calculate_val_far(threshold, dist[test_set], actual_issame[test_set])
  
    val_mean = np.mean(val)
    far_mean = np.mean(far)
    val_std = np.std(val)
    return val_mean, val_std, far_mean
def calculate_roc(thresholds, embeddings1, embeddings2, actual_issame, nrof_folds=10, distance_metric=0, subtract_mean=False):
    assert(embeddings1.shape[0] == embeddings2.shape[0])
    assert(embeddings1.shape[1] == embeddings2.shape[1])
    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
    nrof_thresholds = len(thresholds)
    k_fold = KFold(n_splits=nrof_folds, shuffle=False)
    
    tprs = np.zeros((nrof_folds,nrof_thresholds))
    fprs = np.zeros((nrof_folds,nrof_thresholds))
    accuracy = np.zeros((nrof_folds))
    
    indices = np.arange(nrof_pairs)
    
    for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):
        if subtract_mean:
            mean = np.mean(np.concatenate([embeddings1[train_set], embeddings2[train_set]]), axis=0)
        else:
            mean = 0.0
        dist = distance(embeddings1-mean, embeddings2-mean, distance_metric)
        
        # Find the best threshold for the fold
        acc_train = np.zeros((nrof_thresholds))
        for threshold_idx, threshold in enumerate(thresholds):
            _, _, acc_train[threshold_idx] = calculate_accuracy(threshold, dist[train_set], actual_issame[train_set])
        best_threshold_index = np.argmax(acc_train)
        for threshold_idx, threshold in enumerate(thresholds):
            tprs[fold_idx,threshold_idx], fprs[fold_idx,threshold_idx], _ = calculate_accuracy(threshold, dist[test_set], actual_issame[test_set])
        _, _, accuracy[fold_idx] = calculate_accuracy(thresholds[best_threshold_index], dist[test_set], actual_issame[test_set])
          
    tpr = np.mean(tprs,0)
    fpr = np.mean(fprs,0)
    return tpr, fpr, accuracy
def predict_model_kfold(name,path,features_type,label_name,data):
    kfold = KFold(n_splits=10, shuffle=True)
    #RandomForest -I 1000 -K 0 -S 1 -num-slots 1
    model = BalancedRandomForestClassifier(n_estimators=1000,max_depth=5)
    index = 0
    size = data.shape[0]
    all_predictions = 0
    x = data.drop('hasBug', axis=1)
    y = data['hasBug']
    num_of_bugs = data.loc[data['hasBug'] == 1].shape[0]
    num_of_all_instances = data.shape[0]
    bug_precent = float(num_of_bugs) / float(num_of_all_instances)
    for train, test in kfold.split(data):
        index += 1
        prediction_train = model.fit(x.iloc[train], y.iloc[train]).predict(x.iloc[test])
        all_predictions += create_all_eval_results(False,y.iloc[test],prediction_train,name,"training",features_type,num_of_bugs,num_of_all_instances,bug_precent,None)

    all_predictions /= index
    start_list = [name,"training",features_type,"sklearn - python"]
    result_list = start_list+ all_predictions.tolist()

    global results_all_projects
    results_all_projects.loc[len(results_all_projects)] = result_list

    model.fit(x,y)
    return model
Example #10
    def KFold_method(self):
        kf = KFold(n_splits=10)
        for train_index, test_index in kf.split(self.FeatureSet):
            X_train = []
            X_test = []
            y_train = []
            y_test = []
            for trainid in train_index.tolist():
                X_train.append(self.FeatureSet[trainid])
                y_train.append(self.Label[trainid])

            for testid in test_index.tolist():
                X_test.append(self.FeatureSet[testid])
                y_test.append(self.Label[testid])

            tree = self.buildtree(X_train)
            #self.post_pruning(tree, 0.3)
            pre_labels = self.predict(X_test, tree)

            # Model Evaluation
            ACC = metrics.accuracy_score(y_test, pre_labels)
        #    MCC = metrics.matthews_corrcoef(y_test, pre_labels)
            SN = self.performance(y_test, pre_labels)
        #    print SP, SN
            print(ACC, SN)
Example #11
 def hyperopt_obj(self,param,train_X,train_y):
     # 3-fold cross-validation error
     #ret = xgb.cv(param,dtrain,num_boost_round=param['num_round'])
     kf = KFold(n_splits = 3)
     errors = []
     r2 = []
     int_params = ['max_depth','num_round']
     for item in int_params:
         param[item] = int(param[item])
     for train_ind,test_ind in kf.split(train_X):
         train_valid_x,train_valid_y = train_X[train_ind],train_y[train_ind]
         test_valid_x,test_valid_y = train_X[test_ind],train_y[test_ind]
         dtrain = xgb.DMatrix(train_valid_x,label = train_valid_y)
         dtest = xgb.DMatrix(test_valid_x)
         pred_model = xgb.train(param,dtrain,num_boost_round=int(param['num_round']))
         pred_test = pred_model.predict(dtest)
         errors.append(mean_squared_error(test_valid_y,pred_test))
         r2.append(r2_score(test_valid_y,pred_test))
     all_dtrain = xgb.DMatrix(train_X,label = train_y)
     print('training score:')
     pred_model = xgb.train(param,all_dtrain,num_boost_round= int(param['num_round']))
     all_dtest = xgb.DMatrix(train_X)
     pred_train = pred_model.predict(all_dtest)
     print(str(r2_score(train_y,pred_train)))
     print(np.mean(r2))
     print('\n')
     return {'loss':np.mean(errors),'status': STATUS_OK}
Example #12
def computing_cv_accuracy_imprecise(in_path=None, ell_optimal=0.1, cv_n_fold=10):
    def u65(mod_Y):
        return 1.6 / mod_Y - 0.6 / mod_Y ** 2

    def u80(mod_Y):
        return 2.2 / mod_Y - 1.2 / mod_Y ** 2

    data = export_data_set('iris.data') if in_path is None else pd.read_csv(in_path)
    print("-----DATA SET TRAINING---", in_path)
    X = data.iloc[:, :-1].values
    y = np.array(data.iloc[:, -1].tolist())
    mean_u65, mean_u80 = 0, 0
    lqa = LinearDiscriminant(init_matlab=True)
    kf = KFold(n_splits=cv_n_fold, random_state=None, shuffle=True)
    for idx_train, idx_test in kf.split(y):
        X_cv_train, y_cv_train = X[idx_train], y[idx_train]
        X_cv_test, y_cv_test = X[idx_test], y[idx_test]
        lqa.learn(X_cv_train, y_cv_train, ell=ell_optimal)
        sum_u65, sum_u80 = 0, 0
        n_test, _ = X_cv_test.shape
        for i, test in enumerate(X_cv_test):
            print("--TESTING-----", i, ell_optimal)
            evaluate, _ = lqa.evaluate(test)
            print(evaluate, "-----", y_cv_test[i])
            if y_cv_test[i] in evaluate:
                sum_u65 += u65(len(evaluate))
                sum_u80 += u80(len(evaluate))
        mean_u65 += sum_u65 / n_test
        mean_u80 += sum_u80 / n_test
    mean_u65 = mean_u65 / cv_n_fold
    mean_u80 = mean_u80 / cv_n_fold
    print("--ell-->", ell_optimal, "--->", mean_u65, mean_u80)
Example #13
def test_regression_with_custom_objective():
    from sklearn.metrics import mean_squared_error
    from sklearn.datasets import load_boston
    from sklearn.model_selection import KFold

    def objective_ls(y_true, y_pred):
        grad = (y_pred - y_true)
        hess = np.ones(len(y_true))
        return grad, hess

    boston = load_boston()
    y = boston['target']
    X = boston['data']
    kf = KFold(n_splits=2, shuffle=True, random_state=rng)
    for train_index, test_index in kf.split(X, y):
        xgb_model = xgb.XGBRegressor(objective=objective_ls).fit(
            X[train_index], y[train_index]
        )
        preds = xgb_model.predict(X[test_index])
        labels = y[test_index]
    assert mean_squared_error(preds, labels) < 25

    # Test that the custom objective function is actually used
    class XGBCustomObjectiveException(Exception):
        pass

    def dummy_objective(y_true, y_pred):
        raise XGBCustomObjectiveException()

    xgb_model = xgb.XGBRegressor(objective=dummy_objective)
    np.testing.assert_raises(XGBCustomObjectiveException, xgb_model.fit, X, y)
Example #14
def validateseq2(X_all, y, features, clf, score, v = False, esr=50, sk=5):
    temp_user = target_order[(target_order.o_day_series < 336) & (target_order.o_day_series >= 274)][['user_id']].drop_duplicates().reset_index(drop=True)
    temp_user['CreateGroup'] = 336
    print('before delete: {}'.format(X_all.shape))
    X = temp_user.merge(X_all,on=['user_id','CreateGroup'],how = 'left')
    print('after delete: {}'.format(X.shape))
    temp_user = target_order[(target_order.o_day_series < 306) & (target_order.o_day_series >= 215)][['user_id']].drop_duplicates().reset_index(drop=True)
    temp_user['CreateGroup'] = 306
    print('before delete: {}'.format(X_all.shape))
    X2 = temp_user.merge(X_all,on=['user_id','CreateGroup'],how = 'left')
    print('after delete: {}'.format(X2.shape))
    kf = KFold(n_splits=sk)
    print(len(features))
    X['Prob_x'] = 0
    for train_index, test_index in kf.split(X2):
        X_train, X_test = X2.iloc[train_index,:], X2.iloc[test_index,:]
        X_train, X_test = X_train[features], X_test[features]
        y_train, y_test = X2.iloc[train_index,:].buy, X2.iloc[test_index,:].buy
        clf.fit(X_train,y_train, eval_set = [(X_train, y_train), (X_test, y_test)], eval_metric='auc', verbose=v, early_stopping_rounds=esr)
        X['Prob_x'] = X['Prob_x'] + clf.predict_proba(X[features])[:,1]/sk
    Performance = []
    features.append('Prob_x')
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
        X_train, X_test = X_train[features], X_test[features]
        y_train, y_test = X.iloc[train_index,:].buy, X.iloc[test_index,:].buy
        clf.fit(X_train,y_train, eval_set = [(X_train, y_train), (X_test, y_test)], eval_metric='auc', verbose=v, early_stopping_rounds=esr)
        pred = clf.predict_proba(X_test)[:,1]
        Performance.append(roc_auc_score(y_test,pred))
    print("Mean Score: {}".format(np.mean(Performance)))
    return np.mean(Performance),clf
    def select(self):  
           
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        # Implement model selection using CV
        NB_SPLITS = 3   
        mean_scores = []
        split_method = KFold(random_state=self.random_state, n_splits=NB_SPLITS)
        n_components = range(self.min_n_components, self.max_n_components + 1)
        
        try:
            for n_component in n_components:
                model = self.base_model(n_component)
                kfold_scores = []
                for _, test_idx in split_method.split(self.sequences):
                    test_X, test_length = combine_sequences(test_idx, self.sequences)
                    kfold_scores.append(model.score(test_X, test_length))
                    
                mean_scores.append(np.mean(kfold_scores))
                
        except Exception as e:
            pass
        
        if len(mean_scores) > 0:
            states = n_components[np.argmax(mean_scores)]
        else:
            states = self.n_constant

        return self.base_model(states)
Example #16
def test_multiclass_classification():
    from sklearn.datasets import load_iris
    from sklearn.model_selection import KFold

    def check_pred(preds, labels, output_margin):
        if output_margin:
            err = sum(1 for i in range(len(preds))
                      if preds[i].argmax() != labels[i]) / float(len(preds))
        else:
            err = sum(1 for i in range(len(preds))
                      if preds[i] != labels[i]) / float(len(preds))
        assert err < 0.4

    iris = load_iris()
    y = iris['target']
    X = iris['data']
    kf = KFold(n_splits=2, shuffle=True, random_state=rng)
    for train_index, test_index in kf.split(X, y):
        xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
        preds = xgb_model.predict(X[test_index])
        # test other params in XGBClassifier().fit
        preds2 = xgb_model.predict(X[test_index], output_margin=True,
                                   ntree_limit=3)
        preds3 = xgb_model.predict(X[test_index], output_margin=True,
                                   ntree_limit=0)
        preds4 = xgb_model.predict(X[test_index], output_margin=False,
                                   ntree_limit=3)
        labels = y[test_index]

        check_pred(preds, labels, output_margin=False)
        check_pred(preds2, labels, output_margin=True)
        check_pred(preds3, labels, output_margin=True)
        check_pred(preds4, labels, output_margin=False)
def original_data():
    for target in TARGETS:
        for algo_str in ALGORITHMS:
            algorithm = importlib.import_module('src.multi_class.' + algo_str)
            encoded_data = input_preproc.readFromDataset(
                INPUT_DIR + ORIGINAL_DATA_FILE,
                INPUT_COLS['original'],
                target
            )
            # Split into predictors and target
            X = np.array(encoded_data[encoded_data.columns.difference([target])])
            y = np.array(encoded_data[target])
            kf = KFold(n_splits=CROSS_VALIDATION_K, shuffle=True)

            f1s = []

            for train_index, test_index in kf.split(X):
                X_train, y_train = X[train_index], y[train_index]
                X_test, y_test = X[test_index], y[test_index]

                scaler = preprocessing.StandardScaler()
                X_train = pd.DataFrame(scaler.fit_transform(X_train))  # , columns=X_train.columns)
                X_test = scaler.transform(X_test)

                precision, recall, f1_score, accuracy = algorithm.runClassifier(X_train, X_test, y_train, y_test)
                f1s.append(f1_score)

            final_f1 = sum(f1s) / len(f1s)
            print("\n================================")
            print("%s, %s, F1 Score: %.6f" % (target, algo_str, final_f1))
            print("================================\n")
Example #18
    def _iter_test_masks(self, X, y=None, groups=None):
        # yields mask array for test splits
        n_samples = X.shape[0]

        # if groups is not specified, the entire dataset is treated as one group
        if groups is None:
            groups = np.zeros(n_samples, dtype=int)

        # constants
        indices = np.arange(n_samples)
        test_fold = np.empty(n_samples, dtype=bool)
        rng = check_random_state(self.random_state)
        group_indices = np.unique(groups)
        iters = np.empty(group_indices.shape[0], dtype=object)

        # generate iterators
        cv = KFold(n_splits=self.n_splits, shuffle=self.shuffle, random_state=rng)
        for i, g in enumerate(group_indices):
            group_member = indices[groups == g]
            iters[i] = cv.split(group_member)

        # generate training and test splits
        for fold in range(self.n_splits):
            test_fold[:] = False
            for i, g in enumerate(group_indices):
                group_train_i, group_test_i = next(iters[i])
                test_fold[indices[groups == g][group_test_i]] = True
            yield test_fold
Example #19
def kFolds(dataSet, k = 10):
    """
    This is the k-fold method
    :param dataSet: of type DataFrame
    :param k: number of subsets to choose
    """
    df_mx = dataSet.to_numpy()
    X = df_mx[:, 1:16]
    Y = df_mx[:, 0:1]

    lm = svm.SVC(gamma=0.001, C=100.)  # Support Vector Machine
    kf = KFold(n_splits=k)  # Define the split into k folds
    i = 0
    accuracies = numpy.zeros(kf.get_n_splits(X))
    for train_index, test_index in kf.split(X):
        print("{}. TRAIN: {} TEST: {}".format(i+1, train_index, test_index))
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]
        # train using X_Train
        model = lm.fit(X_train, Y_train)
        # evaluate against X_Test
        predictions = lm.predict(X_test)
        # save accuracy
        accuracies[i] = model.score(X_test, Y_test)
        i = i + 1

    # find mean accuracy over all rounds
    print("Average accuracy of K-Folds (k={}): {}%".format(numpy.mean(accuracies) * 100, k))
Example #20
    def model_train(self, X_train, y_train, ignore_neutral=False):
        if ignore_neutral:
            X_train = X_train[y_train != 0]
            y_train = y_train[y_train != 0]
        self.ignore_neutral = ignore_neutral

        model = LinearSVC()
        classifier = model.fit(X_train, y_train)
        # pred = classifier.predict(X_train)
        # accu = np.mean(pred == y_train)
        # print 'The accuracy of training data is {}'.format(accu)
        # print confusion_matrix(y_train, pred)

        # k-fold
        kfold = KFold(n_splits=5)
        for i, (train_index, test_index) in enumerate((kfold.split(X_train))):
            X_split_train = X_train[train_index]
            y_split_train = y_train[train_index]
            X_split_valid = X_train[test_index]
            y_split_valid = y_train[test_index]
            classifier = model.fit(X_split_train, y_split_train)
            pred = classifier.predict(X_split_valid)
            accu = np.mean(pred == y_split_valid)
            print('Fold {} : the accuracy of validation data is {}'.format(i + 1, accu))

        return classifier
Example #21
def test_boston_housing_regression():
    from sklearn.metrics import mean_squared_error
    from sklearn.datasets import load_boston
    from sklearn.model_selection import KFold

    boston = load_boston()
    y = boston['target']
    X = boston['data']
    kf = KFold(n_splits=2, shuffle=True, random_state=rng)
    for train_index, test_index in kf.split(X, y):
        xgb_model = xgb.XGBRegressor().fit(X[train_index], y[train_index])

        preds = xgb_model.predict(X[test_index])
        # test other params in XGBRegressor().fit
        preds2 = xgb_model.predict(X[test_index], output_margin=True,
                                   ntree_limit=3)
        preds3 = xgb_model.predict(X[test_index], output_margin=True,
                                   ntree_limit=0)
        preds4 = xgb_model.predict(X[test_index], output_margin=False,
                                   ntree_limit=3)
        labels = y[test_index]

        assert mean_squared_error(preds, labels) < 25
        assert mean_squared_error(preds2, labels) < 350
        assert mean_squared_error(preds3, labels) < 25
        assert mean_squared_error(preds4, labels) < 350
Example #22
def Get_KFolds(data, y_label, num_folds, scale):
    #Creates num_folds folds from the data, each with a separate training and test set
    folds = []
    kf = KFold(n_splits = num_folds)
    for train_index, test_index in kf.split(data):
        training = []
        test = []
        
        tempdf = Normalize_Scale(data,scale)
        train_x = tempdf.drop([y_label], axis=1).values
        train_y = tempdf[y_label].values
        
        #Creates a training set within the fold
        x = []
        y = []
        
        for index in train_index:
            x.append(train_x[index])
            y.append(train_y[index])
        training = [x,y]
        
        #Creates a test set within the fold
        x = []
        y = []
        for index in test_index:
            x.append(train_x[index])
            y.append(train_y[index])
        test = [x,y]

        folds.append([training,test])
    
    return folds
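
A hypothetical usage sketch for Get_KFolds; Normalize_Scale comes from the original project and is not shown here, so the scale flag and the synthetic frame below are assumptions.

import numpy as np
import pandas as pd

demo = pd.DataFrame(np.random.RandomState(0).randn(60, 4),
                    columns=['f1', 'f2', 'f3', 'target'])
folds = Get_KFolds(demo, 'target', num_folds=5, scale=True)   # scale semantics assumed
train_xy, test_xy = folds[0]   # each fold is [[train_x, train_y], [test_x, test_y]]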
Example #23
def learn_decision_tree(data_set, label):
	#Create depths 
	depths = list(range(1,14))
	#Initialize the best model
	best_model = [None, 0, float("-inf")]
	#Create 13-fold
	kf = KFold(n_splits=13)
	track = []
	for (train, test), cdepth in zip(kf.split(data_set), depths):
        #Get training set
		train_set = [data_set[i] for i in train]
		train_label = [label[i] for i in train]
		#Get validation set
		valid_set = [data_set[i] for i in test]
		valid_label = [label[i] for i in test]
		#Learn the decision tree from data
		clf = tree.DecisionTreeClassifier(max_depth=cdepth)
		clf = clf.fit(train_set, train_label)
		#Get accuracy from the model
		accuraclabel = clf.score(valid_set, valid_label)
		#Compare accuracies
		track.append([cdepth, accuraclabel])
		if accuraclabel > best_model[2]:
			#Update the best model
			best_model = [clf, cdepth, accuraclabel]
	#Plot the graph
	fig = plt.figure()
	x = [x[0] for x in track]
	y = [x[1] for x in track]
	plt.xlabel('Depth')
	plt.ylabel('Accuracy')
	plt.title('Decision Tree')
	plt.plot(x,y)
	plt.savefig('decision_tree.png')
	return best_model
Example #24
def predict(X_all, X_new, features, clf, score, v = False, esr=50, sk=3, fn='submission'):
    first_day = datetime.datetime.strptime('2017-08-31 00:00:00', '%Y-%m-%d %H:%M:%S')
    temp_user = target_order[(target_order.o_day_series < 336) & (target_order.o_day_series >= 274)][['user_id']].drop_duplicates().reset_index(drop=True)
    temp_user['CreateGroup'] = 336
    print('before delete: {}'.format(X_all.shape))
    X = temp_user.merge(X_all,on=['user_id','CreateGroup'],how = 'left')
    print('after delete: {}'.format(X.shape))
    temp_user = target_order[(target_order.o_day_series < 366) & \
                             (target_order.o_day_series >= 366 - 75)][['user_id']].drop_duplicates().reset_index(drop=True)
    temp_user['CreateGroup'] = 366
    print('before delete: {}'.format(X_new.shape))
    X_new = temp_user.merge(X_new,on=['user_id','CreateGroup'],how = 'left')
    print('Train: {}'.format(X_new.shape))
    kf = KFold(n_splits=sk)
    print(len(features))
    Performance = []
    X_new['Prob'] = 0
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
        X_train, X_test = X_train[features], X_test[features]
        y_train, y_test = X.iloc[train_index,:].buy, X.iloc[test_index,:].buy
        clf.fit(X_train,y_train, eval_set = [(X_train, y_train), (X_test, y_test)], eval_metric='auc', verbose=v, early_stopping_rounds=esr)
        pred = clf.predict_proba(X_test)[:,1]
        X_new['Prob'] = X_new['Prob'] + clf.predict_proba(X_new[features])[:,1]/sk
        Performance.append(roc_auc_score(y_test,pred))
    print("Mean Score: {}".format(np.mean(Performance)))
    X_new['Days'] = np.random.randint(15,size=len(X_new))
    X_new['pred_date'] = X_new['Days'].apply(lambda x: (datetime.timedelta(days=x) + first_day).strftime("%Y-%m-%d"))
    X_new.sort_values(by = ['Prob'], ascending = False, inplace = True)
    X_new[['user_id','Prob']].to_csv('prob_{}.csv'.format(fn), index = None)
    X_new[['user_id','pred_date']][:50000].to_csv('{}.csv'.format(fn), index = None)
    return np.mean(Performance),clf
Example #25
def cross_validation(train_data, train_labels, k_range=np.arange(1,16)):
    '''
    Perform 10-fold cross validation to find the best value for k

    Note: Previously this function took knn as an argument instead of train_data,train_labels.
    The intention was for students to take the training data from the knn object - this should be clearer
    from the new function signature.
    '''
    folds = 10
    kf = KFold(n_splits=folds)
    best_k = 1
    average_accuracy_for_best_k = 0
    
    for k in k_range:
        accuracy_sum = 0
        for train_index, test_index in kf.split(train_data):
            X_train, X_test = train_data[train_index], train_data[test_index]
            y_train, y_test = train_labels[train_index], train_labels[test_index]
            
            knn = KNearestNeighbor(X_train, y_train)
            validation_accuracy = classification_accuracy(knn, k, X_test, y_test)
            accuracy_sum += validation_accuracy
        
        average_accuracy = accuracy_sum/folds
        if (average_accuracy > average_accuracy_for_best_k):
            average_accuracy_for_best_k = average_accuracy
            best_k = k 
            
    return best_k, average_accuracy_for_best_k
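
A brief usage sketch for cross_validation; the KNearestNeighbor and classification_accuracy helpers it calls come from the surrounding assignment code and are assumed to be defined in the same module, and the arrays below are synthetic.

import numpy as np

rng = np.random.RandomState(0)
train_data = rng.randn(100, 8)
train_labels = rng.randint(0, 3, 100)
best_k, best_acc = cross_validation(train_data, train_labels, k_range=np.arange(1, 6))
print(best_k, best_acc)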
Example #26
def computing_cv_accuracy_LDA(in_path=None, cv_n_fold=10):
    def u65(mod_Y):
        return 1.6 / mod_Y - 0.6 / mod_Y ** 2

    def u80(mod_Y):
        return 2.2 / mod_Y - 1.2 / mod_Y ** 2

    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

    data = export_data_set('iris.data') if in_path is None else pd.read_csv(in_path)
    print("-----DATA SET TRAINING---", in_path)
    X = data.iloc[:, :-1].values
    y = np.array(data.iloc[:, -1].tolist())
    kf = KFold(n_splits=cv_n_fold, random_state=None, shuffle=True)
    lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
    mean_u65, mean_u80 = 0, 0
    for idx_train, idx_test in kf.split(y):
        print("---k-FOLD-new-executing--")
        X_cv_train, y_cv_train = X[idx_train], y[idx_train]
        X_cv_test, y_cv_test = X[idx_test], y[idx_test]
        lda.fit(X_cv_train, y_cv_train)
        n_test = len(idx_test)
        sum_u65, sum_u80 = 0, 0
        for i, test in enumerate(X_cv_test):
            evaluate = lda.predict([test])
            print("-----TESTING-----", i)
            if y_cv_test[i] in evaluate:
                sum_u65 += u65(len(evaluate))
                sum_u80 += u80(len(evaluate))
        mean_u65 += sum_u65 / n_test
        mean_u80 += sum_u80 / n_test
    print("--->", mean_u65 / cv_n_fold, mean_u80 / cv_n_fold)
Example #27
def split_data(root_path, num_splits=4):
    mask_list = []
    for ext in ('*.mhd', '*.hdr', '*.nii'):
        mask_list.extend(sorted(glob(join(root_path,'masks',ext))))

    assert len(mask_list) != 0, 'Unable to find any files in {}'.format(join(root_path,'masks'))

    outdir = join(root_path,'split_lists')
    try:
        mkdir(outdir)
    except:
        pass

    kf = KFold(n_splits=num_splits)
    n = 0
    for train_index, test_index in kf.split(mask_list):
        with open(join(outdir,'train_split_' + str(n) + '.csv'), 'w', newline='') as csvfile:
            writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
            for i in train_index:
                writer.writerow([basename(mask_list[i])])
        with open(join(outdir,'test_split_' + str(n) + '.csv'), 'w', newline='') as csvfile:
            writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
            for i in test_index:
                writer.writerow([basename(mask_list[i])])
        n += 1
Example #28
 def compute_matrices_for_gradient_totalcverr(self, train_x, train_y, train_z):
     if self.kernelX_use_median:
         sigmax = self.kernelX.get_sigma_median_heuristic(train_x)
         self.kernelX.set_width(float(sigmax))
     if self.kernelY_use_median:
         sigmay = self.kernelY.get_sigma_median_heuristic(train_y)
         self.kernelY.set_width(float(sigmay))
     kf = KFold( n_splits=self.K_folds)
     matrix_results = [[[None] for _ in range(self.K_folds)]for _ in range(8)] 
     # xx=[[None]*10]*6 will give the same id to xx[0][0] and xx[1][0] etc. as 
     # this command simply copied [None] many times. But the above gives different ids.
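     # (Illustrative aside, not from the original code: a = [[None]] * 3 gives
     #  a[0] is a[1] == True, i.e. one shared inner list, whereas
     #  b = [[None] for _ in range(3)] gives b[0] is b[1] == False.)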
     count = 0
     for train_index, test_index in kf.split(np.ones((self.num_samples,1))):
         X_tr, X_tst = train_x[train_index], train_x[test_index]
         Y_tr, Y_tst = train_y[train_index], train_y[test_index]
         Z_tr, Z_tst = train_z[train_index], train_z[test_index]
         matrix_results[0][count] = self.kernelX.kernel(X_tst, X_tr) #Kx_tst_tr
         matrix_results[1][count] = self.kernelX.kernel(X_tr, X_tr) #Kx_tr_tr
         matrix_results[2][count] = self.kernelX.kernel(X_tst, X_tst) #Kx_tst_tst
         matrix_results[3][count] = self.kernelY.kernel(Y_tst, Y_tr) #Ky_tst_tr
         matrix_results[4][count] = self.kernelY.kernel(Y_tr, Y_tr) #Ky_tr_tr
         matrix_results[5][count] = self.kernelY.kernel(Y_tst,Y_tst) #Ky_tst_tst
         matrix_results[6][count] = cdist(Z_tst, Z_tr, 'sqeuclidean') #D_tst_tr: square distance matrix
         matrix_results[7][count] = cdist(Z_tr, Z_tr, 'sqeuclidean') #D_tr_tr: square distance matrix
         count = count + 1
     return matrix_results
class TargetEncoderNSplits(BaseTransformer):
    def __init__(self, n_splits, **kwargs):
        self.k_folds = KFold(n_splits=n_splits)
        self.target_means_map = {}

    def _target_means_names(self, columns):
        confidence_rate_names = ['target_mean_{}'.format(column) for column in columns]
        return confidence_rate_names

    def _is_null_names(self, columns):
        is_null_names = ['target_mean_is_nan_{}'.format(column) for column in columns]
        return is_null_names

    def fit(self, categorical_features, target, **kwargs):
        feature_columns, target_column = categorical_features.columns, target.columns[0]

        X_target_means = []
        self.k_folds.get_n_splits(target)
        for train_index, test_index in self.k_folds.split(target):
            X_train, y_train = categorical_features.iloc[train_index], target.iloc[train_index]
            X_test, y_test = categorical_features.iloc[test_index], target.iloc[test_index]

            train = pd.concat([X_train, y_train], axis=1)
            for column, target_mean_name in zip(feature_columns, self._target_means_names(feature_columns)):
                group_object = train.groupby(column)
                train_target_means = group_object[target_column].mean(). \
                    reset_index().rename(index=str, columns={target_column: target_mean_name})

                X_test = X_test.merge(train_target_means, on=column, how='left')
            X_target_means.append(X_test)
        X_target_means = pd.concat(X_target_means, axis=0).astype(np.float32)

        for column, target_mean_name in zip(feature_columns, self._target_means_names(feature_columns)):
            group_object = X_target_means.groupby(column)
            self.target_means_map[column] = group_object[target_mean_name].mean().reset_index()

        return self

    def transform(self, categorical_features, **kwargs):
        columns = categorical_features.columns

        for column, target_mean_name, is_null_name in zip(columns,
                                                          self._target_means_names(columns),
                                                          self._is_null_names(columns)):
            categorical_features = categorical_features.merge(self.target_means_map[column],
                                                              on=column,
                                                              how='left').astype(np.float32)
            categorical_features[is_null_name] = pd.isnull(categorical_features[target_mean_name]).astype(int)
            categorical_features[target_mean_name].fillna(0, inplace=True)

        return {'numerical_features': categorical_features[self._target_means_names(columns)],
                'categorical_features': categorical_features[self._is_null_names(columns)]}

    def load(self, filepath):
        self.target_means_map = joblib.load(filepath)
        return self

    def save(self, filepath):
        joblib.dump(self.target_means_map, filepath)
def regulCV(X,y,n_splits = 10):

    kf = KFold(n_splits=n_splits)

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        yield X_train, y_train, X_test, y_test
Example #31
 def _inner(o, **kwargs):
     if stratify:
         _, unique_counts = np.unique(o, return_counts=True)
         if np.min(unique_counts) >= 2 and np.min(unique_counts) >= n_splits: stratify_ = stratify
         elif np.min(unique_counts) < n_splits:
             stratify_ = False
             pv(f'stratify set to False as n_splits={n_splits} cannot be greater than the min number of members in each class ({np.min(unique_counts)}).',
                verbose)
         else:
             stratify_ = False
             pv('stratify set to False as the least populated class in o has only 1 member, which is too few.', verbose)
     else: stratify_ = False
     vs = 0 if train_only else 1. / n_splits if n_splits > 1 else int(valid_size * len(o)) if isinstance(valid_size, float) else valid_size
     if test_size:
         ts = int(test_size * len(o)) if isinstance(test_size, float) else test_size
         train_valid, test = train_test_split(range(len(o)), test_size=ts, stratify=o if stratify_ else None, shuffle=shuffle,
                                              random_state=random_state, **kwargs)
         test = toL(test)
         if shuffle: test = random_shuffle(test, random_state)
         if vs == 0:
             train, _ = RandomSplitter(0, seed=random_state)(o[train_valid])
             train = toL(train)
             if balance: train = train[balance_idx(o[train], random_state=random_state)]
             if shuffle: train = random_shuffle(train, random_state)
             train_ = L(L([train]) * n_splits) if n_splits > 1 else train
             valid_ = L(L([train]) * n_splits) if n_splits > 1 else train
             test_ = L(L([test]) * n_splits) if n_splits > 1 else test
             if n_splits > 1:
                 return [split for split in itemify(train_, valid_, test_)]
             else:
                 return train_, valid_, test_
         elif n_splits > 1:
             if stratify_:
                 splits = StratifiedKFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state).split(np.arange(len(train_valid)), o[train_valid])
             else:
                 splits = KFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state).split(np.arange(len(train_valid)))
             train_, valid_ = L([]), L([])
             for train, valid in splits:
                 train, valid = toL(train), toL(valid)
                 if balance: train = train[balance_idx(o[train], random_state=random_state)]
                 if shuffle:
                     train = random_shuffle(train, random_state)
                     valid = random_shuffle(valid, random_state)
                 train_.append(L(L(train_valid)[train]))
                 valid_.append(L(L(train_valid)[valid]))
             test_ = L(L([test]) * n_splits)
             return [split for split in itemify(train_, valid_, test_)]
         else:
             train, valid = train_test_split(range(len(train_valid)), test_size=vs, random_state=random_state,
                                             stratify=o[train_valid] if stratify_ else None, shuffle=shuffle, **kwargs)
             train, valid = toL(train), toL(valid)
             if balance: train = train[balance_idx(o[train], random_state=random_state)]
             if shuffle:
                 train = random_shuffle(train, random_state)
                 valid = random_shuffle(valid, random_state)
             return (L(L(train_valid)[train]), L(L(train_valid)[valid]),  test)
     else:
         if vs == 0:
             train, _ = RandomSplitter(0, seed=random_state)(o)
             train = toL(train)
             if balance: train = train[balance_idx(o[train], random_state=random_state)]
             if shuffle: train = random_shuffle(train, random_state)
             train_ = L(L([train]) * n_splits) if n_splits > 1 else train
             valid_ = L(L([train]) * n_splits) if n_splits > 1 else train
             if n_splits > 1:
                 return [split for split in itemify(train_, valid_)]
             else:
                 return (train_, valid_)
         elif n_splits > 1:
             if stratify_: splits = StratifiedKFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state).split(np.arange(len(o)), o)
             else: splits = KFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state).split(np.arange(len(o)))
             train_, valid_ = L([]), L([])
             for train, valid in splits:
                 train, valid = toL(train), toL(valid)
                 if balance: train = train[balance_idx(o[train], random_state=random_state)]
                 if shuffle:
                     train = random_shuffle(train, random_state)
                     valid = random_shuffle(valid, random_state)
                 if not isinstance(train, (list, L)):  train = train.tolist()
                 if not isinstance(valid, (list, L)):  valid = valid.tolist()
                 train_.append(L(train))
                 valid_.append(L(L(valid)))
             return [split for split in itemify(train_, valid_)]
         else:
             train, valid = train_test_split(range(len(o)), test_size=vs, random_state=random_state, stratify=o if stratify_ else None,
                                             shuffle=shuffle, **kwargs)
             train, valid = toL(train), toL(valid)
             if balance: train = train[balance_idx(o[train], random_state=random_state)]
             return train, valid
Example #32
def train_classifiers(train_vecs, train_labels, typ='bow'):
    X = np.array(train_vecs)
    y = np.array(train_labels)
    
    kf = KFold(5, shuffle=True, random_state=42)
    cv_rf_f1, cv_lrsgd_f1, cv_svcsgd_f1,  = [], [], []
    cv_rf_ac, cv_lrsgd_ac, cv_svcsgd_ac,  = [], [], []
    y_pred_sgd, y_pred_sgh, y_pred_rf, = [], [], []
    
    for train_ind, val_ind in kf.split(X, y):
        # Assign CV IDX
        X_train, y_train = X[train_ind], y[train_ind]
        X_val, y_val = X[val_ind], y[val_ind]
        
        # Scale Data
        scaler = StandardScaler()
        X_train_scale = scaler.fit_transform(X_train)
        X_val_scale = scaler.transform(X_val)
    
        # Logistic Regression
#        lr = LogisticRegression(
#            max_iter=4000,
#            class_weight= 'balanced',
#            solver='newton-cg',
#            fit_intercept=True
#        ).fit(X_train_scale, y_train)
#    
#        y_pred = lr.predict(X_val_scale)
#        cv_lr_f1.append(f1_score(y_val, y_pred, average='weighted'))
        
        # Logistic Regression SGD
        sgd = linear_model.SGDClassifier(
            max_iter=1000,
            tol=1e-3,
            loss='log',
            class_weight='balanced'
        ).fit(X_train_scale, y_train)
        
        y_pred_sgd.append(sgd.predict(X_val_scale))
        cv_lrsgd_f1.append(f1_score(y_val, y_pred_sgd[-1], average='macro'))
        cv_lrsgd_ac.append(accuracy_score(y_val, y_pred_sgd[-1]))
        
        # SGD Modified Huber
        sgd_huber = linear_model.SGDClassifier(
            max_iter=1000,
            tol=1e-3,
            alpha=20,
            loss='modified_huber',
            class_weight='balanced'
        ).fit(X_train_scale, y_train)
        
        y_pred_sgh.append(sgd_huber.predict(X_val_scale))
        cv_svcsgd_f1.append(f1_score(y_val, y_pred_sgh[-1], average='macro'))
        cv_svcsgd_ac.append(accuracy_score(y_val, y_pred_sgh[-1]))
        
        # Random Forest
        rf = RandomForestClassifier(
            class_weight='balanced'
        ).fit(X_train_scale, y_train)
        
        y_pred_rf.append(rf.predict(X_val_scale))
        cv_rf_f1.append(f1_score(y_val, y_pred_rf[-1], average='macro'))
        cv_rf_ac.append(accuracy_score(y_val, y_pred_rf[-1]))
        
    y_pred_sgd_final = [item for sublist in y_pred_sgd for item in sublist]
    y_pred_sgh_final = [item for sublist in y_pred_sgh for item in sublist]
    y_pred_rf_final = [item for sublist in y_pred_rf for item in sublist]
    
#    print(f'Logistic Regression Val f1: {np.mean(cv_lr_f1):.3f} +- {np.std(cv_lr_f1):.3f}')
    print(f'SGD Val f1: {np.mean(cv_lrsgd_f1):.3f} +- {np.std(cv_lrsgd_f1):.3f}',typ)
    print(f'SVM Huber Val f1: {np.mean(cv_svcsgd_f1):.3f} +- {np.std(cv_svcsgd_f1):.3f}',typ)
    print(f'Random Forest Val f1: {np.mean(cv_rf_f1):.3f} +- {np.std(cv_rf_f1):.3f}',typ)
    print("\n")
    print(f'SGD Val acc: {np.mean(cv_lrsgd_ac):.3f} +- {np.std(cv_lrsgd_ac):.3f}',typ)
    print(f'SVM Huber Val acc: {np.mean(cv_svcsgd_ac):.3f} +- {np.std(cv_svcsgd_ac):.3f}',typ)
    print(f'Random Forest Val acc: {np.mean(cv_rf_ac):.3f} +- {np.std(cv_rf_ac):.3f}',typ)
    print("\n")
    print("Precision (micro) SGD: %f" % precision_score(y, y_pred_sgd_final, average='micro'),typ)
    print("Recall (micro) SGD:    %f" % recall_score(y, y_pred_sgd_final, average='micro'),typ)
    print("F1 score (micro) SGD:  %f" % f1_score(y, y_pred_sgd_final, average='micro'),typ, end='\n\n')
    print("Precision (macro) SGD: %f" % precision_score(y, y_pred_sgd_final, average='macro'),typ)
    print("Recall (macro) SGD:    %f" % recall_score(y, y_pred_sgd_final, average='macro'),typ)
    print("F1 score (macro) SGD:  %f" % f1_score(y, y_pred_sgd_final, average='macro'),typ, end='\n\n')
    print("Precision (weighted) SGD: %f" % precision_score(y, y_pred_sgd_final, average='weighted'),typ)
    print("Recall (weighted) SGD:    %f" % recall_score(y, y_pred_sgd_final, average='weighted'),typ)
    print("F1 score (weighted) SGD:  %f" % f1_score(y, y_pred_sgd_final, average='weighted'),typ)
    print("\n")
    print("Precision (micro) SVM Huber: %f" % precision_score(y, y_pred_sgh_final, average='micro'),typ)
    print("Recall (micro) SVM Huber:    %f" % recall_score(y, y_pred_sgh_final, average='micro'),typ)
    print("F1 score (micro) SVM Huber:  %f" % f1_score(y, y_pred_sgh_final, average='micro'),typ, end='\n\n')
    print("Precision (macro) SVM Huber: %f" % precision_score(y, y_pred_sgh_final, average='macro'),typ)
    print("Recall (macro) SVM Huber:    %f" % recall_score(y, y_pred_sgh_final, average='macro'),typ)
    print("F1 score (macro) SVM Huber:  %f" % f1_score(y, y_pred_sgh_final, average='macro'),typ, end='\n\n')
    print("Precision (weighted) SVM Huber: %f" % precision_score(y, y_pred_sgh_final, average='weighted'),typ)
    print("Recall (weighted) SVM Huber:    %f" % recall_score(y, y_pred_sgh_final, average='weighted'),typ)
    print("F1 score (weighted) SVM Huber:  %f" % f1_score(y, y_pred_sgh_final, average='weighted'),typ)
    print("\n")
    print("Precision (micro) RF: %f" % precision_score(y, y_pred_rf_final, average='micro'),typ)
    print("Recall (micro) RF:    %f" % recall_score(y, y_pred_rf_final, average='micro'),typ)
    print("F1 score (micro) RF:  %f" % f1_score(y, y_pred_rf_final, average='micro'),typ, end='\n\n')
    print("Precision (macro) RF: %f" % precision_score(y, y_pred_rf_final, average='macro'),typ)
    print("Recall (macro) RF:    %f" % recall_score(y, y_pred_rf_final, average='macro'),typ)
    print("F1 score (macro) RF:  %f" % f1_score(y, y_pred_rf_final, average='macro'),typ, end='\n\n')
    print("Precision (weighted) RF: %f" % precision_score(y, y_pred_rf_final, average='weighted'),typ)
    print("Recall (weighted) RF:    %f" % recall_score(y, y_pred_rf_final, average='weighted'),typ)
    print("F1 score (weighted) RF:  %f" % f1_score(y, y_pred_rf_final, average='weighted'),typ)
    return [sgd, sgd_huber, rf]
# Prepare data
# ===============================================================================================

# Load dataset
inputData = np.loadtxt(open('DATASET.csv'), delimiter=",", skiprows=1, dtype='float')
# Atributes except 'Class' column
X = inputData[:, :-1]
# Class labels
y = inputData[:, -1]

# Standardize data
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

kf = KFold(n_splits=2)

# Split into train and test sets
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

# ===============================================================================================
# KNN
# ===============================================================================================

# Moment at which we start building the model
time_ini_knn = time()
# Apply 5NN
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
import os
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression, Ridge
from src.utils.CONSTANTS import CITY_ID_COL, NEIGHBORHOOD_ID_COL
from config import UNIFIED_FORMS_FILE, PROCESSED_DATA_DIR
from src.train.constants import TIME_COL, DATE_COL, PRED_COL, GT_COL


AGGREGATED_DIR = os.path.join(PROCESSED_DATA_DIR,'aggregated')

N_splits = 3
kfold = KFold(n_splits=N_splits, random_state=1, shuffle=True)

MINIMUM_PER_REGION = 50
x_agg_mode = {'mode': 'range', 'n_days': 3, 'min_per_region': 50}
y_agg_mode = {'mode': 'range', 'n_days': 2, 'min_per_region': 15}

city_type = 'city'
neighborhood_type = 'neighbor'

lower_cut_date = '2020-03-21'
upper_cut_date = '2020-04-05'

x_train_date = '2020-03-26'
y_train_date = '2020-03-30'

x_test_date = '2020-03-30'
y_test_date = '2020-04-03'

y_col_name = 'confirmed_cases'
save_map = False
Example #35
import warnings
warnings.filterwarnings('ignore')
import pandas as pd

# 1. Data

dataset = pd.read_csv('../data/csv/iris_sklearn.csv', header=0, index_col=0)
x = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1]

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=32)

kfold = KFold(n_splits=5, shuffle=True)

parameters = [{
    "C": [1, 10, 100, 1000],
    "kernel": ["linear"]
}, {
    "C": [1, 10, 100],
    "kernel": ["rbf"],
    "gamma": [0.001, 0.0001]
}, {
    "C": [1, 10, 100, 1000],
    "kernel": ["sigmoid"],
    "gamma": [0.001, 0.0001]
}]

# 2. Model configuration
def compare_estimators(estimators: list,
                       datasets,
                       metrics: list,
                       n_cv_folds=10,
                       decimals=3,
                       cellsize=22,
                       verbose=True):
    if type(estimators) != list:
        raise Exception(
            "First argument needs to be a list of tuples containing ('name', Estimator pairs)"
        )
    if type(metrics) != list:
        raise Exception(
            "Argument metrics needs to be a list of tuples containing ('name', scoring function pairs)"
        )

    mean_results = {d[0]: [] for d in datasets}
    std_results = {d[0]: [] for d in datasets}

    # loop over datasets
    for d in tqdm(datasets):
        if verbose:
            print("comparing on dataset", d[0])
        mean_result = []
        std_result = []
        X, y = get_dataset(d[1])

        # loop over estimators
        for (est_name, est) in estimators:
            mresults = [[] for i in range(len(metrics))]

            # loop over folds
            kf = KFold(n_splits=n_cv_folds)
            for train_idx, test_idx in kf.split(X):
                start = time.time()
                est.fit(X[train_idx, :], y[train_idx])
                y_pred = est.predict(X[test_idx, :])
                end = time.time()

                # loop over metrics
                for i, (met_name, met) in enumerate(metrics):
                    if met_name == 'Time':
                        mresults[i].append(end - start)
                    elif met_name == 'Complexity':
                        if est_name != 'MLPClassifier (sklearn)':
                            mresults[i].append(get_complexity(est))
                    else:
                        try:
                            mresults[i].append(met(y[test_idx], y_pred))
                        except:
                            mresults[i].append(
                                met(to_numeric(y[test_idx]),
                                    to_numeric(y_pred)))

            for i in range(len(mresults)):
                mean_result.append(np.mean(mresults[i]))
                std_result.append(np.std(mresults[i]) / n_cv_folds)

        mean_results[d[0]] = mean_result
        std_results[d[0]] = std_result

    return mean_results, std_results
def dcv_rgr(X, y, model, param_grid, niter):
    """
    Double cross validation (regression)

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        X training+test data

    y : array-like, shape = [n_samples]
        y training+test data

    model: 
        machine learning model (scikit-learn)

    param_grid : dict or list of dictionaries
        Dictionary with parameters names (string) as keys and lists of
        parameter settings to try as values, or a list of such dictionaries,
        in which case the grids spanned by each dictionary in the list are
        explored.

    niter : int
        number of DCV iteration

    Returns
    -------
    None
    """
    # parameters
    ns_in = 3  # n_splits for inner loop
    ns_ou = 3  # n_splits for outer loop
    scores = np.zeros((niter, 3))
    for iiter in range(niter):
        ypreds = np.array([])  # list of predicted y in outer loop
        ytests = np.array([])  # list of y_test in outer loop
        kf_ou = KFold(n_splits=ns_ou, shuffle=True)

        # [start] outer loop for test of the generalization error
        for train_index, test_index in kf_ou.split(X):
            X_train, X_test = X[train_index], X[test_index]  # inner loop CV
            y_train, y_test = y[train_index], y[test_index]  # outer loop

            # [start] inner loop CV for hyper parameter optimization
            kf_in = KFold(n_splits=ns_in, shuffle=True)
            gscv = GridSearchCV(model, param_grid, cv=kf_in)
            gscv.fit(X_train, y_train)
            # [end] inner loop CV for hyper parameter optimization

            # test of the generalization error
            ypred = gscv.predict(X_test)
            ypreds = np.append(ypreds, ypred)
            ytests = np.append(ytests, y_test)

        # [end] outer loop for test of the generalization error
        rmse = np.sqrt(mean_squared_error(ytests, ypreds))
        mae = mean_absolute_error(ytests, ypreds)
        r2 = r2_score(ytests, ypreds)
        #        print('DCV:RMSE, MAE, R^2 = {:.3f}, {:.3f}, {:.3f}'\
        #        .format(rmse, mae, r2))
        scores[iiter, :] = np.array([rmse, mae, r2])

    means, stds = np.mean(scores, axis=0), np.std(scores, axis=0)
    print()
    print('Double Cross Validation')
    print('In {:} iterations, average +/- standard deviation'.format(niter))
    print('RMSE DCV: {:.3f} (+/-{:.3f})'.format(means[0], stds[0]))
    print('MAE  DCV: {:.3f} (+/-{:.3f})'.format(means[1], stds[1]))
    print('R^2  DCV: {:.3f} (+/-{:.3f})'.format(means[2], stds[2]))
    print('DCV:RMSE, MAE, R^2 = {:.3f}, {:.3f}, {:.3f} (ave)'\
          .format(means[0], means[1], means[2]))
    print('DCV:RMSE, MAE, R^2 = {:.3f}, {:.3f}, {:.3f} (std)'\
          .format(stds[0], stds[1], stds[2]))
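
# A minimal, hypothetical usage sketch for dcv_rgr on synthetic data; the Ridge
# model and the alpha grid are assumptions, not part of the original snippet.
import numpy as np
from sklearn.linear_model import Ridge

rng = np.random.RandomState(0)
X_demo = rng.rand(60, 5)
y_demo = X_demo @ rng.rand(5) + 0.1 * rng.rand(60)

dcv_rgr(X_demo, y_demo, Ridge(), {"alpha": [0.01, 0.1, 1.0, 10.0]}, niter=3)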
Exemple #38
0
    def evaluate(self, ind, **kwargs):
        # ind.phenotype will be a string, including function definitions etc.
        # When we exec it, it will create a value XXX_output_XXX, but we exec
        # inside an empty dict for safety.

        p, d = ind.phenotype, {}

        genome, output, invalid, max_depth, nodes = ind.tree.get_tree_info(
            params['BNF_GRAMMAR'].non_terminals.keys(), [], [])
        Logger.log("Depth: {0}\tGenome: {1}".format(max_depth, genome))

        # Exec the phenotype.
        X_test, y_test = self.X_test, self.y_test
        image_size = X_test[0].shape
        flat_ind, kernel_size = NetworkProcessor.process_network(
            ind, image_size)
        Logger.log("Individual: {}".format(flat_ind))
        Logger.log("New kernel size: {}".format(kernel_size))

        new_conv_layers = []
        for i, k in enumerate(self.conv_layers):
            new_conv_layers.append((k[0], kernel_size[i], k[2], k[3], k[4]))

        train_loss = stats('mse')
        test_loss = stats('accuracy')
        kf = KFold(n_splits=params['CROSS_VALIDATION_SPLIT'])
        net = ClassificationNet(self.fcn_layers, new_conv_layers)
        fitness, fold = 0, 1

        Logger.log("Training Start: ")

        # Cross validation
        s_time = np.empty((kf.get_n_splits()))
        validation_acc = np.empty((kf.get_n_splits()))
        test_acc = np.empty((kf.get_n_splits()))
        for train_index, val_index in kf.split(self.X_train):
            X_train, X_val = self.X_train[train_index], self.X_train[val_index]
            y_train, y_val = self.y_train[train_index], self.y_train[val_index]
            data_train = DataIterator(X_train, y_train, params['BATCH_SIZE'])
            early_ckpt, early_stop, early_crit, epsilon = 20, [], params[
                'EARLY_STOP_FREQ'], params['EARLY_STOP_EPSILON']
            s_time[fold - 1] = time.time()

            # Train model
            net.model.reinitialize_params()
            for epoch in range(1, params['NUM_EPOCHS'] + 1):
                # mini-batch training
                for x, y in data_train:
                    net.train(epoch, x, y, train_loss)

                # log training loss
                if epoch % params['TRAIN_FREQ'] == 0:
                    Logger.log("Epoch {} Training loss (NLL): {:.6f}".format(
                        epoch, train_loss.getLoss('mse')))

                # log validation/test loss
                if epoch % params['VALIDATION_FREQ'] == 0:
                    net.test(X_val, y_val, test_loss)
                    Logger.log(
                        "Epoch {} Validation loss (NLL/Accuracy): {:.6f} {:.6f}"
                        .format(epoch, test_loss.getLoss('mse'),
                                test_loss.getLoss('accuracy')))
                    net.test(X_test, y_test, test_loss)
                    Logger.log(
                        "Epoch {} Test loss (NLL/Accuracy): {:.6f} {:.6f}".
                        format(epoch, test_loss.getLoss('mse'),
                               test_loss.getLoss('accuracy')))

                # check for early stop
                if epoch == early_ckpt:
                    accuracy = net.test(X_test,
                                        y_test,
                                        test_loss,
                                        print_confusion=True)
                    early_stop.append(accuracy)
                    if len(early_stop) > 3:
                        latest_acc = early_stop[-early_crit:]
                        latest_acc = np.subtract(latest_acc,
                                                 latest_acc[1:] + [0])
                        if (abs(latest_acc[:-1]) < epsilon).all():
                            Logger.log(
                                "Early stopping at epoch {} (latest {} ckpts): {}"
                                .format(
                                    epoch, early_crit, " ".join([
                                        "{:.4f}".format(x)
                                        for x in early_stop[-early_crit:]
                                    ])))
                            break
                    early_ckpt = min(early_ckpt + 300, early_ckpt * 2)

            # Validate model
            net.test(X_val, y_val, test_loss)
            validation_acc[fold - 1] = test_loss.getLoss('accuracy')
            Logger.log(
                "Cross Validation [Fold {}/{}] Validation (NLL/Accuracy): {:.6f} {:.6f}"
                .format(fold, kf.get_n_splits(), test_loss.getLoss('mse'),
                        test_loss.getLoss('accuracy')))

            # Test model
            net.test(X_test, y_test, test_loss)
            test_acc[fold - 1] = test_loss.getLoss('accuracy')
            Logger.log(
                "Cross Validation [Fold {}/{}] Test (NLL/Accuracy): {:.6f} {:.6f}"
                .format(fold, kf.get_n_splits(), test_loss.getLoss('mse'),
                        test_loss.getLoss('accuracy')))

            # Calculate time
            s_time[fold - 1] = time.time() - s_time[fold - 1]
            Logger.log(
                "Cross Validation [Fold {}/{}] Training Time (m / m per epoch): {:.3f} {:.3f}"
                .format(fold, kf.get_n_splits(), s_time[fold - 1] / 60,
                        s_time[fold - 1] / 60 / epoch))

            fold = fold + 1

        fitness = validation_acc.mean()

        for i in range(0, kf.get_n_splits()):
            Logger.log(
                "STAT -- Model[{}/{}] #{:.3f}m Validation / Generalization accuracy (%): {:.4f} {:.4f}"
                .format(i, kf.get_n_splits(), s_time[i] / 60,
                        validation_acc[i] * 100, test_acc[i] * 100))
        Logger.log(
            "STAT -- Mean Validation / Generatlization accuracy (%): {:.4f} {:.4f}"
            .format(validation_acc.mean() * 100,
                    test_acc.mean() * 100))
        # ind.net = net
        params['CURRENT_EVALUATION'] += 1
        return fitness
Exemple #39
0
        # pkl_file = open('data.pkl', 'rb')
        # [names, base, series, labels] = pickle.load(pkl_file)
        # pkl_file.close()

        # number of samples
        N_Samples = 3050
        base_dim = 19
        series_dims = [27 - rounds, 27 - rounds, 27 - rounds]

        accuracy_list, auc_list = [], []

        name_list = []
        score_list = []
        label_list = []

        # old sklearn.cross_validation KFold API rewritten for sklearn.model_selection.KFold
        kf = KFold(n_splits=5)
        for train, test in kf.split(base):
            # base-type variables
            x_train_base = base[train]
            x_test_base = base[test]

            # sequence-type variables
            x_train_series = [
                series[i][train] for i in range(len(series_dims))
            ]
            x_test_series = [
                series[i][test] for i in range(len(series_dims))
            ]

            # labels
            y_train = labels[train]
Exemple #40
0
sampler = RandomUnderSampler(random_state=42)
X, Y = sampler.fit_resample(X, Y)
print(np.sum(Y == 1), np.sum(Y == 0))

# Select the 5 best features
selector = SelectKBest(k=5)
selector.fit(X, Y)
mask = selector.get_support()
# Check which variables were selected
print(bank_df.drop('y', axis=1).columns)
print(mask)

# Cross-validation with KFold
# The first argument is the number of splits
# The second argument specifies shuffling of the dataset
kf = KFold(n_splits=18, shuffle=True)
scores = []

# Build a model and check its accuracy while rotating the train/test combinations.
for train_id, test_id in kf.split(X):
    # Extract the training data
    x = X[train_id]
    y = Y[train_id]
    # Create a decision tree classifier instance cif
    cif = tree.DecisionTreeClassifier()
    # Fit the decision tree on the training data,
    # using the default parameters as-is.
    cif.fit(x, y)
    # Apply the fitted model to the test data with predict to obtain outputs
    pred_y = cif.predict(X[test_id])
    # Use accuracy_score to compute the model's accuracy from predictions vs. true labels
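    # The original snippet is cut off here. A minimal, hypothetical completion of the
    # step described by the comment above (assuming accuracy_score has been imported
    # from sklearn.metrics) could be:
    score = accuracy_score(Y[test_id], pred_y)
    scores.append(score)
    # After the loop, np.mean(scores) would give the cross-validated accuracy.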
Exemple #41
0
''' Using apex for faster training
optimizer_list = []
for i in range(10):
    optimizer_list.append(AdamW(model.parameters(), lr=3e-5, correct_bias=False))

model = amp.initialize(model, opt_level="O2", verbosity=0)
''' 

''' Save origin state dict of Model and Optimizer'''
torch.save(model.state_dict(), 'origin_sd.pth')
origin_sd = torch.load('origin_sd.pth')


# Training with K-fold
new_data = data.sample(frac=1).reset_index(drop=True)
kf = KFold(2)
BATCH_SIZE = 7
EPOCH = 5
LEARNING_RATE = 2e-5


last_predict = []
i = 0
for train_idx, test_idx in tqdm(kf.split(new_data)):
    train_data = new_data.iloc[train_idx]
    test_data = new_data.iloc[test_idx]
    
    print(model.load_state_dict(origin_sd))
    
    ''' Get optimizer for each KFold
    optimizer = optimizer_list[i]
def validate():
    """
    run KFOLD method for regression 
    """
    #defining directories    
    dir_in = "/lustre/fs0/home/mtadesse/merraAllLagged"
    dir_out = "/lustre/fs0/home/mtadesse/merraLRValidation"
    surge_path = "/lustre/fs0/home/mtadesse/05_dmax_surge_georef"

    
    #cd to the lagged predictors directory
    os.chdir(dir_in)
    
    
    x = 475
    y = 476
    
    #empty dataframe for model validation
    df = pd.DataFrame(columns = ['tg', 'lon', 'lat', 'num_year', \
                                 'num_95pcs','corrn', 'rmse'])
    
    #looping through 
    for tg in range(x,y):
        
        os.chdir(dir_in)

        tg_name = os.listdir()[tg]
        print(tg, tg_name)
        
        ##########################################
        #check if this tg is already taken care of
        ##########################################
        os.chdir(dir_out)
        if os.path.isfile(tg_name):
            return "file already analyzed!"
        
        
        os.chdir(dir_in)

        #load predictor
        pred = pd.read_csv(tg_name)
        pred.drop('Unnamed: 0', axis = 1, inplace = True)
        
        #add squared and cubed wind terms (as in WPI model)
        pickTerms = lambda x: x.startswith('wnd')
        wndTerms = pred.columns[list(map(pickTerms, pred.columns))]
        wnd_sqr = pred[wndTerms]**2
        wnd_cbd = pred[wndTerms]**3
        pred = pd.concat([pred, wnd_sqr, wnd_cbd], axis = 1)

        #standardize predictor data
        dat = pred.iloc[:,1:]
        scaler = StandardScaler()
        print(scaler.fit(dat))
        dat_standardized = pd.DataFrame(scaler.transform(dat), \
                                        columns = dat.columns)
        pred_standardized = pd.concat([pred['date'], dat_standardized], axis = 1)
        
    
        #load surge data
        os.chdir(surge_path)
        surge = pd.read_csv(tg_name)
        surge.drop('Unnamed: 0', axis = 1, inplace = True)
        
        #remove duplicated surge rows
        surge.drop(surge[surge['ymd'].duplicated()].index, axis = 0, inplace = True)
        surge.reset_index(inplace = True)
        surge.drop('index', axis = 1, inplace = True)
        
        
        #adjust surge time format to match that of pred
        time_str = lambda x: str(datetime.strptime(x, '%Y-%m-%d'))
        surge_time = pd.DataFrame(list(map(time_str, surge['ymd'])), columns = ['date'])
        time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        surge_new = pd.concat([surge_time, surge[['surge', 'lon', 'lat']]], axis = 1)
    
        #merge predictors and surge to find common time frame
        pred_surge = pd.merge(pred_standardized, surge_new.iloc[:,:2], on='date', how='right')
        pred_surge.sort_values(by = 'date', inplace = True)
        
        #find rows that have nans and remove them
        row_nan = pred_surge[pred_surge.isna().any(axis =1)]
        pred_surge.drop(row_nan.index, axis = 0, inplace = True)
        pred_surge.reset_index(inplace = True)
        pred_surge.drop('index', axis = 1, inplace = True)
        
        
        #in case pred and surge don't overlap
        if pred_surge.shape[0] == 0:
            print('-'*80)
            print("Predictors and Surge don't overlap")
            print('-'*80)
            continue
        
     
        pred_surge['date'] = pd.DataFrame(list(map(time_stamp, \
                                                   pred_surge['date'])), \
                                          columns = ['date'])
        
        #prepare data for training/testing
        X = pred_surge.iloc[:,1:-1]
        y = pd.DataFrame(pred_surge['surge'])
        y = y.reset_index()
        y.drop(['index'], axis = 1, inplace = True)
        
        #apply PCA
        pca = PCA(.95)
        pca.fit(X)
        X_pca = pca.transform(X)
        
        #apply 10 fold cross validation
        kf = KFold(n_splits=10, shuffle=True, random_state=29)  # shuffle required for random_state to take effect
        
        metric_corr = []; metric_rmse = []; #combo = pd.DataFrame(columns = ['pred', 'obs'])
        for train_index, test_index in kf.split(X):
            X_train, X_test = X_pca[train_index], X_pca[test_index]
            y_train, y_test = y['surge'][train_index], y['surge'][test_index]
            
            #train regression model
            lm = LinearRegression()
            lm.fit(X_train, y_train)
            
            #predictions
            predictions = lm.predict(X_test)
            # pred_obs = pd.concat([pd.DataFrame(np.array(predictions)), \
            #                       pd.DataFrame(np.array(y_test))], \
            #                      axis = 1)
            # pred_obs.columns = ['pred', 'obs']
            # combo = pd.concat([combo, pred_obs], axis = 0)    
            
            #evaluation matrix - check p value
            if stats.pearsonr(y_test, predictions)[1] >= 0.05:
                print("insignificant correlation!")
                continue
            else:
                print(stats.pearsonr(y_test, predictions))
                metric_corr.append(stats.pearsonr(y_test, predictions)[0])
                print(np.sqrt(metrics.mean_squared_error(y_test, predictions)))
                metric_rmse.append(np.sqrt(metrics.mean_squared_error(y_test, predictions)))
            
        
        #number of years used to train/test model
        num_years = (pred_surge['date'][pred_surge.shape[0]-1] -\
                             pred_surge['date'][0]).days/365
        longitude = surge['lon'][0]
        latitude = surge['lat'][0]
        num_pc = X_pca.shape[1] #number of principal components
        corr = np.mean(metric_corr)
        rmse = np.mean(metric_rmse)
        
        print('num_year = ', num_years, ' num_pc = ', num_pc ,'avg_corr = ',np.mean(metric_corr), ' -  avg_rmse (m) = ', \
              np.mean(metric_rmse), '\n')
        
        #original size and pca size of matrix added
        new_df = pd.DataFrame([tg_name, longitude, latitude, num_years, num_pc, corr, rmse]).T
        new_df.columns = ['tg', 'lon', 'lat', 'num_year', \
                                 'num_95pcs','corrn', 'rmse']
        df = pd.concat([df, new_df], axis = 0)
        
        
        #save df as cs - in case of interruption
        os.chdir(dir_out)
        df.to_csv(tg_name)
        
        #cd to dir_in
        os.chdir(dir_in)
Exemple #43
0
# ### Classifier 1 : Decision Trees with Pruning
# Steps in the process:
# * Create the learning curves using the training data and cross-validation (unpruned and non-optimized)
# * Create validation curves on 2 hyper-parameters to find the best values (a sketch of this step follows the learning-curve cell below)
# * Recreate the learning curve with the chosen hyper-parameter values
# * After tuning the classifier with the two chosen hyper-parameters, use it to predict the results and collect metrics

# In[4]:

scorer = make_scorer(accuracy_score)

# In[5]:

print("Decision Tree: Create Learning Curves")
dtree_classifier = DecisionTreeClassifier()
cv = KFold(n_splits=5, shuffle=True)
dt_lc1_train_sizes, dt_lc1_train_scores, dt_lc1_validation_scores = learning_curve(
    dtree_classifier,
    X_train,
    Y_train,
    train_sizes=np.linspace(0.05, 1.0, 20),
    cv=cv,
    scoring=scorer,
    n_jobs=4)
print("Decision Tree: Done with learning curve")

# In[6]:

plot_learning_curve(
    dt_lc1_train_sizes, dt_lc1_train_scores, dt_lc1_validation_scores,
    "Figure 1.1.1: Decision Tree Learning Curve (Unpruned) \n (Census Income Data)"
Exemple #44
0
 def _splitter(self):
     return KFold(n_splits=self._folds.value,
                  shuffle=self._shuffle.value,
                  random_state=self._randomSeed.value).split
Exemple #45
0
cantidadDeParametros = data.shape[1]-1

# Labels
y = data[:,0]
x = []

print(y)
# print(data[0][1::])
for i in range(0, cantidadDeDatos):
  x.append(data[i][1:])

x = np.array(x)



kf = KFold(n_splits = 5, shuffle=True)

for i in range(1,11):
  print("=================== Medición con K = ",i)

  clf = KNeighborsClassifier(n_neighbors=i)

  accp = 0

  for train_index, test_index in kf.split(x):
    x_train = x[train_index, :]
    y_train = y[train_index]
    clf.fit(x_train, y_train)

    x_test = x[test_index, :]
    y_test = y[test_index]
Exemple #46
0
#X['modularity'] = dados['modularity']
#X['global_average_link_distance'] = dados['global_average_link_distance']
#X['eigenvector'] = dados['eigenvector']
X['coreness'] = dados['coreness']
X['transitivity'] = dados['transitivity']
#X['average_path_length'] = dados['average_path_length']
#X['eccentricity'] = dados['eccentricity']
#X['pagerank'] = dados['pagerank']
#X['grauMedio'] = dados['grauMedio']
X['links'] = dados['links']

Y = np.asarray(dados['flag'])
X = np.asarray(X)

kf = KFold(n_splits=10)  # splits the dataset into 10 parts: 9 for training, 1 for testing

a = 0
f = 0
p = 0
r = 0
i = 0

for train_index, test_index in kf.split(X):
    i += 1
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]

    clf = RandomForestClassifier(n_estimators=100,
                                 max_depth=2,
                                 random_state=0).fit(X_train, y_train)
dat_overlap_vars.hist()

dat_separate_vars = dat_separate_vars.dropna()

dat_separate_vars = dat_separate_vars[(np.abs(stats.zscore(dat_separate_vars))
                                       < float(std_dev)).all(axis=1)]
dat_separate_vars.hist()

# %% try some linear regression

X_sep = dat_separate_vars.drop('distance', axis=1)
y_sep = dat_separate_vars['distance']

model = LinearRegression()
scores_sep = []
kfold = KFold(n_splits=10, shuffle=True, random_state=123)
for i, (train, test) in enumerate(kfold.split(X_sep, y_sep)):
    model.fit(X_sep.iloc[train, :], y_sep.iloc[train])
    scores_sep.append(model.score(X_sep.iloc[test, :], y_sep.iloc[test]))
print(np.mean(scores_sep))

X_over = dat_overlap_vars.drop('distance', axis=1)
y_over = dat_overlap_vars['distance']
scores_over = []
for i, (train, test) in enumerate(kfold.split(X_over, y_over)):
    model.fit(X_over.iloc[train, :], y_over.iloc[train])
    scores_over.append(model.score(X_over.iloc[test, :], y_over.iloc[test]))
print(np.mean(scores_over))

# %% different methods of cross validation:
model_norm = LinearRegression(normalize=True)
Exemple #48
0
    def cross_val_score(X, y, model_creation_fun, save_dir, n_folds=4):

        kfold = KFold(n_splits=n_folds)
        fold_liks = np.empty(n_folds)

        for i, (cur_train_ind,
                cur_test_ind) in tqdm(enumerate(kfold.split(X, y))):

            cur_X = X[cur_train_ind]
            cur_y = y[cur_train_ind]

            gpf.reset_default_graph_and_session()

            model = model_creation_fun()

            model.fit(cur_X, cur_y)

            cur_save_dir = join(save_dir, f'fold_{i + 1}')
            os.makedirs(cur_save_dir, exist_ok=True)

            model.save_model(cur_save_dir)

            cur_test_x = X[cur_test_ind]
            cur_test_y = y[cur_test_ind]

            log_liks = model.calculate_log_likelihood(cur_test_x, cur_test_y)
            marg_pred = pd.DataFrame(
                model.predict_marginal_probabilities(cur_test_x))

            marg_pred.to_csv(join(cur_save_dir, 'marginal_probs.csv'))
            pd.DataFrame(cur_test_y).to_csv(join(cur_save_dir, 'y_t.csv'))

            # I am also interested in the log loss.
            y_t_df = pd.DataFrame(cur_test_y)
            neg_log_loss_results = multi_class_eval(marg_pred, y_t_df,
                                                    neg_log_loss_with_labels,
                                                    'log_lik')

            neg_log_loss_results.to_csv(
                join(cur_save_dir, 'marginal_species_log_lik.csv'))

            pd.Series(neg_log_loss_results.mean()).to_csv(
                join(cur_save_dir, 'neg_log_loss_mean.csv'))

            fold_liks[i] = np.mean(log_liks)

            np.savez(join(cur_save_dir, 'cv_results'),
                     site_log_liks=log_liks,
                     cur_train_X=cur_X,
                     cur_train_y=cur_y,
                     cur_test_X=cur_test_x,
                     cur_test_y=cur_test_y,
                     train_ind=cur_train_ind,
                     test_ind=cur_test_ind)

        pd.Series({
            'mean_lik': np.mean(fold_liks)
        }).to_csv(join(save_dir, 'mean_lik.csv'))

        pd.Series(fold_liks, index=[f'fold_{i+1}' for i in range(n_folds)
                                    ]).to_csv(join(save_dir, 'fold_liks.csv'))

        return np.mean(fold_liks), np.std(fold_liks) / np.sqrt(len(fold_liks))
def dcv_clf(X, y, model, param_grid, niter):
    """
    Double cross validation (classification)

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        X training+test data

    y : array-like, shape = [n_samples]
        y training+test data

    model: estimator object.
        This is assumed to implement the scikit-learn estimator interface.

    param_grid : dict or list of dictionaries
        Dictionary with parameters names (string) as keys and lists of
        parameter settings to try as values, or a list of such dictionaries,
        in which case the grids spanned by each dictionary in the list are
        explored.

    niter : int
        number of DCV iteration

    Returns
    -------
    None
    """
    # parameters
    ns_in = 3  # n_splits for inner loop
    ns_ou = 3  # n_splits for outer loop
    scores = np.zeros((niter, 5))
    for iiter in range(niter):
        ypreds = np.array([])  # list of predicted y in outer loop
        ytests = np.array([])  # list of y_test in outer loop
        kf_ou = KFold(n_splits=ns_ou, shuffle=True)

        # [start] outer loop for test of the generalization error
        for train_index, test_index in kf_ou.split(X):
            X_train, X_test = X[train_index], X[test_index]  # inner loop CV
            y_train, y_test = y[train_index], y[test_index]  # outer loop

            # [start] inner loop CV for hyper parameter optimization
            kf_in = KFold(n_splits=ns_in, shuffle=True)
            gscv = GridSearchCV(model, param_grid, cv=kf_in)
            gscv.fit(X_train, y_train)
            # [end] inner loop CV for hyper parameter optimization

            # test of the generalization error
            ypred = gscv.predict(X_test)
            ypreds = np.append(ypreds, ypred)
            ytests = np.append(ytests, y_test)

        # [end] outer loop for test of the generalization error
        tn, fp, fn, tp = confusion_matrix(ytests, ypreds).ravel()
        acc = accuracy_score(ytests, ypreds)
        scores[iiter, :] = np.array([tp, fp, fn, tn, acc])

    means, stds = np.mean(scores, axis=0), np.std(scores, axis=0)
    print()
    print('Double Cross Validation')
    print('In {:} iterations, average +/- standard deviation'.format(niter))
    print('TP   DCV: {:.3f} (+/-{:.3f})'.format(means[0], stds[0]))
    print('FP   DCV: {:.3f} (+/-{:.3f})'.format(means[1], stds[1]))
    print('FN   DCV: {:.3f} (+/-{:.3f})'.format(means[2], stds[2]))
    print('TN   DCV: {:.3f} (+/-{:.3f})'.format(means[3], stds[3]))
    print('Acc. DCV: {:.3f} (+/-{:.3f})'.format(means[4], stds[4]))
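
# A minimal, hypothetical usage sketch for dcv_clf on a synthetic binary problem;
# the linear SVC and the C grid are assumptions, not part of the original snippet.
import numpy as np
from sklearn.svm import SVC

rng = np.random.RandomState(0)
X_demo = rng.rand(80, 4)
y_demo = (X_demo[:, 0] + X_demo[:, 1] > 1.0).astype(int)

dcv_clf(X_demo, y_demo, SVC(kernel="linear"), {"C": [0.1, 1, 10]}, niter=3)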
#    month_dummies = pd.get_dummies(train_test_features['month'], prefix='month', prefix_sep='_')  
#    if 'phase' in select_list:
#        phase_dummies = pd.get_dummies(train_test_features['phase'], prefix='phase', prefix_sep='_')
#        n_X_train_test_mer = pd.concat([n_X_train_test_pd, chargemode_dummies, hour_dummies, week_dummies, month_dummies,phase_dummies], axis=1)
#        n_X_train_test_mer.drop(['charge_mode', 'hour', 'week', 'month', 'phase'], axis=1, inplace=True)
#    else:
#        n_X_train_test_mer = pd.concat([n_X_train_test_pd, chargemode_dummies, hour_dummies, week_dummies, month_dummies], axis=1)
#        n_X_train_test_mer.drop(['charge_mode', 'hour', 'week', 'month'], axis=1, inplace=True)
    
    n_testB = n_X_train_test_mer.tail(selected_testB_features.shape[0])
    n_X_train = n_X_train_test_mer.drop(n_testB.index.tolist())
    return n_X_train, n_y_train, n_testB, y_scaler

    
ram_num = 5
kfolds = KFold(n_splits=10, shuffle=True, random_state=ram_num)
def cv_rmse(model, train, y_train):  
    rmse = np.sqrt(-cross_val_score(model, train, y_train, scoring="neg_mean_squared_error", cv = kfolds))
    return(rmse)
 
    
def ridge_selector(k, X, y):
    model = make_pipeline(RidgeCV(alphas = [k], cv=kfolds)).fit(X, y) 
    rmse = cv_rmse(model, X, y).mean()
    return(rmse)
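
# A minimal, hypothetical sketch of scanning candidate regularisation strengths with
# ridge_selector; the alpha grid and the n_X_train / n_y_train names are assumptions.
alphas = [0.05, 0.1, 0.5, 1, 5, 10, 50]
ridge_scores = {alpha: ridge_selector(alpha, n_X_train, n_y_train) for alpha in alphas}
best_alpha = min(ridge_scores, key=ridge_scores.get)
print("best ridge alpha:", best_alpha, "CV RMSE:", ridge_scores[best_alpha])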
    
    
def lasso_selector(k, X, y):  
    model = make_pipeline(LassoCV(max_iter=1e7, alphas = [k], 
                                 cv = kfolds)).fit(X, y) 
    rmse = cv_rmse(model, X, y).mean()
Exemple #51
0

if __name__ == '__main__':

    train_samples = pd.read_csv('train.csv')['fname'].values
    # f_tr, f_val = train_test_split(train_samples, test_size=0.1)

    import os
    from torch.utils.data import DataLoader
    from sklearn.model_selection import train_test_split, KFold

    with ignore(OSError):
        os.mkdir('checkpoints/naive')
    save_paths = [f'naive/resnet50_r{i:2d}' for i in range(10)]
    round_id = 0
    for ix_tr, ix_val in KFold(n_splits=10).split(train_samples):
        f_tr, f_val = train_samples[ix_tr], train_samples[ix_val]
        with timer('load data'):
            train_loader = DataLoader(DSet(f_tr),
                                      batch_size=128,
                                      shuffle=True,
                                      **kwargs)
            val_loader = DataLoader(DSet(f_val), batch_size=128, **kwargs)

        train(build_resnet50(), train_loader, val_loader, 300,
              save_paths[round_id])
        round_id += 1

    with timer('load test data'):
        sub = pd.read_csv('sample_submission.csv')
        test_loader = DataLoader(DSet(sub['fname'].values, 'test'),
num_folds = 10
scoring = "neg_mean_squared_error"
seed = 51
# Spot Check Algorithms
models = []
models.append(('LR', LinearRegression()))
models.append(('LASSO', Lasso()))
models.append(('EN', ElasticNet()))
models.append(('KNN', KNeighborsRegressor()))
models.append(('CART', DecisionTreeRegressor()))
models.append(('SVR', SVR()))

results = []
names = []
for name, model in models:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)  # shuffle required for random_state to take effect
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(),   cv_results.std())
    print(msg)


# In[16]:


#fig = plt.figure()
#fig.suptitle('Algorithm Comparison')
#ax = fig.add_subplot(111)
#plt.boxplot(results)
#ax.set_xticklabels(names)
        if k == 1:
            all_X = X
        else:
            all_X = np.hstack((all_X, X))
        print('...............................................................................')

    # output the spectrum profile
    np.savetxt(featurename + 'Feature1.txt', all_X)

    # prediction based on spectrum profile
    print('###############################################################################')
    print('The prediction based on ' + featurename + ', beginning')
    tic = time.perf_counter()  # time.clock() was removed in Python 3.8

    clf = XGBClassifier(learning_rate=0.05, n_estimators=20, max_depth=4, objective='binary:logistic')
    folds = KFold(n_splits=10, shuffle=True, random_state=1)
    getshapvalue(all_X, y, clf)
    auc_score, accuracy, sensitivity, specificity, MCC = getCrossValidation(all_X, y, clf, folds)

    print('results for feature:' + featurename)
    print('****AUC score:%.3f, accuracy:%.3f, sensitivity:%.3f, specificity:%.3f, MCC:%.3f****' % (
        auc_score, accuracy, sensitivity, specificity, MCC))

    toc = time.perf_counter()
    print('The prediction time: %.3f minutes' % ((toc - tic) / 60.0))
    print('###############################################################################\n')

    # output result
    results = DataFrame({'Feature': [featurename], \
                         'AUC': [auc_score], \
                         'ACC': [accuracy], \
Exemple #54
0
style.use("ggplot")
from sklearn import svm
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, f1_score, matthews_corrcoef

#loading breast cancer data from UCI ML Repo
url = "https://goo.gl/AP7kzV"
raw_data = urllib.request.urlopen(url)
dataset = np.genfromtxt(raw_data, delimiter=",")

#features are first 9, and classification is the 10th
X = dataset[:,0:10]
y = dataset[:,10]

#splits data into 5 chunks for N fold cross validation
k_fold = KFold(n_splits=5)
#below chunk is to test which are testing and training
#for train_indices, test_indices in k_fold.split(X):
    #print('Train: %s | test: %s' % (train_indices, test_indices))

svc = svm.SVC(C=1, kernel='linear')

#to change all nan values to 0
X[(np.isnan(X))] = 0

#loop to do 5 fold cross validation, stores scores in array
# scores = [svc.fit(X[train], y[train]).score(X[test], y[test])
#     for train, test in k_fold.split(X)]
# print(scores)
i = 1
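
# The commented-out list comprehension above sketches the scoring loop; a minimal,
# hypothetical explicit version (picking up the fold counter i initialised above):
scores = []
for train_indices, test_indices in k_fold.split(X):
    svc.fit(X[train_indices], y[train_indices])
    fold_score = svc.score(X[test_indices], y[test_indices])
    scores.append(fold_score)
    print("Fold %d accuracy: %.3f" % (i, fold_score))
    i += 1
print("Mean accuracy: %.3f" % np.mean(scores))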
Exemple #55
0
y_all = np.concatenate((y_train, y_dev))

#X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

X_all = np.concatenate((X_train, X_dev))
y_all = np.concatenate((y_train, y_dev))

seed = 7
numpy.random.seed(seed)
idx = np.random.permutation(len(X_all))
X_all = X_all[idx]
y_all = y_all[idx]
from sklearn.model_selection import KFold

#test_fold = [-1]*len(X_train)+[1]*len(X_dev)
kf = KFold(n_splits=10)

# create model

# define the grid search parameters

conv_layers = [3]
conv_units = [256]
lr = [5e-4]
drop = [0.3]
i = 0
for t, v in kf.split(X_all):
    i += 1
    X_train, y_train, X_dev, y_dev = X_all[t], y_all[t], X_all[v], y_all[v]
    #X_train_em,X_dev_em=X_all_em[t],X_all_em[v]
    model = create_model(conv_layers=4, conv_units=256, lr=5e-4, drop=0.3)
    print("Size of x:",len(x)," Size of y:",len(radical)," Positive : ",radicalOne)
    X = []
    for t in x:
        t = re.sub(r'[^\w\s]',' ',t)
        t = ' '.join([word for word in t.split() if word != " "])
        t = t.lower()
        t = ' '.join([word for word in t.split() if word not in cachedStopWords])
        X.append(t)


with timer("making Tokeniser"):
    print("Type of X:",type(X)) 
    Features = X
    Radical = radical

    kf = KFold(n_splits=10)
    iteration = 0
    gRadicalAccu = 0
    gPrecision = [0,0]
    gRecall = [0,0]
    gFScore = [0,0]

    vocabSize = len(allEnglishWords)
    tokenizer = Tokenizer(num_words= vocabSize)
    tokenised = tokenizer.fit_on_texts(allEnglishWords)


    gPositivePredRadical = 0


with timer("Cross Validation"):
 def __init__(self, n_splits=2, shuffle=False):
     self.n_splits = n_splits
     if self.n_splits > 1:
         self.k_fold = KFold(n_splits=n_splits, shuffle=shuffle)
Exemple #58
0
        return None, loss, hold_dict, out_prob

    global_step = tf.Variable(0.0, trainable=False, name="global_step")
    train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(
        loss, global_step)

    tf.summary.scalar("loss", loss)

    return train_op, loss, hold_dict, out_prob


if __name__ == "__main__":
    train_x, train_y, test_x, test_id = get_data()
    train_y = train_y[:, None]
    train_x_std, test_x_std, _ = scale_data(train_x, test_x)
    k_fold = KFold(n_splits=5, shuffle=True, random_state=seed)

    tf.set_random_seed(seed)

    with tf.Graph().as_default():
        with tf.name_scope("train"):
            with tf.variable_scope("dnn", reuse=None):
                train_op, train_loss, train_holder, train_res = dnn_network(
                    keep_prob=keep_prob, is_training=True)
        with tf.name_scope("test"):
            with tf.variable_scope("dnn", reuse=True):
                _, test_loss, test_holder, test_res = dnn_network(
                    keep_prob=keep_prob, is_training=False)

        saver = tf.train.Saver()
        summary = tf.summary.merge_all()
Exemple #59
0
    if img.shape[0] == 1:
        img = img[0][0]
    if img.shape[0] == 3:
        img = np.moveaxis(img, 0, 2)
    X_images_.append(img)
    shape = (img.shape[1], img.shape[0])
    mask = enc2mask(data.loc[data.id == data.id[i], "encoding"].values[0], shape)
    Masks_.append(mask)
    image_dims_.append(img.shape)
    print(img.shape)

del img, mask, shape
gc.collect()

indexes = [i for i in range(15)]
kf = KFold(n_splits=5, shuffle=True, random_state=2021)
sum_masks = []
k = 0
for fold, (train_index, val_index_) in enumerate(kf.split(indexes)):
    print('Train fold ', fold, 'val indexes = ', val_index_)
    with open(f"../{model_name}/{model_name}.log", 'a+') as logger:
        logger.write(f'fold {fold} val index {val_index_}\n')
    masks = train_fold(val_index_, X_images_, Masks_, image_dims_, fold, train=train, predict=predict)
    if len(sum_masks) == 0:
        sum_masks = masks
    else:
        for i in range(len(sum_masks)):
            if predict:
                sum_masks[i] = sum_masks[i] + masks[i]
    k += 1
    del masks
Exemple #60
0
def rbf_svc_fs98(finC_x, finC_y, finT_x, finT_y):

    dataC_x_train, dataC_x_test, dataC_y_train, dataC_y_test = train_test_split(
        finC_x, finC_y, test_size=0.1)

    dataT_x_train, dataT_x_test, dataT_y_train, dataT_y_test = train_test_split(
        finT_x, finT_y, test_size=0.1)

    estimator_c1 = LinearSVC()
    selector_c1 = RFE(estimator_c1, 10000, step=0.1)
    new_x_c1 = selector_c1.fit_transform(dataC_x_train,
                                         np.ravel(dataC_y_train))

    estimator_t1 = LinearSVC()
    selector_t1 = RFE(estimator_t1, 10000, step=0.1)
    new_x_t1 = selector_t1.fit_transform(dataT_x_train,
                                         np.ravel(dataT_y_train))

    new_x = pd.concat([pd.DataFrame(new_x_c1), pd.DataFrame(new_x_t1)], axis=1)

    best_acc = []

    K = 10
    kf = KFold(n_splits=K)

    for num in features_num:

        print('selected num of features: ', num)

        estimator_c = LinearSVC()
        selector_c = RFE(estimator_c, num, step=0.1)
        new_x_c = selector_c.fit_transform(new_x_c1, np.ravel(dataC_y_train))

        estimator_t = LinearSVC()
        selector_t = RFE(estimator_t, num, step=0.1)
        new_x_t = selector_t.fit_transform(new_x_t1, np.ravel(dataT_y_train))

        estimator_ = LinearSVC()
        selector_ = RFE(estimator_, num, step=0.1)
        new_x_ = selector_.fit_transform(new_x, np.ravel(dataT_y_train))

        new_x = pd.concat([
            pd.DataFrame(new_x_),
            pd.concat([pd.DataFrame(new_x_c),
                       pd.DataFrame(new_x_t)], axis=1)
        ],
                          axis=1)

        cv_accur = 0
        cv_sd = 0

        accur_total = 0
        accur_list = []

        for train_index, test_index in kf.split(new_x):
            data_x_train, data_x_test = new_x.values[
                train_index], new_x.values[test_index]
            data_y_train, data_y_test = finC_y.values[
                train_index], finC_y.values[test_index]
            data_y_train = np.ravel(data_y_train)
            data_y_test = np.ravel(data_y_test)

            accur = np.zeros(num_costs)

            for i in range(num_costs):
                model = SVC(gamma=cost_range[i], kernel='rbf')
                model.fit(data_x_train, data_y_train)
                pred = model.predict(data_x_test)
                accur[i] = accuracy_score(data_y_test, pred)

            accur_total += np.max(accur)
            accur_list.append(np.max(accur))

        cv_accur = accur_total / K
        cv_sd = np.std(accur_list)

        print('Accuracy = ', cv_accur, 'std = ', cv_sd)

        best_acc.append(cv_accur)  # collect the CV accuracy for this feature count

    return best_acc