Example #1
    def objective(params):
        params['num_leaves'] = int(params['num_leaves'])
        params['bagging_freq'] = int(params['bagging_freq'])
        params['max_depth'] = int(params['max_depth'])
        skf = cross_validation.StratifiedKFold(
            y_train,           # samples to split into K folds
            n_folds=5,         # number of folds; must be at least 2
            shuffle=True,      # shuffle each stratification before splitting into batches
            random_state=423   # pseudo-random number generator state used for shuffling
        )
        boost_rounds = []
        score = []

        for train, test in skf:
            _train_x, _test_x, _train_y, _test_y = \
                x_train.iloc[train], x_train.iloc[test], y_train[train], y_train[test]

            train_lgb = lgb.Dataset(np.array(_train_x), np.array(_train_y))
            test_lgb = lgb.Dataset(np.array(_test_x),
                                   np.array(_test_y),
                                   reference=train_lgb)

            model = lgb.train(params,
                              train_lgb,
                              num_boost_round=10000,
                              valid_sets=test_lgb,
                              early_stopping_rounds=300)

            boost_rounds.append(model.best_iteration)
            score.append(model.best_score)
            #score.append(-verify_accuracy(binary_predict(model.predict(_test_x), 0.5), _test_y))

        # print('nb_trees={} val_loss={}'.format(boost_rounds, score))
        # print(len(score))
        mean_score = np.mean(
            [list(score[k]['valid_0'].values())[0] for k in range(len(score))])
        #mean_score = np.mean(score)

        # print('average of best iteration:', np.average(boost_rounds))
        return {'loss': mean_score, 'status': STATUS_OK}
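The objective above only defines the cross-validated loss. Below is a minimal driver sketch, assuming hyperopt (whose STATUS_OK constant the return value already uses) and that objective, x_train and y_train are accessible in the calling scope; the search-space ranges are illustrative, not taken from the original code.

from hyperopt import Trials, fmin, hp, tpe

# Illustrative search space; quniform keeps the integer-valued parameters that
# objective() casts with int() in a sensible range.
space = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'learning_rate': hp.loguniform('learning_rate', -5, -1),
    'num_leaves': hp.quniform('num_leaves', 16, 256, 1),
    'max_depth': hp.quniform('max_depth', 3, 12, 1),
    'bagging_freq': hp.quniform('bagging_freq', 1, 10, 1),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.5, 1.0),
}

trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest,
            max_evals=50, trials=trials)
print(best)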
Example #2
def test_kfold_can_detect_dependent_samples_on_digits():  # see #2372
    # The digits samples are dependent: they are apparently grouped by authors
    # although we don't have any information on the group segment locations
    # for this data. We can highlight this fact by computing k-fold cross-
    # validation with and without shuffling: we observe that the shuffling case
    # wrongly makes the IID assumption and is therefore too optimistic: it
    # estimates a much higher accuracy (around 0.96) than the non-shuffling
    # variant (around 0.86).

    digits = load_digits()
    X, y = digits.data[:800], digits.target[:800]
    model = SVC(C=10, gamma=0.005)
    n = len(y)

    cv = cval.KFold(n, 5, shuffle=False)
    mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
    assert_greater(0.88, mean_score)
    assert_greater(mean_score, 0.85)

    # Shuffling the data artificially breaks the dependency and hides the
    # overfitting of the model with regards to the writing style of the authors
    # by yielding a seriously overestimated score:

    cv = cval.KFold(n, 5, shuffle=True, random_state=0)
    mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
    assert_greater(mean_score, 0.95)

    cv = cval.KFold(n, 5, shuffle=True, random_state=1)
    mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
    assert_greater(mean_score, 0.95)

    # Similarly, StratifiedKFold should try to shuffle the data as little
    # as possible (while respecting the balanced class constraints)
    # and thus be able to detect the dependency by not overestimating
    # the CV score either. As the digits dataset is approximately balanced
    # the estimated mean score is close to the score measured with
    # non-shuffled KFold

    cv = cval.StratifiedKFold(y, 5)
    mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
    assert_greater(0.88, mean_score)
    assert_greater(mean_score, 0.85)
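The same shuffled-versus-unshuffled comparison can be written against the current scikit-learn API; a minimal sketch assuming scikit-learn >= 0.18, where the cross_validation module used above was replaced by model_selection:

from sklearn.datasets import load_digits
from sklearn.model_selection import KFold, cross_val_score
from sklearn.svm import SVC

digits = load_digits()
X, y = digits.data[:800], digits.target[:800]
model = SVC(C=10, gamma=0.005)

# Without shuffling, folds follow the author-grouped sample order.
unshuffled = cross_val_score(model, X, y, cv=KFold(n_splits=5, shuffle=False)).mean()
# Shuffling breaks the dependency and is expected to yield an optimistic estimate.
shuffled = cross_val_score(model, X, y, cv=KFold(n_splits=5, shuffle=True, random_state=0)).mean()
print(unshuffled, shuffled)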
Example #3
def sample_fold_indices(table, folds=10, stratified=False, random_state=None):
    """
    :param Orange.data.Table table:
    :param int folds: Number of folds
    :param bool stratified: Return stratified indices (if applicable).
    :param Random random_state:
    :returns: A tuple of index arrays, one for each fold.
    """
    n = len(table)
    if stratified and is_discrete(table.domain.class_var):
        # XXX: StratifiedKFold does not support random_state
        ind = cross_validation.StratifiedKFold(
            table.Y.ravel(), folds,  # random_state=random_state
        )
    else:
        ind = cross_validation.KFold(
            n, folds, shuffle=True, random_state=random_state
        )

    return tuple(ind)
Example #4
def tune_parameters(data, labels):
    """
    Tune the parameters using exhaustive grid search
    """
    # set cv here, why not
    cv = cross_validation.StratifiedKFold(labels, n_folds=5, shuffle=True)

    pipeline = Pipeline([('normaliser', preprocessing.Normalizer()),
                         ('svm', SVC(kernel='poly', gamma=1,
                                     cache_size=1000))])

    # can test multiple kernels as well if desired
    #param_grid = [{'kernel': 'poly', 'coef0': [1, 5, 10, 20], 'degree': [2, 3, 4, 5, 10]}]
    param_grid = [{'svm__coef0': [1, 2, 3, 4, 5], 'svm__degree': [2, 3, 4, 5]}]
    clf = GridSearchCV(pipeline, param_grid, n_jobs=-1, cv=cv)
    clf.fit(data, labels)

    print 'best parameters found:'
    print clf.best_estimator_
    return clf.best_estimator_
Example #5
def main():
    X, Y = utils.read_data("../files/train_10.csv")
    n_target = len(set(Y))
    Y = map(int, Y)
    folds = 5
    stf = cross_validation.StratifiedKFold(Y, folds)
    loss = []
    accs = []
    classMap = sorted(list(set(Y)))
    X, Y = np.array(X), np.array(Y)
    print "Testing..."
    for i, (train, test) in enumerate(stf):
        X_train, X_test, y_train, y_test = X[train], X[test], Y[train], Y[test]
        probs = [[0.001 for x in range(n_target)] for y in range(len(y_test))]
        loss.append(utils.logloss(probs, y_test, classMap))
        accs.append(utils.accuracy([1] * len(y_test), y_test))
        print "Accuracy(Fold {0}): ".format(i) + str(accs[len(accs) - 1])
        print "Loss(Fold {0}): ".format(i) + str(loss[len(loss) - 1])
    print "Mean Accuracy: " + str(np.mean(accs))
    print "Mean Loss: " + str(np.mean(loss))
Example #6
    def __init__(self, dataframe, base_cv=None, **cv_kwargs):
        # We create a copy of the dataframe with a new last level
        # index which is an enumeration of the rows (like proper indices)
        self.all_segments = pd.DataFrame({
            'Preictal': dataframe['Preictal'],
            'i': np.arange(len(dataframe))
        })
        self.all_segments.set_index('i', append=True, inplace=True)

        # Now create a series with only the segments as rows. This is what we will pass into the wrapped cross
        # validation generator
        self.segments = self.all_segments['Preictal'].groupby(
            level='segment').first()
        self.segments.sort(inplace=True)

        if base_cv is None:
            self.cv = cross_validation.StratifiedKFold(self.segments,
                                                       **cv_kwargs)
        else:
            self.cv = base_cv(self.segments, **cv_kwargs)
Example #7
def train_classifier(predictors,
                     response,
                     feature_names=relevant_feature_names,
                     tuned_clf=Clf.LINEAR_SVC,
                     param_grid=None,
                     test_size=0.5,
                     scoring=weighted_f1,
                     random_state=0):
    param_grid = param_grid or default_param_grid(tuned_clf)
    kf_cv = cross_validation.StratifiedKFold(response,
                                             n_folds=10,
                                             shuffle=True,
                                             random_state=random_state)
    cv_clf = GridSearchCV(estimator=tuned_clf,
                          param_grid=param_grid,
                          cv=kf_cv,
                          scoring=scoring)
    cv_clf.fit(predictors, response)

    return cv_clf
Example #8
    def tune_parameters(data, labels):
        """
        Tune the parameters using exhaustive grid search
        """
        # set cv here, why not
        cv = cross_validation.StratifiedKFold(labels, n_folds=5, shuffle=True)

        pipeline = Pipeline([('normaliser', preprocessing.Normalizer()),
                             ('svm', SVC(kernel='poly', cache_size=1000))])

        param_grid = [{'svm__coef0': [1, 2, 3, 4, 5], 'svm__degree': [2, 3, 4, 5], 'svm__C': [1, 2],
                       'svm__gamma': [0, 1]}]

        print 'tuning params'
        clf = GridSearchCV(pipeline, param_grid, n_jobs=-1, cv=cv)
        clf.fit(data, labels)

        print 'best parameters found:'
        print clf.best_estimator_
        return clf.best_estimator_
Example #9
def cv_run(rd, X, y):
    print "X:", X.shape, "y:", y.shape
    n_cv = 16
    #cv1 = cross_validation.KFold(len(y), n_folds=n_cv, random_state=random_state)
    cv1 = cross_validation.StratifiedKFold(y, n_folds=n_cv)
    scores = cross_validation.cross_val_score(
        rd,
        X,
        y,
        cv=cv1,
        scoring='roc_auc',
        #scoring=make_scorer(roc_auc_score),
        n_jobs=-1,
        verbose=1)
    print "scores:", scores
    print "%d Fold CV Score: %.6f +- %.4f" % (
        n_cv,
        np.mean(scores),
        2 * np.std(scores),
    )
Example #10
def get_comb_models(traindata, targets, crossval=True):
    # traindata: list with NumExamples * NumOutputs(=10) array with length 'no. preprocessors'
    # reshape to NumExamples * [NumPreprocessors * NumOutputs]
    traindata = np.array(traindata).transpose((1, 0, 2))
    traindata = np.reshape(traindata, [traindata.shape[0], -1])

    # needs to be not one-hot
    targets = targets.argmax(axis=1)

    models = [
        linear_model.LogisticRegression(penalty='l1',
                                        dual=False,
                                        C=5.,
                                        fit_intercept=False),
        linear_model.LogisticRegression(penalty='l2',
                                        dual=False,
                                        C=10.,
                                        fit_intercept=False),
        linear_model.LogisticRegression(penalty='l2',
                                        dual=False,
                                        C=20.,
                                        fit_intercept=True)
    ]

    if crossval:
        # use StratifiedKFold, because survived 0/1 is not evenly distributed
        cv = cross_validation.StratifiedKFold(targets, n_folds=5)
        scores = [0] * len(models)

    for ii in range(len(models)):
        if crossval:
            # get scores
            scores[ii] = cross_validation.cross_val_score(models[ii], traindata, targets, \
                        cv=cv, n_jobs=1, scoring='accuracy')
            print "Cross-validation accuracy on the training set for model %d:" % ii
            print "%0.3f (+/-%0.03f)" % (scores[ii].mean(),
                                         scores[ii].std() / 2)
        else:
            models[ii].fit(traindata, targets)

    return models
Example #11
def repeated_cross_fold_validation(models, n=10, k=5):
    """ Run cross validation on a set of models n times

    All models are tested using the same cross validation splits
    at each iteration.

    Args:
        models: List of dictionaries, each containing a model and its
            training or testing data.
        n: number of iterations to repeat cross validation (default 10)
        k: number of folds to use at each iteration (default 5)
    Returns:
        A dict mapping each model's name to a ROCAnalysisScorer object that
        accumulates results across all n * k folds.
    """

    scorers = {}

    for i in range(n):
        # create a new cross validation set for each iteration & test.
        skf = cross_validation.StratifiedKFold(models[0]['train_data'][1],
                                               n_folds=k)

        for model in models:
            model_name = model['name']
            if model_name not in scorers:
                scorers[model_name] = ROCAnalysisScorer()

            results = score_pipeline(model, cv=skf)

            # for each model, collect the results into a single scorer.
            # note: no averaging is done at this stage. The results of each
            # of the k folds are collected into a single k * n list for
            # the model.
            scorers[model_name].f1scores_ += results[0].f1scores_
            scorers[model_name].f2scores_ += results[0].f2scores_
            scorers[model_name].fhalf_scores_ += results[0].fhalf_scores_
            scorers[model_name].rates_ += results[0].rates_
            scorers[model_name].aucs_ += results[0].aucs_

    return scorers
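A hypothetical driver for the function above. Only the 'name' and 'train_data' keys are actually read by repeated_cross_fold_validation; the 'estimator' key is a guess at what score_pipeline needs, and score_pipeline / ROCAnalysisScorer are assumed to be importable from the surrounding project.

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

X, y = np.random.randn(200, 10), np.random.randint(0, 2, 200)
models = [
    {'name': 'logreg', 'estimator': LogisticRegression(), 'train_data': (X, y)},
    {'name': 'forest', 'estimator': RandomForestClassifier(), 'train_data': (X, y)},
]
scorers = repeated_cross_fold_validation(models, n=10, k=5)
for name, scorer in scorers.items():
    # aucs_ accumulates one AUC per fold per iteration (see the loop above)
    print(name, np.mean(scorer.aucs_))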
Example #12
def do_xgb_MOE(num_points_to_sample, X_train, y_train, verbose=True, **kwargs):
    # Finding best XGBoost parameters using MOE
    xgb_parameters = {}
    # Ranges of the XGBoost parameters that are optimized
    exp_xgb = Experiment([
        [0.1, 1],    # learning_rate_range = [0.1, 1]
        [0.002, 1],  # n_estimators_range = [2, 1000], normalized
        [0.01, 1],   # max_depth_range = [1, 100], normalized
    ])

    n_folds = 10
    cv_folds = cross_validation.StratifiedKFold(y_train, n_folds=n_folds)

    best_point = []
    best_point_value = 0.
    for _ in range(num_points_to_sample):
        # Use MOE to determine what is the point with highest Expected Improvement to use next
        next_point_to_sample = gp_next_points(
            exp_xgb, rest_host='localhost', rest_port=6543,
            **kwargs)[0]  # By default we only ask for one point

        # Sample the point from objective function
        xgb_parameters['learning_rate'] = next_point_to_sample[0]
        xgb_parameters['n_estimators'] = int(
            round(next_point_to_sample[1] * 1000))
        xgb_parameters['max_depth'] = int(round(next_point_to_sample[2] * 100))
        acc_cv, prec_cv, rec_cv, cm_cv, cm_full_cv = xgboost_cross_validation(
            X_train, y_train, xgb_parameters, cv_folds)
        value_of_next_point = acc_cv
        if value_of_next_point > best_point_value:
            best_point_value = value_of_next_point
            best_point = next_point_to_sample
        if verbose:
            print "Sampled f({0:s}) = {1:.18E}".format(
                str(next_point_to_sample), value_of_next_point)
        # Add the information about the point to the experiment historical data to inform the GP
        exp_xgb.historical_data.append_sample_points(
            [SamplePoint(next_point_to_sample, -value_of_next_point,
                         0.0001)])  # We can add some noise
    best_point[1] = int(round(best_point[1] * 1000))
    best_point[2] = int(round(best_point[2] * 100))
    return best_point, best_point_value
Example #13
def compute_auc(gram_matrix, data, k=10, C=1.0):

    kv = cross_validation.StratifiedKFold(data, n_folds=k)
    s = 0.0

    for train_index, test_index in kv:

        gm_train = gram_matrix[train_index, :]
        gm_train = gm_train[:, train_index]
        data_train = data[train_index]

        # libSVM wants the distances from test instances to all train instances as input
        # see http://stackoverflow.com/questions/10978261/libsvm-precomputed-kernels
        gm_test = gram_matrix[test_index, :]
        gm_test = gm_test[:, train_index] #!
        data_test = data[test_index]

        # Have to use libsvm directly here, because of a bug in sklearn with precomputed gram matrices
        x = []
        for i in range(len(gm_train)):
            l = gm_train[i].tolist()
            l.insert(0, i + 1)
            x.append(l)

        prob = svmutil.svm_problem(data_train.tolist(), x, isKernel=True)
        param = svmutil.svm_parameter("-t 4 -c %.410f -q" % C)
        m = svmutil.svm_train(prob, param)

        xx = []
        for i in range(len(gm_test)):
            t = gm_test[i].tolist()
            t.insert(0, i + 1)
            xx.append(t)

        p_label, p_acc, p_val = svmutil.svm_predict(data_test.tolist(), xx, m)

        fpr, tpr, thresholds = roc_curve(data_test, p_val, pos_label=1.0)
        AUC = roc_auc_score(data_test, p_val)
        s += AUC

    return s / k
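A minimal call sketch for compute_auc; the RBF gram matrix built with scikit-learn's pairwise_kernels, the synthetic +/-1 labels, and the parameter values are assumptions, and svmutil, roc_curve and roc_auc_score must be importable as in the snippet above.

import numpy as np
from sklearn.metrics.pairwise import pairwise_kernels

X = np.random.randn(200, 16)
labels = np.where(np.random.rand(200) > 0.5, 1.0, -1.0)     # +/-1 class labels
gram_matrix = pairwise_kernels(X, metric='rbf', gamma=0.1)  # n x n precomputed kernel

mean_auc = compute_auc(gram_matrix, labels, k=10, C=1.0)
print(mean_auc)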
Example #14
def generate_model(data, classes, args):

    # Define the parameters
    tuned_parameters = {'C': C_RANGE, 'class_weight': CLASS_WEIGHTS}

    # Define the classifier
    clf = linear_model.LogisticRegression(max_iter=SCORE_MAX_ITER,
                                          n_jobs=args.cores)

    print_verbose("Classifier: %s" % str(clf), 5)
    print_verbose("Parameters: %s" % str(tuned_parameters), 5)

    # Generate the K-fold development
    skf = cross_validation.StratifiedKFold(classes,
                                           n_folds=SCORE_K_FOLD,
                                           shuffle=True)
    print_verbose("KFold: %s" % str(skf), 5)

    gscv = grid_search.GridSearchCV(clf,
                                    tuned_parameters,
                                    cv=skf,
                                    scoring='mean_squared_error',
                                    n_jobs=1,
                                    verbose=get_verbose_level())

    # Search
    print_verbose("GridSearch: %s" % str(gscv), 5)
    gscv.fit(data, classes)

    # Print scores
    print_verbose("GridSearch scores:", 5)
    for params, mean_score, scores in gscv.grid_scores_:
        print_verbose(
            "%0.6f (+/-%0.06f) for %r" %
            (mean_score, scores.std() / 2, params), 5)

    # Print best score
    print_verbose("GridSearch best score:", 0)
    print_verbose("%0.6f for %r" % (gscv.best_score_, gscv.best_params_), 0)

    return gscv
Example #15
def getBestThreshold(features, labels_pooled, labels_current):
    print("length of pooled and current", len(labels_pooled),
          len(labels_current))
    maxent = LogisticRegression(penalty='l1')
    scores = {"F1": [], "Recall": [], "Accuracy": [], "Precision": []}
    thresholds = []

    print('Finding best thresholds...')
    fold = 1
    #    for TrainIndices, TestIndices in cross_validation.StratifiedKFold(labels_pooled, n_folds=2, shuffle=False, random_state=None):
    for TrainIndices, TestIndices in cross_validation.StratifiedKFold(
            labels_pooled, n_folds=10, shuffle=False, random_state=None):
        #    for TrainIndices, TestIndices in cross_validation.KFold(n=features.shape[0], n_folds=10, shuffle=False, random_state=None):
        print('\r' + str(fold), end="")
        fold += 1
        TrainX_i = features[TrainIndices]
        Trainy_i = labels_pooled[TrainIndices]

        TestX_i = features[TestIndices]
        Testy_i = labels_current[TestIndices]

        maxent.fit(TrainX_i, Trainy_i)
        #get prediction
        thresh_i, ypred_i, score = optimize_threshold(maxent, TestX_i, Testy_i)
        thresholds.append(thresh_i)

        scores["F1"].append(score[0])
        scores["Recall"].append(score[1])
        scores["Accuracy"].append(score[2])
        scores["Precision"].append(score[3])

    #scores = cross_validation.cross_val_score(maxent, features, labels, cv=10)
    print("\n--")

    for key in sorted(scores.keys()):
        currentmetric = np.array(scores[key])
        print("%s : %0.2f (+/- %0.2f)" %
              (key, currentmetric.mean(), currentmetric.std()))
    print("--")

    return maxent, np.array(thresholds)
Example #16
def getBestThreshold(X, y_current_tr, y_current_te, regularization='l2'):
    assert len(X) == len(y_current_tr) == len(
        y_current_te
    ), 'Number of features ({}), annotator1 labels ({}) and annotator2 labels ({}) is not equal!'.format(
        len(X), len(y_current_tr), len(y_current_te))
    maxent = LogisticRegression(penalty=regularization)
    scores = {"F1": [], "Recall": [], "Accuracy": [], "Precision": []}
    thresholds = []

    print('Finding best thresholds...')
    fold = 1
    for TrainIndices, TestIndices in cross_validation.StratifiedKFold(
            y_current_tr, n_folds=10, shuffle=False, random_state=None):
        print('\r' + str(fold), end="")
        fold += 1
        TrainX_i = X[TrainIndices]
        Trainy_i = y_current_tr[TrainIndices]

        TestX_i = X[TestIndices]
        Testy_i = y_current_te[TestIndices]

        maxent.fit(TrainX_i, Trainy_i)
        #get prediction
        thresh_i, ypred_i, score = optimize_threshold(maxent, TestX_i, Testy_i)
        thresholds.append(thresh_i)

        scores["F1"].append(score[0])
        scores["Recall"].append(score[1])
        scores["Accuracy"].append(score[2])
        scores["Precision"].append(score[3])

    #scores = cross_validation.cross_val_score(maxent, features, labels, cv=10)
    print("\n--")

    for key in sorted(scores.keys()):
        currentmetric = np.array(scores[key])
        print("%s : %0.2f (+/- %0.2f)" %
              (key, currentmetric.mean(), currentmetric.std()))
    print("--")

    return maxent, np.array(thresholds)
Example #17
def classify_ads(Xy):
    classifier = BernoulliNB()
    cv = cross_validation.StratifiedKFold(Xy[1], 2)
    precision = []
    recall = []

    for train, test in cv:
        X_train = Xy[0][train]
        X_test = Xy[0][test]
        y_train = Xy[1][train]
        y_test = Xy[1][test]
        classifier.fit(X_train, y_train)
        y_hat = classifier.predict(X_test)
        p, r, _, _ = metrics.precision_recall_fscore_support(y_test, y_hat)
        precision.append(p[1])
        recall.append(r[1])

    print classifier
    print 'precision:', np.average(precision), '+/-', np.std(precision)
    print 'recall:', np.average(recall), '+/-', np.std(recall)
    return classifier
Example #18
    def _calculate(self, X, y, categorical, metafeatures, helpers):
        import sklearn.naive_bayes

        if len(y.shape) == 1 or y.shape[1] == 1:
            kf = cross_validation.StratifiedKFold(y, n_folds=10)
        else:
            kf = cross_validation.KFold(y.shape[0], n_folds=10)

        accuracy = 0.
        for train, test in kf:
            nb = sklearn.naive_bayes.GaussianNB()

            if len(y.shape) == 1 or y.shape[1] == 1:
                nb.fit(X[train], y[train])
            else:
                nb = OneVsRestClassifier(nb)
                nb.fit(X[train], y[train])

            predictions = nb.predict(X[test])
            accuracy += sklearn.metrics.accuracy_score(predictions, y[test])
        return accuracy / 10
Example #19
File: stacker.py Project: omdv/brew
    def fit_layer(self, layer_idx, X, y):
        if layer_idx >= len(self.layers):
            return
        elif layer_idx == len(self.layers) - 1:
            self.layers[layer_idx].fit(X, y)
        else:
            n_classes = len(set(y)) - 1
            n_classifiers = len(self.layers[layer_idx])
            output = np.zeros((X.shape[0], n_classes * n_classifiers))
            skf = cross_validation.StratifiedKFold(y, self.cv)
            for tra, tst in skf:
                self.layers[layer_idx].fit(X[tra], y[tra])
                out = self.layers[layer_idx].output(X[tst], mode=self.mode)
                if self.mode in ['probs', 'votes']:
                    output[tst, :] = out[:, 1:, :].reshape(
                        out.shape[0], (out.shape[1] - 1) * out.shape[2])
                elif self.mode in ['labels']:
                    output[tst, :] = out

            self.layers[layer_idx].fit(X, y)
            self.fit_layer(layer_idx + 1, output, y)
Example #20
def grid_search(estimator,
                data,
                featTypes=('BoW', ),
                nFolds=10,
                random_seed=44,
                param_grid=()):

    labels = [x.severity for x in data]

    generatePrimaryFeats(data, featTypes)

    featurized = []
    for d in data:
        instance = {}
        for featname, values in d.feats.items():
            # Give each feature a unique name to avoid overwriting features.
            # If e.g. a concept feature has the same name as a bow word, the old code
            # would overwrite one of the features.
            instance.update(
                {"{0}-{1}".format(featname, k): v
                 for k, v in values.items()})

        featurized.append(instance)

    d = DictVectorizer()
    x_train = d.fit_transform(featurized)

    folds = cross_validation.StratifiedKFold(labels,
                                             n_folds=nFolds,
                                             shuffle=True,
                                             random_state=random_seed)
    grid = GridSearchCV(estimator,
                        param_grid=param_grid,
                        scoring="f1",
                        n_jobs=-1,
                        cv=folds)
    fit_grid = grid.fit(x_train, labels)

    print(fit_grid.best_params_)
    return fit_grid.best_params_
Example #21
def svm_dummy_comparison(inputfile):
    x, y, labels = load_csv_svm(inputfile)
    x_scaled = preprocessing.scale(x)

    if USE_PCA:
        pca = PCA(n_components=PCA_COMPONENTS)
        x = pca.fit_transform(x_scaled)
        print(pca.explained_variance_ratio_)
    else:
        x = x_scaled

    visual_svm_clf = svm.SVC(gamma=GAMMA,
                             C=C,
                             class_weight=WEIGHT,
                             kernel=KERNEL,
                             cache_size=400)  # gamma=.01, C=.01,
    dummy_svm_clf = DummyClassifier(
        strategy='most_frequent',
        random_state=0)  # most_frequent, uniform, stratified

    cv = cross_validation.StratifiedKFold(y, 30)
    #cv = cross_validation.LeaveOneOut(len(y))
    metric = 'f1'  # accuracy, precision, recall, f1

    visual_scores = cross_validation.cross_val_score(visual_svm_clf,
                                                     x,
                                                     y,
                                                     cv=cv,
                                                     scoring=metric)
    dummy_scores = cross_validation.cross_val_score(dummy_svm_clf,
                                                    x,
                                                    y,
                                                    cv=cv,
                                                    scoring=metric)

    print(metric)
    # print('real_scores: {0}'.format(visual_scores))
    print('avg_real: {0}'.format(np.mean(visual_scores)))
    # print('dummy_scores: {0}'.format(dummy_scores))
    print('avg_dumb: {0}'.format(np.mean(dummy_scores)))
Example #22
def NoveltyDetectionFolds(folder,
                          n_folds=2,
                          trgt=None,
                          dev=False,
                          verbose=False):
    if n_folds < 2:
        print 'Invalid number of folds'
        return -1

    if not dev:
        file_name = '%s/%i_folds_cross_validation.jbl' % (folder, n_folds)
    else:
        file_name = '%s/%i_folds_cross_validation_dev.jbl' % (folder, n_folds)

    if not os.path.exists(file_name):
        if verbose:
            print "Creating %s" % (file_name)

        if trgt is None:
            print 'Invalid trgt'
            return -1

        CVO = {}
        for inovelty, novelty_class in enumerate(np.unique(trgt)):
            process_trgt = trgt[trgt != novelty_class]
            CVO[inovelty] = cross_validation.StratifiedKFold(
                process_trgt, n_folds)
            CVO[inovelty] = list(CVO[inovelty])
        if verbose:
            print 'Saving in %s' % (file_name)

        joblib.dump([CVO], file_name, compress=9)

    else:
        if verbose:
            print "Reading from %s" % (file_name)

        [CVO] = joblib.load(file_name)

    return CVO
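A hypothetical call for the function above, using synthetic labels; it assumes the cache folder already exists and that joblib and os are imported as in the surrounding module.

import numpy as np

trgt = np.repeat([0, 1, 2], 20)   # 60 synthetic samples, 3 classes
CVO = NoveltyDetectionFolds('./folds_cache', n_folds=2, trgt=trgt,
                            dev=True, verbose=True)
# CVO[inovelty] is a list of (train, test) index pairs over the samples that
# remain after the novelty class has been removed
for inovelty in sorted(CVO.keys()):
    print inovelty, len(CVO[inovelty])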
Example #23
def trainAndEvaluateANN(features, labels, connRate, hidNodes, error):
    """
		Train and evaluate a neural network on the given features
		with the given attributes. 3-fold cross-validation is used
		on each run, the average accuracy, precision, recall, and fmeasure
		of all three folds is returned. 
	"""

    # Create 3-fold cross validation indices
    skf = cross_validation.StratifiedKFold(labels)
    binary = LabelBinarizer()

    accuracySum = 0
    totalResults = []
    totalTargets = []

    # For each k-fold split
    for trainIndex, testIndex in skf:
        # Get data split
        featuresTrain, featuresTest = features[trainIndex], features[testIndex]
        labelsTrain, labelsTest = labels[trainIndex], labels[testIndex]

        # Train the neural network
        ann = trainANN(featuresTrain, labelsTrain, connRate, hidNodes, error,
                       binary)

        # Evaluate ANN on test data
        accuracy, outputLabels = evaluateANN(featuresTest, labelsTest, ann,
                                             binary)

        accuracySum += accuracy

        # Store the results / targets for larger analysis
        totalResults.extend(outputLabels.tolist())
        totalTargets.extend(labelsTest.tolist())

    # Generate performance report
    report = classification_report(totalTargets, totalResults)

    return (accuracySum / 3.0, report)
Example #24
def compute_cross_correlation_score(df,
                                    clfs,
                                    preprocess_scaling=True,
                                    nFold=10):
    """
    Run stratified cross-validation and score each classifier in clfs on every fold.
    :param df:
    :param clfs:
    :param preprocess_scaling:
    :param nFold:
    :return: (scores, classification_results) accumulated over all folds and classifiers
    """

    to_sklearn_features = DataFrameMapper([
        ('features', sklearn.feature_extraction.DictVectorizer())
    ])

    data_X = to_sklearn_features.fit_transform(df)
    data_Y = df.expected_class

    skf = cross_validation.StratifiedKFold(data_Y, n_folds=nFold)
    classification_results = []
    scores = []
    for num, (train_index, test_index) in enumerate(skf):
        X_train, X_test = data_X[train_index], data_X[test_index]
        Y_train, Y_test = data_Y[train_index], data_Y[test_index]
        print("Len train{}, Len test{}".format(Y_train.size, Y_test.size))
        cross_valid_data = Cross_validation_split(X_train, X_test, Y_train,
                                                  Y_test)
        cross_valid_data = preprocess(cross_valid_data,
                                      preprocess_scaling=preprocess_scaling,
                                      preprocess_correlation=False)

        for clf in clfs:
            score, classification = generate_score(clf,
                                                   cross_valid_data,
                                                   fold=num)
            scores.append(score)
            classification_results.append(classification)
    return scores, classification_results
Example #25
def eval_dag(dag, filename, dag_id=None):

    dag = normalize_dag(dag)
    # utils.draw_dag(dag)
    # pprint.pprint(dag)

    if filename not in input_cache:
        input_cache[filename] = pd.read_csv('data/' + filename, sep=';')

    data = input_cache[filename]

    feats = data[data.columns[:-1]]
    targets = data[data.columns[-1]]

    le = preprocessing.LabelEncoder()

    ix = targets.index
    targets = pd.Series(le.fit_transform(targets), index=ix)

    errors = []

    start_time = time.time()

    for train_idx, test_idx in cross_validation.StratifiedKFold(targets,
                                                                n_folds=5):
        train_data = (feats.iloc[train_idx], targets.iloc[train_idx])
        test_data = (feats.iloc[test_idx], targets.iloc[test_idx])

        ms = train_dag(dag, train_data)
        preds = test_dag(dag, ms, test_data)

        acc = mm.quadratic_weighted_kappa(test_data[1], preds)
        if filename == 'ml-prove.csv':
            acc = metrics.accuracy_score(test_data[1], preds)
        errors.append(acc)

    m_errors = float(np.mean(errors))
    s_errors = float(np.std(errors))

    return m_errors, s_errors, time.time() - start_time
Example #26
def error_analysis_for_labeling(instances,
                                X,
                                y,
                                folds,
                                data_folder,
                                clf=svm.LinearSVC(C=0.01)):
    cv = cross_validation.StratifiedKFold(y, n_folds=folds, random_state=0)
    for i, (train, test) in enumerate(cv):
        model = clf.fit(X[train], y[train])
        y_pred = model.predict(X[test])
        scores = model.decision_function(X[test])
        #scores = model.predict_proba(X[test])[:,1]
        #precision, recall, thresholds = precision_recall_curve(y[test], scores)
        #print thresholds.shape[0]
        #for i in range(thresholds.shape[0]):
        #    print "Threshold: %f, Precision: %f, Recall: %f" %(thresholds[i], precision[i], recall[i])

        print("\nROC score on Test Data")
        print roc_auc_score(y[test], scores)
        do_error_analysis(y[test], y_pred, scores, test, instances)
        #relabel(y[test], y_pred, scores, test, instances, data_folder)
        print "\n" * 5
Example #27
def Classifier(filename):

    print 'Loading data...'
    id, data, target = readTrainData(filename)

    print 'Total Examples', data.shape[0], 'Dummy percentage', 1 - target.mean()

    accuracy = []

    kf = cross_validation.StratifiedKFold(target, 5)

    print 'Training and Testing...'

    for train, test in kf:
        dataTrain, dataTest, targetTrain, targetTest = \
            data[train], data[test], target[train], target[test]

        idTest = id[test]

        clf = BlendedClassifiers()

        clf.fit(dataTrain, targetTrain)

        probs = clf.predict_proba(dataTest)

        metric = PAtK(probs, targetTest, idTest)

        accuracy.append(metric)

        # print clf.predict_proba(dataTest)
        print 'P@K:', metric

    mean = np.mean(accuracy)
    ci = 1.96 * (np.std(accuracy) / np.sqrt(5))

    print 'Mean P@K', mean, 'CI  95%', mean - ci, '-', mean + ci

    return accuracy
Example #28
def lr_crossv_getC(trainx, trainy, Carr=[0.1, 1.0, 10.0, 100.0], seed=0):
    ''' Get an appropriate C value for the LR. 
    Carr is the array of C values to test. '''
    
    # Get stratified k folds
    skf = cross_validation.StratifiedKFold(trainy, n_folds=10)

    # Cross-validate for the best C
    best_c = 0
    best_score = 0
    for this_c in Carr:
        lr_est = LogisticRegression(penalty='l1', class_weight='auto', C=this_c, random_state=seed)
        scores = cross_validation.cross_val_score(lr_est, trainx, y=trainy, \
                    scoring='f1', cv=skf)
        # If this this_c scored, on average, better than the best C value so far, update best_c
        this_score = scores.mean()
        print 'This score and C: ', this_score, this_c
        if this_score > best_score:
            best_score = this_score
            best_c = this_c

    return best_c
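A short usage sketch for the helper above; the synthetic data is only there to illustrate the call, and the final refit mirrors the estimator settings used inside lr_crossv_getC.

import numpy as np
from sklearn.linear_model import LogisticRegression

trainx = np.random.randn(300, 20)
trainy = np.random.randint(0, 2, 300)

best_c = lr_crossv_getC(trainx, trainy, Carr=[0.1, 1.0, 10.0, 100.0], seed=0)
final_model = LogisticRegression(penalty='l1', class_weight='auto',
                                 C=best_c, random_state=0).fit(trainx, trainy)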
Example #29
def test(texts, classes, models, nn_params, folds=4):
    '''
        Check the performance on an SVM implementation,
        given a list of texts and their classes (negative/neutral/positive)
        Uses k-fold cross-validation (keeping in mind to divide the data
        appropriately, depending on the class)
    '''
    classes = np.array(classes)
    texts = np.array(texts)

    wrongs = []
    auc_sum = 0

    for train, test in cross_validation.StratifiedKFold(classes, folds):
        texts_train = texts[train]
        classes_train = classes[train]
        texts_test = texts[test]
        classes_test = classes[test]
        n = Ensemble(texts_train, classes_train, nn_params, models)
        predictions = n.classify(texts_test)
        predictions[predictions < 0] = 0

        auc = calculate_auc(classes_test, predictions)
        print auc
        auc_sum += auc

        for i in range(len(texts_test)):
            if abs(classes_test[i] - predictions[i]) > 0.5:
                wrongs.append((classes_test[i], predictions[i], texts_test[i]))

    '''
    import csv
    writer = open('wrongs.csv', 'w')
    for w in wrongs:
        writer.write('%s,%s,%s\n' % w)
    writer.close()
    '''

    return auc_sum / folds
Example #30
def kfold(tracks, feature_names, folds=5, shuffle=True, **kwargs):
    labels = [track['label'] for track in tracks]
    kf = cross_validation.StratifiedKFold(labels,
                                          n_folds=folds,
                                          shuffle=shuffle)
    for train, test in kf:
        train_tracks = [tracks[i] for i in train]
        test_tracks = [tracks[i] for i in test]
        clf = machine_learning.Classifier(**kwargs)
        clf = machine_learning.train_tracks(clf, train_tracks, feature_names)
        predicted_all = []
        Y_test_all = []
        for track in test_tracks:
            X_test, Y_test = machine_learning.shape_features([track],
                                                             feature_names)
            predicted = machine_learning.predict(X_test, clf)
            track['sample_predictions'] = predicted
            track['prediction'], track['predictions'] = util.most_common(
                predicted)
            predicted_all.extend(predicted)
            Y_test_all.extend(Y_test)
        yield test_tracks
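Hypothetical consumption of the generator above; 'tracks' and 'feature_names' are assumed to be defined as in the rest of the project, with each track dict carrying a 'label' key.

n_correct = 0
n_total = 0
for test_tracks in kfold(tracks, feature_names, folds=5, shuffle=True):
    for track in test_tracks:
        n_total += 1
        if track['prediction'] == track['label']:
            n_correct += 1
print('track-level accuracy: %.3f' % (n_correct / float(n_total)))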