コード例 #1
0
ファイル: utils.py プロジェクト: ChristianGeng/skll
def make_sparse_data(use_feature_hashing=False):
    """
    Build a train/test pair of sparse SKLL feature sets.

    Both sets use the feature names ``f1`` .. ``f5``.  In the training
    set ``f1`` and ``f5`` are always zero; in the test set only ``f4``
    is always zero.

    Parameters
    ----------
    use_feature_hashing : bool, optional
        If true, a single ``FeatureHasher(n_features=4)`` is shared by
        both feature sets; otherwise no vectorizer is passed.

    Returns
    -------
    tuple
        ``(train_fs, test_fs)``
    """
    names = ['f{}'.format(idx) for idx in range(1, 6)]

    # one hasher instance (or None) is handed to both feature sets
    if use_feature_hashing:
        vectorizer = FeatureHasher(n_features=4)
    else:
        vectorizer = None

    # ---- training portion: 500 examples with 3 generated features ----
    train_X, train_y = make_classification(n_samples=500, n_features=3,
                                           n_informative=3, n_redundant=0,
                                           n_classes=2,
                                           random_state=1234567890)

    # values must be non-negative (the original author notes that
    # naive bayes is used on this data later) and never exactly zero
    train_X = np.abs(train_X)
    train_X[train_X == 0] += 1

    train_ids = ['EXAMPLE_{}'.format(idx) for idx in range(1, 501)]

    # pad each row with a zero on both ends so f1 and f5 are always 0
    train_features = [dict(zip(names, [0] + values.tolist() + [0]))
                      for values in train_X]

    train_fs = FeatureSet('train_sparse', train_ids,
                          features=train_features, labels=train_y,
                          vectorizer=vectorizer)

    # ---- test portion: 100 examples with 4 generated features ----
    test_X, test_y = make_classification(n_samples=100, n_features=4,
                                         n_informative=4, n_redundant=0,
                                         n_classes=2,
                                         random_state=1234567890)
    test_X = np.abs(test_X)
    test_X[test_X == 0] += 1

    test_ids = ['EXAMPLE_{}'.format(idx) for idx in range(1, 101)]

    # splice a zero in at the fourth position so f4 is always 0
    test_features = []
    for values in test_X:
        values = values.tolist()
        test_features.append(dict(zip(names,
                                      values[:3] + [0] + values[3:])))

    test_fs = FeatureSet('test_sparse', test_ids,
                         features=test_features, labels=test_y,
                         vectorizer=vectorizer)

    return train_fs, test_fs
コード例 #2
0
def test_select_kbest_classif():
    """
    Check k-best univariate selection on a simple classification problem.

    ``SelectKBest`` and ``GenericUnivariateSelect`` in ``k_best`` mode
    must produce the same reduced matrix, and the selected support must
    be exactly the first five columns.
    """
    n_columns, n_kept = 20, 5
    data, target = make_classification(n_samples=200,
                                       n_features=n_columns,
                                       n_informative=3,
                                       n_redundant=2,
                                       n_repeated=0,
                                       n_classes=8,
                                       n_clusters_per_class=1,
                                       flip_y=0.0,
                                       class_sep=10,
                                       shuffle=False,
                                       random_state=0)

    selector = SelectKBest(f_classif, k=n_kept)
    reduced = selector.fit(data, target).transform(data)

    generic = GenericUnivariateSelect(f_classif, mode='k_best',
                                      param=n_kept)
    reduced_generic = generic.fit(data, target).transform(data)
    assert_array_equal(reduced, reduced_generic)

    # expected mask: ones for the leading columns, zeros elsewhere
    expected_support = np.zeros(n_columns)
    expected_support[:n_kept] = 1
    assert_array_equal(selector.get_support(), expected_support)
コード例 #3
0
def test_select_percentile_classif_sparse():
    """
    Check percentile-based univariate selection on sparse input.

    ``SelectPercentile`` and ``GenericUnivariateSelect`` in
    ``percentile`` mode must agree, the first five columns must be
    selected, and ``inverse_transform`` must return a sparse matrix of
    the original shape whose only non-zeros are in the selected columns.
    """
    dense, target = make_classification(n_samples=200,
                                        n_features=20,
                                        n_informative=3,
                                        n_redundant=2,
                                        n_repeated=0,
                                        n_classes=8,
                                        n_clusters_per_class=1,
                                        flip_y=0.0,
                                        class_sep=10,
                                        shuffle=False,
                                        random_state=0)
    data = sparse.csr_matrix(dense)

    selector = SelectPercentile(f_classif, percentile=25)
    reduced = selector.fit(data, target).transform(data)
    generic = GenericUnivariateSelect(f_classif, mode='percentile',
                                      param=25)
    reduced_generic = generic.fit(data, target).transform(data)
    assert_array_equal(reduced.toarray(), reduced_generic.toarray())

    support = selector.get_support()
    expected_support = np.zeros(20)
    expected_support[:5] = 1
    assert_array_equal(support, expected_support)

    # round-trip back to the full feature space
    restored = selector.inverse_transform(reduced_generic)
    assert_true(sparse.issparse(restored))
    mask = safe_mask(restored, support)
    assert_equal(restored.shape, data.shape)
    assert_array_equal(restored[:, mask].toarray(), reduced.toarray())
    # same number of stored non-zeros means the other columns are empty
    assert_equal(restored.getnnz(), reduced.getnnz())
コード例 #4
0
def test_linearsvc_parameters():
    """
    Exercise every loss/penalty/dual combination of LinearSVC.

    Combinations listed as unsupported must raise a ``ValueError`` whose
    message names the offending arguments; every other combination must
    fit without error.
    """
    loss_options = ['hinge', 'squared_hinge', 'logistic_regression', 'foo']
    penalty_options = ['l1', 'l2', 'bar']
    dual_options = [True, False]

    X, y = make_classification(n_samples=5, n_features=5)

    def expect_failure(loss, penalty, dual):
        # combinations this test treats as unsupported
        if loss == 'foo' or penalty == 'bar':
            return True
        if (loss, penalty) == ('hinge', 'l1'):
            return True
        if (loss, penalty, dual) == ('hinge', 'l2', False):
            return True
        return (penalty, dual) == ('l1', True)

    combos = itertools.product(loss_options, penalty_options, dual_options)
    for loss, penalty, dual in combos:
        clf = svm.LinearSVC(penalty=penalty, loss=loss, dual=dual)
        if not expect_failure(loss, penalty, dual):
            clf.fit(X, y)
            continue

        pattern = ("Unsupported set of arguments.*penalty='%s.*"
                   "loss='%s.*dual=%s" % (penalty, loss, dual))
        assert_raises_regexp(ValueError, pattern, clf.fit, X, y)

    # an invalid loss value must produce an explicit error message
    bad_loss_clf = svm.LinearSVC(loss="l3")
    assert_raises_regexp(ValueError, ".*loss='l3' is not supported.*",
                         bad_loss_clf.fit, X, y)
コード例 #5
0
def test_grid_search_precomputed_kernel_error_kernel_function():
    """Grid search must raise ValueError for an SVC built on a callable kernel."""
    data, target = make_classification(n_samples=200, n_features=100,
                                       random_state=0)

    def linear_kernel(left, right):
        # plain linear kernel: inner products between rows
        return np.dot(left, right.T)

    search = GridSearchCV(SVC(kernel=linear_kernel), {'C': [0.1, 1.0]})
    assert_raises(ValueError, search.fit, data, target)
コード例 #6
0
def test_f_classif_multi_class():
    """
    Sanity-check the F test on a simulated multi-class problem.

    All F scores must be positive, all p-values must lie strictly
    between 0 and 1, the first five p-values must be below 0.05 and the
    remaining ones above 1e-5.
    """
    settings = dict(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )
    data, target = make_classification(**settings)

    f_scores, p_values = f_classif(data, target)
    assert (f_scores > 0).all()
    assert (p_values > 0).all()
    assert (p_values < 1).all()
    leading, trailing = p_values[:5], p_values[5:]
    assert (leading < 0.05).all()
    assert (trailing > 1.0e-5).all()
コード例 #7
0
ファイル: test_featureset.py プロジェクト: monkidea/skll
def test_mismatch_labels_features():
    """
    Build a FeatureSet whose label vector is twice as long as its feature list.

    The constructor is expected to raise ``ValueError`` for this input.
    The exception is not caught here, so it propagates to the caller.
    NOTE(review): presumably an expected-exception decorator belongs on
    this test -- confirm against the original module.
    """
    n_examples = 100

    # generate 100 instances with 4 features each
    data, labels = make_classification(n_samples=n_examples, n_features=4,
                                       n_informative=4, n_redundant=0,
                                       n_classes=3, random_state=1234567890)

    # stack the labels onto themselves so their length cannot match
    # the number of feature rows
    doubled_labels = np.hstack([labels, labels])

    # one {feature name: value} dictionary per instance
    names = ['f{}'.format(idx) for idx in range(1, 5)]
    features = [dict(zip(names, values)) for values in data]

    ids = ['EXAMPLE_{}'.format(idx) for idx in range(n_examples)]

    # expected to raise a ValueError
    FeatureSet('test', ids, features=features, labels=doubled_labels)
コード例 #8
0
def test_grid_search_precomputed_kernel_error_kernel_function():
    """Grid search must raise ValueError for an SVC built on a callable kernel."""
    features, labels = make_classification(n_samples=200, n_features=100,
                                           random_state=0)

    def dot_kernel(a, b):
        # linear kernel expressed as a named function
        return np.dot(a, b.T)

    estimator = SVC(kernel=dot_kernel)
    grid = GridSearchCV(estimator, {'C': [0.1, 1.0]})
    assert_raises(ValueError, grid.fit, features, labels)
コード例 #9
0
ファイル: test_featureset.py プロジェクト: BK-University/skll
def test_mismatch_labels_features():
    """
    Construct a FeatureSet with more labels than feature rows.

    The labels vector is doubled in length while the features and ids
    keep 100 entries; the constructor is expected to raise
    ``ValueError``.  Nothing here catches it, so it propagates.
    NOTE(review): an expected-exception decorator is probably meant to
    wrap this test -- verify in the original file.
    """
    # 100 instances with 4 features each
    matrix, y = make_classification(n_samples=100, n_features=4,
                                    n_informative=4, n_redundant=0,
                                    n_classes=3, random_state=1234567890)

    # twice as many labels as feature rows
    mismatched_labels = np.hstack([y, y])

    # turn each row into a dictionary keyed by feature name
    feature_names = ['f{}'.format(k) for k in range(1, 5)]
    feature_dicts = [dict(zip(feature_names, row)) for row in matrix]

    example_ids = ['EXAMPLE_{}'.format(k) for k in range(100)]

    # expected to raise a ValueError
    FeatureSet('test', example_ids, features=feature_dicts,
               labels=mismatched_labels)
コード例 #10
0
def make_scaling_data(use_feature_hashing=False):
    """
    Build train/test feature sets whose columns differ widely in scale.

    1000 examples with five features are generated, the columns are
    multiplied by 1, 10, 100, 1000 and 10000 respectively, and the
    first 800 examples become the training set while the rest become
    the test set.

    Parameters
    ----------
    use_feature_hashing : bool, optional
        If true, both feature sets share a ``FeatureHasher(n_features=4)``.

    Returns
    -------
    tuple
        ``(train_fs, test_fs)``
    """
    split_at = 800

    data, labels = make_classification(n_samples=1000, n_classes=2,
                                       n_features=5, n_informative=5,
                                       n_redundant=0,
                                       random_state=1234567890)

    # give every column its own arbitrary magnitude
    data = data * np.array([1, 10, 100, 1000, 10000])

    # FeatureSet needs explicit example IDs
    example_ids = ['EXAMPLE_{}'.format(idx) for idx in range(1, 1001)]

    # one {feature name: value} dictionary per example
    names = ['f{}'.format(idx) for idx in range(1, 6)]
    feature_dicts = [dict(zip(names, values)) for values in data]

    if use_feature_hashing:
        vectorizer = FeatureHasher(n_features=4)
    else:
        vectorizer = None

    train_fs = FeatureSet('train_scaling', example_ids[:split_at],
                          features=feature_dicts[:split_at],
                          labels=labels[:split_at],
                          vectorizer=vectorizer)
    test_fs = FeatureSet('test_scaling', example_ids[split_at:],
                         features=feature_dicts[split_at:],
                         labels=labels[split_at:],
                         vectorizer=vectorizer)

    return (train_fs, test_fs)
コード例 #11
0
def test_select_kbest_classif():
    """
    Verify the k-best heuristic on a simple classification problem.

    ``SelectKBest`` must match ``GenericUnivariateSelect`` in
    ``k_best`` mode, and its support must mark the first five columns.
    """
    total, keep = 20, 5
    features, labels = make_classification(
        n_samples=200,
        n_features=total,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    kbest = SelectKBest(f_classif, k=keep)
    selected = kbest.fit(features, labels).transform(features)

    generic = GenericUnivariateSelect(f_classif, mode="k_best", param=keep)
    selected_generic = generic.fit(features, labels).transform(features)
    assert_array_equal(selected, selected_generic)

    # mask with ones on the leading columns only
    wanted = np.zeros(total)
    wanted[:keep] = 1
    assert_array_equal(kbest.get_support(), wanted)
コード例 #12
0
def test_deprecated_score_func():
    """
    Check that the ``score_func`` / ``loss_func`` arguments still work.

    A grid search using ``scoring="f1"`` is the baseline; searches
    configured through ``score_func=f1_score`` and through a negated-F1
    ``loss_func`` must pick the same ``C`` and make the same predictions.
    """
    data, target = make_classification(n_samples=200, n_features=100,
                                       random_state=0)
    train_X, train_y, held_out = data[:180], target[:180], data[180:]
    grid = {'C': [0.1, 1.0]}

    # baseline: scoring given by name
    baseline = GridSearchCV(LinearSVC(random_state=0), grid, scoring="f1")
    baseline.fit(train_X, train_y)
    baseline_pred = baseline.predict(held_out)
    baseline_C = baseline.best_estimator_.C

    # same search through the score_func argument
    by_score = GridSearchCV(LinearSVC(random_state=0), grid,
                            score_func=f1_score)
    with warnings.catch_warnings(record=True):
        # the deprecation warning is recorded instead of shown
        by_score.fit(train_X, train_y)
    score_pred = by_score.predict(held_out)
    score_C = by_score.best_estimator_.C

    assert_array_equal(baseline_pred, score_pred)
    assert_equal(baseline_C, score_C)

    # a loss, where greater is worse
    def f1_loss(y_true_, y_pred_):
        return -f1_score(y_true_, y_pred_)

    by_loss = GridSearchCV(LinearSVC(random_state=0), grid,
                           loss_func=f1_loss)
    with warnings.catch_warnings(record=True):
        # the deprecation warning is recorded instead of shown
        by_loss.fit(train_X, train_y)
    loss_pred = by_loss.predict(held_out)
    loss_C = by_loss.best_estimator_.C

    assert_array_equal(baseline_pred, loss_pred)
    assert_equal(baseline_C, loss_C)
コード例 #13
0
def test_select_heuristics_classif():
    """
    Compare the fdr, fpr and fwe heuristics on a simple problem.

    A ``SelectFwe`` filter with ``alpha=0.01`` is the reference;
    ``GenericUnivariateSelect`` in each of the three modes must give the
    same reduced matrix, and the reference support must mark the first
    five columns.
    """
    data, target = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    reference = SelectFwe(f_classif, alpha=0.01)
    reference_out = reference.fit(data, target).transform(data)

    expected_support = np.zeros(20)
    expected_support[:5] = 1

    for mode in ("fdr", "fpr", "fwe"):
        generic = GenericUnivariateSelect(f_classif, mode=mode, param=0.01)
        generic_out = generic.fit(data, target).transform(data)
        assert_array_equal(reference_out, generic_out)
        assert_array_almost_equal(reference.get_support(), expected_support)
コード例 #14
0
def test_select_kbest_all():
    """Check that ``SelectKBest(k="all")`` leaves the input matrix unchanged."""
    data, target = make_classification(n_samples=20, n_features=10,
                                       shuffle=False, random_state=0)

    keep_everything = SelectKBest(f_classif, k="all")
    keep_everything.fit(data, target)
    transformed = keep_everything.transform(data)
    assert_array_equal(data, transformed)
コード例 #15
0
ファイル: test_svm.py プロジェクト: nateyoder/scikit-learn
def test_linearsvc_parameters():
    """
    Try every loss/penalty/dual combination of LinearSVC.

    Unsupported combinations must raise a ``ValueError`` that names the
    arguments involved; the remaining combinations must fit cleanly.
    """
    all_losses = ['hinge', 'squared_hinge', 'logistic_regression', 'foo']
    all_penalties = ['l1', 'l2', 'bar']
    all_duals = [True, False]

    X, y = make_classification(n_samples=5, n_features=5)

    for loss, penalty, dual in itertools.product(all_losses, all_penalties,
                                                 all_duals):
        clf = svm.LinearSVC(penalty=penalty, loss=loss, dual=dual)

        # each entry is one reason the combination is treated as invalid
        invalid = any([
            (loss, penalty) == ('hinge', 'l1'),
            (loss, penalty, dual) == ('hinge', 'l2', False),
            (penalty, dual) == ('l1', True),
            loss == 'foo',
            penalty == 'bar',
        ])

        if invalid:
            message = ("Unsupported set of arguments.*penalty='%s.*"
                       "loss='%s.*dual=%s" % (penalty, loss, dual))
            assert_raises_regexp(ValueError, message, clf.fit, X, y)
        else:
            clf.fit(X, y)

    # an invalid loss value must produce an explicit error message
    wrong_loss = svm.LinearSVC(loss="L3")
    assert_raises_regexp(ValueError, ".*loss='L3' is not supported.*",
                         wrong_loss.fit, X, y)
コード例 #16
0
def test_mutual_info_classif():
    """
    Check mutual-information scoring in k-best and percentile mode.

    In both modes the dedicated selector must match
    ``GenericUnivariateSelect`` and must select the first two of the
    five columns.
    """
    data, target = make_classification(
        n_samples=100,
        n_features=5,
        n_informative=1,
        n_redundant=1,
        n_repeated=0,
        n_classes=2,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    # --- k-best mode ---
    kbest = SelectKBest(mutual_info_classif, k=2)
    kbest_out = kbest.fit(data, target).transform(data)
    generic = GenericUnivariateSelect(mutual_info_classif, mode="k_best",
                                      param=2)
    generic_out = generic.fit(data, target).transform(data)
    assert_array_equal(kbest_out, generic_out)
    kbest_support = kbest.get_support()
    expected = np.zeros(5)
    expected[:2] = 1
    assert_array_equal(kbest_support, expected)

    # --- percentile mode ---
    by_percentile = SelectPercentile(mutual_info_classif, percentile=40)
    percentile_out = by_percentile.fit(data, target).transform(data)
    generic = GenericUnivariateSelect(mutual_info_classif,
                                      mode="percentile", param=40)
    generic_out = generic.fit(data, target).transform(data)
    assert_array_equal(percentile_out, generic_out)
    percentile_support = by_percentile.get_support()
    expected = np.zeros(5)
    expected[:2] = 1
    assert_array_equal(percentile_support, expected)
コード例 #17
0
def test_f_classif():
    """
    Sanity-check the F test on dense and sparse input.

    F scores must be positive, p-values must lie strictly between 0 and
    1, the first five p-values must be below 0.05, the rest above 1e-4,
    and the sparse results must match the dense ones.
    """
    dense, target = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    f_dense, p_dense = f_classif(dense, target)
    f_from_sparse, p_from_sparse = f_classif(sparse.csr_matrix(dense),
                                             target)

    for condition in (f_dense > 0,
                      p_dense > 0,
                      p_dense < 1,
                      p_dense[:5] < 0.05,
                      p_dense[5:] > 1.0e-4):
        assert_true(condition.all())

    assert_array_almost_equal(f_from_sparse, f_dense)
    assert_array_almost_equal(p_from_sparse, p_dense)
コード例 #18
0
def test_grid_search_precomputed_kernel():
    """
    Check that grid search accepts a precomputed kernel matrix.

    A linear kernel is computed by hand for the first 180 samples and
    used to fit the search; predictions are made from the kernel between
    the remaining samples and the training ones.  A kernel passed as a
    plain list must raise ``ValueError``.
    """
    data, target = make_classification(n_samples=200, n_features=100,
                                       random_state=0)
    train_rows, test_rows = data[:180], data[180:]
    train_target, test_target = target[:180], target[180:]

    # linear kernel between training samples
    train_kernel = np.dot(train_rows, train_rows.T)

    search = GridSearchCV(SVC(kernel='precomputed'), {'C': [0.1, 1.0]})
    search.fit(train_kernel, train_target)

    assert_true(search.best_score_ >= 0)

    # kernel between the held-out samples and the training samples
    test_kernel = np.dot(test_rows, train_rows.T)
    predicted = search.predict(test_kernel)

    assert_true(np.mean(predicted == test_target) >= 0)

    # a precomputed kernel that is neither array-like nor sparse
    # must be rejected
    assert_raises(ValueError, search.fit, train_kernel.tolist(),
                  train_target)
コード例 #19
0
def test_select_percentile_classif_sparse():
    """
    Verify the percentile heuristic on a sparse classification matrix.

    ``SelectPercentile`` must agree with ``GenericUnivariateSelect`` in
    ``percentile`` mode, select the first five columns, and invert back
    to a sparse matrix of the original shape with no extra non-zeros.
    """
    dense_X, y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )
    sparse_X = sparse.csr_matrix(dense_X)

    percentile_filter = SelectPercentile(f_classif, percentile=25)
    picked = percentile_filter.fit(sparse_X, y).transform(sparse_X)
    generic_filter = GenericUnivariateSelect(f_classif, mode="percentile",
                                             param=25)
    picked_generic = generic_filter.fit(sparse_X, y).transform(sparse_X)
    assert_array_equal(picked.toarray(), picked_generic.toarray())

    support = percentile_filter.get_support()
    wanted = np.zeros(20)
    wanted[:5] = 1
    assert_array_equal(support, wanted)

    # map the reduced matrix back onto all 20 columns
    expanded = percentile_filter.inverse_transform(picked_generic)
    assert_true(sparse.issparse(expanded))
    column_mask = safe_mask(expanded, support)
    assert_equal(expanded.shape, sparse_X.shape)
    assert_array_equal(expanded[:, column_mask].toarray(),
                       picked.toarray())
    # equal non-zero counts mean the unselected columns are empty
    assert_equal(expanded.getnnz(), picked.getnnz())
コード例 #20
0
def test_select_fwe_classif():
    """
    Check the fwe heuristic on a simple classification problem.

    ``SelectFwe`` with ``alpha=0.01`` must match
    ``GenericUnivariateSelect`` in ``fwe`` mode, and its support may
    differ from "first five columns selected" in fewer than two
    positions.
    """
    data, target = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    fwe_filter = SelectFwe(f_classif, alpha=0.01)
    fwe_out = fwe_filter.fit(data, target).transform(data)
    generic = GenericUnivariateSelect(f_classif, mode="fwe", param=0.01)
    generic_out = generic.fit(data, target).transform(data)
    assert_array_equal(fwe_out, generic_out)

    support = fwe_filter.get_support()
    expected = np.zeros(20)
    expected[:5] = 1
    # number of positions where the mask and the expectation disagree
    disagreements = np.sum(np.abs(support - expected))
    assert disagreements < 2
コード例 #21
0
def test_grid_search_sparse_scoring():
    """
    Check that grid search with F1 scoring is the same on sparse input.

    A search on the dense matrix is the baseline.  The same search on a
    CSR copy, and a search on the CSR copy scored by a negated-F1 loss,
    must both choose the same ``C`` and make the same predictions.
    """
    X_, y_ = make_classification(n_samples=200, n_features=100,
                                 random_state=0)
    grid = {'C': [0.1, 1.0]}

    # dense baseline
    dense_search = GridSearchCV(LinearSVC(), grid, scoring="f1")
    dense_search.fit(X_[:180], y_[:180])
    dense_pred = dense_search.predict(X_[180:])
    dense_C = dense_search.best_estimator_.C

    # same search on a sparse copy of the data
    X_ = sp.csr_matrix(X_)
    sparse_clf = LinearSVC()
    sparse_search = GridSearchCV(sparse_clf, grid, scoring="f1")
    sparse_search.fit(X_[:180], y_[:180])
    sparse_pred = sparse_search.predict(X_[180:])
    sparse_C = sparse_search.best_estimator_.C

    assert_array_equal(dense_pred, sparse_pred)
    assert_equal(dense_C, sparse_C)
    # NOTE: the original carried a disabled smoke test here comparing
    # f1_score on the training predictions with cv.score; it was
    # commented out and is still not executed.

    # a loss, where greater is worse
    def f1_loss(y_true_, y_pred_):
        return -f1_score(y_true_, y_pred_)

    F1Loss = Scorer(f1_loss, greater_is_better=False)
    loss_search = GridSearchCV(sparse_clf, grid, scoring=F1Loss)
    loss_search.fit(X_[:180], y_[:180])
    loss_pred = loss_search.predict(X_[180:])
    loss_C = loss_search.best_estimator_.C

    assert_equal(dense_C, loss_C)
    assert_array_equal(dense_pred, loss_pred)
コード例 #22
0
def featureset_creation_from_dataframe_helper(with_labels, use_feature_hasher):
    """
    Build the same FeatureSet directly and via ``from_data_frame``.

    Parameters
    ----------
    with_labels : bool
        Whether labels are attached to both feature sets.
    use_feature_hasher : bool
        Whether a ``FeatureHasher`` with four bins is used as vectorizer.

    Returns
    -------
    tuple
        ``(expected, current)``: the directly constructed FeatureSet and
        the one created from a pandas DataFrame.
    """
    # 100 instances with 4 features each
    data, labels = make_classification(n_samples=100,
                                       n_features=4,
                                       n_informative=4,
                                       n_redundant=0,
                                       n_classes=3,
                                       random_state=1234567890)

    # IDs deliberately differ from pandas' default 0..99 index so that
    # the supplied IDs can be told apart from the defaults
    ids = list(range(100, 200))

    featureset_name = 'test'

    feature_bins = 4
    if use_feature_hasher:
        vectorizer = FeatureHasher(n_features=feature_bins)
    else:
        vectorizer = None

    # one {feature name: value} dictionary per instance
    names = ['f{}'.format(idx) for idx in range(1, 5)]
    features = [dict(zip(names, values)) for values in data]

    # direct construction; labels are only passed when requested
    direct_kwargs = {'labels': labels} if with_labels else {}
    expected = FeatureSet(featureset_name,
                          ids,
                          features=features,
                          vectorizer=vectorizer,
                          **direct_kwargs)

    # construction through a DataFrame indexed by the same IDs
    df = pd.DataFrame(features, index=ids)
    frame_kwargs = {}
    if with_labels:
        df['y'] = labels
        frame_kwargs['labels_column'] = 'y'
    current = FeatureSet.from_data_frame(df,
                                         featureset_name,
                                         vectorizer=vectorizer,
                                         **frame_kwargs)

    return (expected, current)
コード例 #23
0
def test_randomized_search():
    """Smoke test: a randomized search over C must record 10 CV scores."""
    data, target = make_classification(n_samples=200, n_features=100,
                                       random_state=0)

    # C is drawn from an exponential distribution
    distributions_by_param = dict(C=distributions.expon())
    randomized = RandomizedSearchCV(LinearSVC(),
                                    param_distributions=distributions_by_param)
    randomized.fit(data, target)
    n_recorded = len(randomized.cv_scores_)
    assert_equal(n_recorded, 10)
コード例 #24
0
ファイル: test_svm.py プロジェクト: bearrito/scikit-learn
def test_crammer_singer_binary():
    """Compare default and Crammer-Singer LinearSVC accuracy on binary data."""
    data, target = make_classification(n_classes=2, random_state=0)

    default_model = svm.LinearSVC(random_state=0)
    default_accuracy = default_model.fit(data, target).score(data, target)

    crammer_singer_model = svm.LinearSVC(multi_class="crammer_singer",
                                         random_state=0)
    crammer_singer_accuracy = crammer_singer_model.fit(
        data, target).score(data, target)

    # training-set accuracies pinned by the original test
    assert_almost_equal(default_accuracy, 0.66)
    assert_almost_equal(crammer_singer_accuracy, 0.68)
コード例 #25
0
def test_grid_search_error():
    """Grid search must raise ValueError when X and y differ in length."""
    data, target = make_classification(n_samples=200, n_features=100,
                                       random_state=0)

    search = GridSearchCV(LinearSVC(), {'C': [0.1, 1.0]})
    # 180 feature rows against 200 labels
    truncated = data[:180]
    assert_raises(ValueError, search.fit, truncated, target)
コード例 #26
0
def test_select_kbest_all():
    """Verify that ``k='all'`` makes SelectKBest return every feature."""
    features, labels = make_classification(n_samples=20, n_features=10,
                                           shuffle=False, random_state=0)

    selector = SelectKBest(f_classif, k='all')
    fitted = selector.fit(features, labels)
    assert_array_equal(features, fitted.transform(features))
コード例 #27
0
def test_grid_search_error():
    """Fitting a grid search on mismatched X/y lengths must raise ValueError."""
    features, labels = make_classification(n_samples=200, n_features=100,
                                           random_state=0)

    estimator = LinearSVC()
    param_grid = {'C': [0.1, 1.0]}
    grid = GridSearchCV(estimator, param_grid)
    # only the first 180 rows are supplied, together with all 200 labels
    assert_raises(ValueError, grid.fit, features[:180], labels)
コード例 #28
0
def test_randomized_search():
    """Basic smoke test: RandomizedSearchCV must end up with 10 ``cv_scores_``."""
    features, labels = make_classification(n_samples=200, n_features=100,
                                           random_state=0)

    search = RandomizedSearchCV(
        LinearSVC(),
        param_distributions={'C': distributions.expon()},
    )
    search.fit(features, labels)
    assert_equal(len(search.cv_scores_), 10)
コード例 #29
0
def generate_random_dataset(data_type, amount=500, features=10):
    """
    Generate a random dataset of the requested kind.

    The previous implementation built a classification dataset eagerly,
    discarded it and always returned ``None``, ignoring ``data_type``
    and ``amount``.  The dataset is now built only when requested, and
    the result is returned.

    Parameters
    ----------
    data_type : str
        Name of the dataset kind.  The only supported value is
        ``'Normal Distribution Classified'``.
    amount : int, optional
        Passed to ``make_classification`` as ``n_samples`` (assumes
        "amount" means the number of samples -- TODO confirm).
    features : int, optional
        Passed to ``make_classification`` as ``n_features``.

    Returns
    -------
    object or None
        Whatever the selected generator returns, or ``None`` when
        ``data_type`` is not a supported name (this keeps the old return
        value for that case).
    """
    # Generators are stored as zero-argument callables so that nothing is
    # generated unless its key is actually requested.
    generators = {
        'Normal Distribution Classified':
        lambda: make_classification(n_samples=amount, n_features=features),
    }

    generator = generators.get(data_type)
    if generator is None:
        return None
    return generator()
コード例 #30
0
def test_on_synthetic_data():
    """
    Run the novelty detector over growing prefixes of synthetic data.

    For each split point 1000, 2000, ..., 9000 the detector receives the
    data before the split and the next 1000 rows after it.  The results
    are not checked; this only exercises the call.
    """
    window = 1000
    X, _ = make_classification(n_samples=10000, n_features=2,
                               n_redundant=0, weights=[0.99, 0.01])

    for split in range(1000, 10000, window):
        history = X[:split]
        incoming = X[split:split + window]
        labels, cluster_novelty = novelty_detector(history, incoming)
コード例 #31
0
def loadData():
    """
    Generate a random four-class classification dataset.

    Returns
    -------
    tuple
        ``(X, y)`` as returned by ``make_classification`` with 20
        features, 3 of them informative, none redundant, and two
        clusters per class.
    """
    generated = samples_generator.make_classification(
        n_features=20,
        n_informative=3,
        n_redundant=0,
        n_classes=4,
        n_clusters_per_class=2,
    )
    X, y = generated
    return X, y
コード例 #32
0
def test_gradient_boosting_estimator_with_binomial_deviance_loss():
    """
    Fit a Booster of Earth models with binomial deviance loss.

    Training accuracy must exceed 0.90 and every predicted probability
    must lie in the closed interval [0, 1].
    """
    np.random.seed(0)
    data, target = make_classification(n_classes=2)

    base_estimator = Earth(max_degree=2, use_fast=True, max_terms=10)
    booster = Booster(base_estimator, BinomialDeviance(2))
    booster.fit(data, target)

    n_correct = np.sum(booster.predict(data) == target)
    accuracy = n_correct / float(target.shape[0])
    assert_greater(accuracy, .90)

    # probabilities are bounded below by 0 and above by 1
    assert_true(np.all(booster.predict_proba(data) >= 0))
    assert_true(np.all(booster.predict_proba(data) <= 1))
コード例 #33
0
def load_blob(classes=3,
              features=10,
              samples=10,
              random_state=0,
              noise_dims=0,
              noise_scale=1.0,
              noise_pattern=None,
              clusters=1):
    """
    Create a standardized train/test classification dataset.

    Parameters
    ----------
    classes : number of classes
    features : number of columns; all of them are informative
    samples : number of rows generated before the train/test split
    random_state : seed passed to ``train_test_split``
        NOTE(review): it is not passed to ``make_classification``, so
        the generated data itself is not seeded here -- confirm whether
        that is intended.
    noise_dims : when greater than 0, forwarded to ``add_noisy_dims``
    noise_scale : forwarded to ``add_noisy_dims``
    noise_pattern : forwarded to ``add_noisy_dims``
    clusters : number of clusters per class

    Returns
    -------
    dict with keys ``X_train``, ``y_train``, ``X_test``, ``y_test``,
    ``num_examples_train`` and ``num_examples_test``.  30% of the rows
    go to the test split, and both splits are scaled with a
    ``StandardScaler`` fitted on the training split only.
    """
    print("values:", features, classes, clusters)
    data, labels = make_classification(n_samples=samples,
                                       n_features=features,
                                       n_informative=features,
                                       n_redundant=0,
                                       n_repeated=0,
                                       n_classes=classes,
                                       n_clusters_per_class=clusters)

    labels = np.float32(labels)

    # optionally add extra noisy columns
    if noise_dims > 0:
        data = add_noisy_dims(data, noise_dims, noise_scale, noise_pattern)

    split = train_test_split(data, labels.squeeze(),
                             test_size=0.3, random_state=random_state)
    X_train, X_test, y_train, y_test = split

    # fit the scaler on the training rows, then apply it to both splits
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    result = dict(X_train=X_train,
                  y_train=y_train,
                  X_test=X_test,
                  y_test=y_test)
    result['num_examples_train'] = X_train.shape[0]
    result['num_examples_test'] = X_test.shape[0]
    return result
コード例 #34
0
ファイル: test_svm.py プロジェクト: polkapolka/scikit-learn
def test_crammer_singer_binary():
    """Crammer-Singer LinearSVC must exceed 0.9 training accuracy on binary data."""
    data, target = make_classification(n_classes=2, random_state=0)

    # checked both with and without an intercept term
    for use_intercept in (True, False):
        model = svm.LinearSVC(fit_intercept=use_intercept,
                              multi_class="crammer_singer",
                              random_state=0)
        model.fit(data, target)
        accuracy = model.score(data, target)
        assert_greater(accuracy, 0.9)
コード例 #35
0
ファイル: test_svm.py プロジェクト: nateyoder/scikit-learn
def test_crammer_singer_binary():
    """Check the Crammer-Singer formulation on a two-class problem."""
    features, labels = make_classification(n_classes=2, random_state=0)

    def training_accuracy(fit_intercept):
        # fit on the full data and score on the same data
        classifier = svm.LinearSVC(fit_intercept=fit_intercept,
                                   multi_class="crammer_singer",
                                   random_state=0)
        return classifier.fit(features, labels).score(features, labels)

    for fit_intercept in (True, False):
        assert_greater(training_accuracy(fit_intercept), 0.9)
コード例 #36
0
def genRandomData(numVariables, numMuestras):
    """
    Generate a random classification dataset with a fixed seed.

    Parameters
    ----------
    numVariables : passed to ``make_classification`` as ``n_features``
    numMuestras : passed to ``make_classification`` as ``n_samples``

    Returns
    -------
    tuple
        ``(X, y)`` generated with 2 informative features, no redundant
        features and ``random_state=77``.
    """
    from sklearn.datasets import samples_generator

    options = dict(n_samples=numMuestras,
                   n_features=numVariables,
                   n_informative=2,
                   n_redundant=0,
                   random_state=77)
    X, y = samples_generator.make_classification(**options)
    return X, y
コード例 #37
0
def test_sklearn2code_export():
    """
    Export a fitted Booster with sklearn2code and check the exported module.

    The model is fitted on a DataFrame with columns ``x0``, ``x1``, ...;
    the generated code for ``predict``, ``predict_proba`` and
    ``transform`` is loaded as a module and compared with the model.
    """
    exported_methods = ['predict', 'predict_proba', 'transform']

    np.random.seed(0)
    raw, target = make_classification(n_classes=2)
    column_names = ['x%d' % idx for idx in range(raw.shape[1])]
    frame = DataFrame(raw, columns=column_names)

    booster = Booster(Earth(max_degree=2, use_fast=True, max_terms=10),
                      BinomialDeviance(2))
    booster.fit(frame, target)

    # a fresh list with the same method names is passed to each call
    source = sklearn2code(booster, list(exported_methods), numpy_flat)
    exported = exec_module('test_module', source)
    assert_correct_exported_module(booster, exported,
                                   list(exported_methods),
                                   dict(X=frame), frame)
コード例 #38
0
 def classification(self):
     """Generate a small three-class 2-D dataset and show it as a scatter plot."""
     from sklearn.datasets.samples_generator import make_classification

     # 400 samples with 2 features each, 3 output classes, no redundant
     # features, and one cluster per class
     settings = dict(n_samples=400,
                     n_features=2,
                     n_redundant=0,
                     n_clusters_per_class=1,
                     n_classes=3)
     points, classes = make_classification(**settings)

     # colour each point by its class
     plt.scatter(points[:, 0], points[:, 1], marker='o', c=classes)
     plt.show()
コード例 #39
0
ファイル: test_cv.py プロジェクト: monkidea/skll
def make_cv_folds_data(num_examples_per_fold=100,
                       num_folds=3,
                       use_feature_hashing=False):
    """
    Create data for pre-specified CV folds tests
    with or without feature hashing

    Parameters
    ----------
    num_examples_per_fold : int
        Number of consecutive examples assigned to each fold.
    num_folds : int
        Number of folds; fold names are the strings '0' .. str(num_folds - 1).
    use_feature_hashing : bool
        If true, the feature set is built with a ``FeatureHasher`` with
        4 features; otherwise no vectorizer is passed.

    Returns
    -------
    tuple
        ``(cv_fs, custom_cv_folds)``: the ``FeatureSet`` named 'cv_folds'
        and a dictionary mapping each example ID to its fold name.
    """

    num_total_examples = num_examples_per_fold * num_folds

    # create the numeric features; the generated labels are discarded
    X, _ = make_classification(n_samples=num_total_examples,
                               n_features=3,
                               n_informative=3,
                               n_redundant=0,
                               n_classes=2,
                               random_state=1234567890)

    # the binary labels simply alternate 0, 1, 0, 1, ...; building them
    # per example keeps one label per row even when the total is odd
    # (``[0, 1] * int(total / 2)`` would come up one label short)
    y = np.array([index % 2 for index in range(num_total_examples)])

    # the folds mapping: the first num_examples_per_fold examples
    # are in fold 1 the second num_examples_per_fold are in
    # fold 2 and so on
    folds = [str(fold)
             for fold in range(num_folds)
             for _ in range(num_examples_per_fold)]

    # now create the list of feature dictionaries
    # and add the binary features that depend on
    # the class and fold number
    feature_names = ['f{}'.format(i) for i in range(1, 4)]
    features = []
    for row, classid, foldnum in zip(X, y, folds):
        feat_dict = dict(zip(feature_names, row))
        feat_dict['is_{}_{}'.format(classid, foldnum)] = 1
        features.append(feat_dict)

    # create the example IDs
    ids = [
        'EXAMPLE_{}'.format(num_examples_per_fold * k + i)
        for k in range(num_folds) for i in range(num_examples_per_fold)
    ]

    # create the cross-validation feature set with or without feature hashing
    vectorizer = FeatureHasher(n_features=4) if use_feature_hashing else None
    cv_fs = FeatureSet('cv_folds',
                       ids,
                       features=features,
                       labels=y,
                       vectorizer=vectorizer)

    # make the custom cv folds dictionary
    custom_cv_folds = dict(zip(ids, folds))

    return (cv_fs, custom_cv_folds)
コード例 #40
0
 def prepare_data(self):
     """Build a shuffled two-class dataset with the label as last column.

     Generates 200 samples with 2 informative features, shuffles them
     with a fixed seed of 42, casts the features to float32 and returns
     features and labels concatenated column-wise into a single array.
     """
     generator_args = dict(n_samples=200,
                           n_features=2,
                           n_redundant=0,
                           n_informative=2,
                           n_classes=2)
     raw_features, raw_labels = make_classification(**generator_args)
     X, y = shuffle(raw_features, raw_labels, random_state=42)

     feature_block = X.astype(np.float32)
     label_column = y.reshape(-1, 1)
     return np.concatenate((feature_block, label_column), axis=1)
コード例 #41
0
def toydata2(rng):
    """Build a toy 2-feature, 2-class dataset with missing (NaN) entries.

    The clean data comes from ``make_classification`` seeded with ``rng``.
    If the ``--extra`` command-line flag is set (read via
    ``ut.get_argflag``), 100 additional class-0 points are appended. A copy
    of the clean data then has entries replaced by NaN at the rate given by
    ``--nanrate`` (default 0.01, read via ``ut.get_argval``).

    Args:
        rng: random generator exposing ``randn`` and ``rand``; also passed
            to ``make_classification`` as ``random_state``.
            NOTE(review): presumably a ``numpy.random.RandomState`` - confirm.

    Returns:
        tuple: ``(X_true, X, y)`` - the clean features, the copy with NaN
        entries, and the labels.
    """
    from sklearn.datasets import samples_generator

    n_samples = 1000
    n_features = 2
    n_classes = 2
    n_informative = 2
    # evaluates to 2 for the values chosen above (2**2 // 2)
    n_clusters_per_class = int((2**n_informative) // n_classes)
    hypercube = False
    samplekw = dict(
        flip_y=0.00,
        class_sep=1.0,
        shift=[-10, 10],
        scale=1.0,
        n_redundant=0,
        n_repeated=0,
        hypercube=hypercube,
        n_samples=n_samples,
        n_informative=n_informative,
        n_classes=n_classes,
        n_clusters_per_class=n_clusters_per_class,
        weights=None,
        shuffle=True,
        n_features=n_features,
        random_state=rng,
    )

    # NOTE: rng is shared by the generator call below and by the later
    # randn/rand calls, so the order of these statements affects the output
    X_true, y = samples_generator.make_classification(**samplekw)
    with_extra = ut.get_argflag('--extra')
    # make very informative nan dimension
    if with_extra:
        n_informative_nan = 100
        # extra_x = (rng.randn(n_informative_nan, 2) / 2 + [[12, -8]])
        # extra points: randn draws halved and offset by (10, -12)
        extra_x = rng.randn(n_informative_nan, 2) / 2 + [[10, -12]]
        X_true = np.vstack((X_true, extra_x))
        # every appended point is labelled as class 0
        y = np.append(y, [0] * n_informative_nan)

    # Randomly drop datapoints
    X = X_true.copy()
    nanrate = ut.get_argval('--nanrate', default=0.01)
    if nanrate:
        # TODO:
        # * informative nan
        # * random nan
        # * random nan + informative nan
        # entries whose random draw falls below nanrate become NaN; this
        # relies on ravel() giving a view into X rather than a copy
        X.ravel()[rng.rand(X.size) < nanrate] = np.nan

    if with_extra:
        if True:
            # blank the second feature of all appended (last 100) rows
            X.T[1][-n_informative_nan:] = np.nan
        else:
            # disabled alternative: blank the first feature for the first
            # half of the appended rows and the second feature for the rest
            X.T[0][-n_informative_nan:-n_informative_nan // 2] = np.nan
            X.T[1][-n_informative_nan // 2:] = np.nan
    return X_true, X, y
コード例 #42
0
def test_weight():
    """
    Check that class weights are honoured on sparse input: each classifier
    is fit on the first 180 samples with class 0 weighted by 5 and must
    predict at least 11 of the remaining 20 samples correctly.
    """
    X_, y_ = make_classification(n_samples=200, n_features=100,
                                 weights=[0.833, 0.167], random_state=0)
    X_ = sparse.csr_matrix(X_)

    classifiers = (linear_model.LogisticRegression(C=180),
                   svm.LinearSVC(C=180),
                   svm.SVC(C=180))
    for clf in classifiers:
        clf.fit(X_[:180], y_[:180], class_weight={0: 5})
        y_pred = clf.predict(X_[180:])
        n_correct = np.sum(y_pred == y_[180:])
        assert_true(n_correct >= 11)
コード例 #43
0
def test_select_kbest_zero():
    """With k=0, SelectKBest must select no features and warn on transform."""
    X, y = make_classification(n_samples=20, n_features=10,
                               shuffle=False, random_state=0)

    selector = SelectKBest(f_classif, k=0)
    selector.fit(X, y)

    # the support mask has to be False for all 10 features
    expected_mask = np.zeros(10, dtype=bool)
    assert_array_equal(selector.get_support(), expected_mask)

    # transforming must warn and yield a (20, 0) result
    X_selected = assert_warns_message(UserWarning, "No features were selected",
                                      selector.transform, X)
    assert_equal(X_selected.shape, (20, 0))
コード例 #44
0
 def create_data(self):
     """Generate a two-feature dataset and split it 90/10 into train/test.

     Returns the tuple ``(X_train, y_train, X_test, y_test)``; the labels
     are reshaped into a single column before splitting.
     """
     generator_args = dict(n_samples=100,
                           n_features=2,
                           n_redundant=0,
                           n_informative=2,
                           random_state=1,
                           n_clusters_per_class=2)
     X, labels = make_classification(**generator_args)
     labels = labels.reshape((-1, 1))

     # the first 90% of the rows are used for training, the rest for testing
     split_at = int(X.shape[0] * 0.9)
     train_part = (X[:split_at], labels[:split_at])
     test_part = (X[split_at:], labels[split_at:])
     return train_part + test_part
コード例 #45
0
ファイル: SVCJSTest.py プロジェクト: jakirkham/sklearn-porter
 def test_pipeline_estimator(self):
     """Port a SelectKBest + linear SVC pipeline and fail on any exception.

     The model is stored on ``self.mdl`` and the data on ``self.X`` /
     ``self.y``; ``self._clear_model()`` always runs afterwards.
     """
     self.X, self.y = samples_generator.make_classification(
         n_informative=5, n_redundant=0, random_state=42)
     anova_filter = SelectKBest(f_regression, k=5)
     self.mdl = Pipeline([('anova', anova_filter), ('svc', SVC(kernel='linear'))])
     # override the step parameters through the pipeline interface
     self.mdl.set_params(anova__k=10, svc__C=.1)
     try:
         self._port_model()
     except Exception as e:
         # Format the exception itself: Python 3 exceptions have no
         # `.message` attribute, so reading it here would raise an
         # AttributeError and hide the original failure.
         self.fail('Unexpected exception raised: {}'.format(e))
     finally:
         self._clear_model()
コード例 #46
0
def test_select_kbest_zero():
    """
    With k=0, SelectKBest must report an all-False support mask.
    """
    X, y = make_classification(n_samples=20, n_features=10,
                               shuffle=False, random_state=0)

    selector = SelectKBest(f_classif, k=0)
    fitted = selector.fit(X, y)
    # the transform result itself is not inspected
    fitted.transform(X)

    expected_mask = np.zeros(10, dtype=bool)
    assert_array_equal(selector.get_support(), expected_mask)
コード例 #47
0
ファイル: paramsel.py プロジェクト: orazaro/kgml
def make_skewed_data(n_samples=5000,n_features=20,n_classes=2):
    """Generate a classification dataset with heavily unbalanced classes.

    All samples with label <= 0 are kept, while only 1/25 of the samples
    with label > 0 are retained (chosen with the ``random`` module).

    Parameters
    ----------
    n_samples : int
        Number of samples generated before sub-sampling.
    n_features : int
        Number of features per sample.
    n_classes : int
        Number of classes passed to ``make_classification``.

    Returns
    -------
    tuple
        ``(X, y)`` restricted to the selected rows, in ascending row order.
    """
    from sklearn.datasets.samples_generator import make_classification
    # n_classes is forwarded instead of being hard-coded to 2
    X, y = make_classification(n_samples=n_samples, n_features=n_features,
        n_classes=n_classes, n_clusters_per_class=2, n_informative=8,
        n_redundant=2, random_state=1)
    # create unbalanced classes
    plus = np.where(y > 0)[0]
    minus = np.where(y <= 0)[0]
    # random.sample needs a real sequence (an ndarray raises TypeError on
    # Python 3), so sample positions and index the array with them; this
    # also keeps an integer dtype when nothing is sampled
    keep = random.sample(range(len(plus)), len(plus) // 25)
    plus_sel = plus[keep]
    # np.sort returns a sorted copy, so the result has to be kept
    sel = np.sort(np.r_[minus, plus_sel])
    return X[sel, :], y[sel]
コード例 #48
0
def test_select_kbest_zero():
    """
    Selecting k=0 features must leave every entry of the support mask False.
    """
    X, y = make_classification(n_samples=20, n_features=10,
                               shuffle=False, random_state=0)

    kbest = SelectKBest(f_classif, k=0)
    # fit and transform in two steps; the transformed data is not checked
    kbest.fit(X, y)
    kbest.transform(X)

    no_features = np.zeros(10, dtype=bool)
    assert_array_equal(kbest.get_support(), no_features)
コード例 #49
0
def test_grid_search_one_grid_point():
    """A single-point grid search must match fitting SVC with those values."""
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)
    single_point_grid = {"C": [1.0], "kernel": ["rbf"], "gamma": [0.1]}

    # search over the one-element grid
    search = GridSearchCV(SVC(), single_point_grid)
    search.fit(X_, y_)

    # fit the same configuration directly
    direct = SVC(C=1.0, kernel="rbf", gamma=0.1)
    direct.fit(X_, y_)

    assert_array_equal(direct.dual_coef_, search.best_estimator_.dual_coef_)
コード例 #50
0
ファイル: classifier.py プロジェクト: orazaro/kgml
def make_skewed_data(random_state=None):
    """Generate a two-class dataset with heavily unbalanced classes.

    5000 samples with 20 features are generated; all samples with
    label <= 0 are kept, while only 1/25 of the samples with label > 0
    are retained.

    Parameters
    ----------
    random_state : optional
        Forwarded to ``make_classification``. The sub-sampling of the
        positive class uses the global ``random`` module and is not
        controlled by this argument.

    Returns
    -------
    tuple
        ``(X, y)`` restricted to the selected rows, in ascending row order.
    """
    X, y = make_classification(
        n_samples=5000, n_features=20, n_classes=2,
        n_clusters_per_class=2, n_informative=8, n_redundant=2,
        random_state=random_state)
    # create unbalanced classes
    plus = np.where(y > 0)[0]
    minus = np.where(y <= 0)[0]
    # random.sample needs a real sequence (an ndarray raises TypeError on
    # Python 3), so sample positions and index the array with them; this
    # also keeps an integer dtype when nothing is sampled
    keep = random.sample(range(len(plus)), len(plus) // 25)
    plus_sel = plus[keep]
    # np.sort returns a sorted copy, so the result has to be kept
    sel = np.sort(np.r_[minus, plus_sel])
    return X[sel, :], y[sel]
コード例 #51
0
def test_grid_search_one_grid_point():
    """Grid search over one parameter combination must equal a direct fit."""
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)

    grid = {"C": [1.0], "kernel": ["rbf"], "gamma": [0.1]}
    searched = GridSearchCV(SVC(), grid)
    searched.fit(X_, y_)

    reference = SVC(C=1.0, kernel="rbf", gamma=0.1)
    reference.fit(X_, y_)

    # both fits must end up with identical dual coefficients
    best = searched.best_estimator_
    assert_array_equal(reference.dual_coef_, best.dual_coef_)
コード例 #52
0
def make_dataset(dataset, n_rows, n_cols, n_classes=2):
    """Generate a named synthetic dataset and split it into train and test.

    Parameters
    ----------
    dataset : str
        One of 'classification1', 'classification2', 'gaussian' or 'blobs'.
    n_rows : int
        Number of samples to generate.
    n_cols : int
        Number of features per sample.
    n_classes : int
        Number of classes (or blob centers).

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` from ``train_test_split``.

    Raises
    ------
    ValueError
        If ``dataset`` is not one of the supported names.
    """
    # fixed seed for numpy's global random generator
    np.random.seed(137)
    if dataset == 'classification1':
        X, y = make_classification(
            n_rows, n_cols, n_informative=2, n_redundant=0,
            n_classes=n_classes, n_clusters_per_class=1)
    elif dataset == 'classification2':
        X, y = make_classification(
            n_rows, n_cols, n_informative=2, n_redundant=0,
            n_classes=n_classes, n_clusters_per_class=2)
    elif dataset == 'gaussian':
        X, y = make_gaussian_quantiles(n_samples=n_rows, n_features=n_cols,
                                       n_classes=n_classes)
    elif dataset == 'blobs':
        X, y = make_blobs(n_samples=n_rows, n_features=n_cols,
                          centers=n_classes)
    else:
        # fail with a clear message instead of an unbound-variable error
        raise ValueError('unknown dataset: {!r}'.format(dataset))
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    # correct case when not all classes made it into the training set
    if np.unique(y_train).size < n_classes:
        # overwrite the first n_classes labels so every class appears
        for i in range(n_classes):
            y_train[i] = i
    return X_train, X_test, y_train, y_test
コード例 #53
0
def test_select_kbest_zero():
    """k=0 must yield an empty selection and a warning when transforming."""
    X, y = make_classification(n_samples=20, n_features=10,
                               shuffle=False, random_state=0)

    kbest = SelectKBest(f_classif, k=0)
    kbest.fit(X, y)

    # no feature may be marked as selected
    nothing_selected = np.zeros(10, dtype=bool)
    assert_array_equal(kbest.get_support(), nothing_selected)

    # transform has to emit the warning and return a (20, 0) result
    warning_text = 'No features were selected'
    X_selected = assert_warns_message(UserWarning, warning_text,
                                      kbest.transform, X)
    assert_equal(X_selected.shape, (20, 0))
コード例 #54
0
ファイル: cache.py プロジェクト: yuyuz/FLASH
def main():
    """Run a scaler -> ANOVA filter -> linear SVC chain through cached_run."""
    from sklearn import svm
    from sklearn.datasets import samples_generator
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import f_regression
    from sklearn.preprocessing import MinMaxScaler

    X, y = samples_generator.make_classification(
        n_samples=1000, n_informative=5, n_redundant=4,
        random_state=_random_state)

    # the steps are handed over in execution order
    steps = [
        MinMaxScaler(),
        SelectKBest(f_regression, k=5),
        svm.SVC(kernel='linear'),
    ]
    cached_run(steps, X, y)
コード例 #55
0
def test_f_classif_multi_class():
    """Check that the F test gives sensible results on simulated 8-class data."""
    generator_args = dict(n_samples=200, n_features=20,
                          n_informative=3, n_redundant=2,
                          n_repeated=0, n_classes=8,
                          n_clusters_per_class=1, flip_y=0.0,
                          class_sep=10, shuffle=False, random_state=0)
    X, y = make_classification(**generator_args)

    F, pv = f_classif(X, y)

    # all statistics are positive and all p-values lie strictly in (0, 1)
    assert (F > 0).all()
    assert (pv > 0).all()
    assert (pv < 1).all()

    # the first five features must be significant, the remaining ones not
    leading_pv, trailing_pv = pv[:5], pv[5:]
    assert (leading_pv < 0.05).all()
    assert (trailing_pv > 1.e-4).all()
コード例 #56
0
 def prepare_data(self):
     """Prepare a shuffled two-class dataset with the label as last column."""
     # Data preparation.
     # Generate classification data: n_samples is the number of samples,
     # n_features the number of features per sample and n_classes the
     # number of classes.
     # n_features = n_redundant + n_informative + n_repeated,
     # n_clusters_per_class * n_classes <= 2 * n_informative
     generator_args = dict(n_samples=200,
                           n_features=2,
                           n_redundant=0,
                           n_informative=2,
                           n_classes=2)
     samples, classes = make_classification(**generator_args)
     X, y = shuffle(samples, classes, random_state=42)

     # float32 features with the labels appended as one extra column
     feature_block = X.astype(np.float32)
     label_column = y.reshape(-1, 1)
     return np.concatenate((feature_block, label_column), axis=1)
コード例 #57
0
ファイル: test.py プロジェクト: dmoliveira/malss
def test_classification_2classes_big():
    """Run MALSS on 200000 two-class samples and expect a single algorithm."""
    generator_args = dict(n_samples=200000,
                          n_features=20,
                          n_classes=2,
                          n_informative=3,
                          weights=[0.7, 0.3],
                          random_state=0)
    raw_X, raw_y = make_classification(**generator_args)

    # MALSS is given pandas containers
    frame = pd.DataFrame(raw_X)
    target = pd.Series(raw_y)
    cls = MALSS(frame, target, 'classification', n_jobs=3)
    cls.execute()
    # cls.make_report('test_classification_2classes_big')

    assert len(cls.algorithms) == 1
    assert cls.algorithms[0].best_score is not None
コード例 #58
0
def test_weight():
    """
    Check that class weights are honoured on sparse input: each classifier
    is fit on the first 180 samples with class 0 weighted by 5 and must
    predict at least 11 of the remaining 20 samples correctly.
    """
    generator_args = dict(n_samples=200,
                          n_features=100,
                          weights=[0.833, 0.167],
                          random_state=0)
    X_, y_ = make_classification(**generator_args)
    X_ = sparse.csr_matrix(X_)

    classifiers = (linear_model.LogisticRegression(),
                   svm.LinearSVC(),
                   svm.SVC())
    for clf in classifiers:
        clf.set_params(class_weight={0: 5})
        clf.fit(X_[:180], y_[:180])
        y_pred = clf.predict(X_[180:])
        n_correct = np.sum(y_pred == y_[180:])
        assert_true(n_correct >= 11)
コード例 #59
0
ファイル: test_featureset.py プロジェクト: BK-University/skll
def featureset_creation_from_dataframe_helper(with_labels, use_feature_hasher):
    """
    Helper for the two unit tests of FeatureSet.from_data_frame().

    Builds the same feature set twice - once directly and once from a
    pandas DataFrame - and returns both as ``(expected, current)``.
    Labels are optional, so ``with_labels`` selects whether they are
    included; ``use_feature_hasher`` selects whether a FeatureHasher
    is used as the vectorizer.
    """
    import pandas

    # Test data: 100 instances with 4 features each.
    X, y = make_classification(n_samples=100, n_features=4,
                               n_informative=4, n_redundant=0,
                               n_classes=3, random_state=1234567890)

    # IDs deliberately avoid 0 - 100, which pandas would use as its default
    # index anyway, so that the supplied ids are known to be the ones in use.
    ids = list(range(100, 200))

    featureset_name = 'test'

    # only use a vectorizer when feature hashing was requested
    feature_bins = 4
    if use_feature_hasher:
        vectorizer = FeatureHasher(n_features=feature_bins)
    else:
        vectorizer = None

    # one {feature name: value} dictionary per row
    feature_names = ['f{}'.format(n) for n in range(1, 5)]
    features = [dict(zip(feature_names, row)) for row in X]

    # The FeatureSet built directly from the data.
    direct_kwargs = {'labels': y} if with_labels else {}
    expected = FeatureSet(featureset_name, ids, features=features,
                          vectorizer=vectorizer, **direct_kwargs)

    # The FeatureSet built from an equivalent DataFrame.
    df = pandas.DataFrame(features, index=ids)
    frame_kwargs = {}
    if with_labels:
        df['y'] = y
        frame_kwargs['labels_column'] = 'y'
    current = FeatureSet.from_data_frame(df, featureset_name,
                                         vectorizer=vectorizer, **frame_kwargs)

    return (expected, current)