Example #1
def _daal_type_of_target(y):
    valid = ((isinstance(y, (Sequence, spmatrix)) or hasattr(y, '__array__'))
             and not isinstance(y, str))

    if not valid:
        raise ValueError('Expected array-like (array or non-string sequence), '
                         'got %r' % y)

    sparse_pandas = (y.__class__.__name__ in ['SparseSeries', 'SparseArray'])
    if sparse_pandas:
        raise ValueError("y cannot be class 'SparseSeries' or 'SparseArray'")

    if is_multilabel(y):
        return 'multilabel-indicator'

    try:
        y = np.asarray(y)
    except ValueError:
        # Known to fail in numpy 1.3 for array of arrays
        return 'unknown'

    # The old sequence of sequences format
    try:
        if (not hasattr(y[0], '__array__') and isinstance(y[0], Sequence)
                and not isinstance(y[0], str)):
            raise ValueError('You appear to be using a legacy multi-label data'
                             ' representation. Sequence of sequences are no'
                             ' longer supported; use a binary array or sparse'
                             ' matrix instead - the MultiLabelBinarizer'
                             ' transformer can convert to this format.')
    except IndexError:
        pass

    # Invalid inputs
    if y.ndim > 2 or (y.dtype == object and len(y) != 0 and
                      not isinstance(y.flat[0], str)):
        return 'unknown'  # [[[1, 2]]] or [obj_1] and not ["label_1"]

    if y.ndim == 2 and y.shape[1] == 0:
        return 'unknown'  # [[]]

    if y.ndim == 2 and y.shape[1] > 1:
        suffix = "-multioutput"  # [[1, 2], [1, 2]]
    else:
        suffix = ""  # [1, 2, 3] or [[1], [2], [3]]

    # check float and contains non-integer float values
    if y.dtype.kind == 'f' and np.any(y != y.astype(int)):
        # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.]
        _daal_assert_all_finite(y)
        return 'continuous' + suffix

    unique = np.sort(pd.unique(y.ravel())) if pandas_is_imported else np.unique(y)

    if (len(unique) > 2) or (y.ndim >= 2 and len(y[0]) > 1):
        result = ('multiclass' + suffix, None)  # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]]
    else:
        result = ('binary', unique)  # [1, 2] or [["a"], ["b"]]
    return result
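
This mirrors the public helper sklearn.utils.multiclass.type_of_target, except that the final branches here return a (type_string, unique_labels) tuple rather than just the string. A quick sketch of the categories, using the public helper:

import numpy as np
from sklearn.utils.multiclass import type_of_target

# Each call prints the target-type string that the helper above computes.
print(type_of_target([1, 2]))                      # 'binary'
print(type_of_target([1, 2, 3]))                   # 'multiclass'
print(type_of_target([0.1, 0.6, 0.3]))             # 'continuous'
print(type_of_target(np.array([[1, 0], [0, 1]])))  # 'multilabel-indicator'
print(type_of_target(np.array([[1, 2], [3, 1]])))  # 'multiclass-multioutput'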
Example #2
def test_is_multilabel():
    for group, group_examples in iteritems(EXAMPLES):
        if group.startswith("multilabel"):
            assert_, exp = assert_true, "True"
        else:
            assert_, exp = assert_false, "False"
        for example in group_examples:
            assert_(is_multilabel(example), msg="is_multilabel(%r) should be %s" % (example, exp))
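
The EXAMPLES table these tests iterate over maps target-type group names to lists of example targets. A minimal, hypothetical stand-in to make the loop concrete (sklearn's real table is far larger):

import numpy as np

# Hypothetical, abbreviated stand-in for the EXAMPLES table:
# keys are target-type group names, values are lists of example targets.
EXAMPLES = {
    'multilabel-indicator': [np.array([[1, 0], [0, 1]])],
    'binary': [[0, 1, 1, 0]],
    'multiclass': [[0, 1, 2, 2]],
    'continuous': [[0.1, 0.6, 0.3]],
}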
Example #3
def test_is_multilabel():
    for group, group_examples in iteritems(EXAMPLES):
        if group.startswith('multilabel'):
            assert_, exp = assert_true, 'True'
        else:
            assert_, exp = assert_false, 'False'
        for example in group_examples:
            assert_(is_multilabel(example),
                    msg='is_multilabel(%r) should be %s' % (example, exp))
Example #4
def test_is_multilabel():
    for group, group_examples in iteritems(EXAMPLES):
        if group.startswith('multilabel'):
            assert_, exp = assert_true, 'True'
        else:
            assert_, exp = assert_false, 'False'
        for example in group_examples:
            assert_(is_multilabel(example),
                    msg='is_multilabel(%r) should be %s' % (example, exp))
Example #5
def test_is_multilabel():
    for group, group_examples in EXAMPLES.items():
        if group in ["multilabel-indicator"]:
            dense_exp = True
        else:
            dense_exp = False

        for example in group_examples:
            # Only mark explicitly defined sparse examples as valid sparse
            # multilabel-indicators
            if group == "multilabel-indicator" and issparse(example):
                sparse_exp = True
            else:
                sparse_exp = False

            if issparse(example) or (
                hasattr(example, "__array__")
                and np.asarray(example).ndim == 2
                and np.asarray(example).dtype.kind in "biuf"
                and np.asarray(example).shape[1] > 0
            ):
                examples_sparse = [
                    sparse_matrix(example)
                    for sparse_matrix in [
                        coo_matrix,
                        csc_matrix,
                        csr_matrix,
                        dok_matrix,
                        lil_matrix,
                    ]
                ]
                for exmpl_sparse in examples_sparse:
                    assert sparse_exp == is_multilabel(
                        exmpl_sparse
                    ), "is_multilabel(%r) should be %s" % (exmpl_sparse, sparse_exp)

            # Densify sparse examples before testing
            if issparse(example):
                example = example.toarray()

            assert dense_exp == is_multilabel(
                example
            ), "is_multilabel(%r) should be %s" % (example, dense_exp)
Example #6
def test_is_multilabel():
    for group, group_examples in iteritems(EXAMPLES):
        if group in ['multilabel-indicator']:
            dense_assert_, dense_exp = assert_true, 'True'
        else:
            dense_assert_, dense_exp = assert_false, 'False'

        for example in group_examples:
            # Only mark explicitly defined sparse examples as valid sparse
            # multilabel-indicators
            if group == 'multilabel-indicator' and issparse(example):
                sparse_assert_, sparse_exp = assert_true, 'True'
            else:
                sparse_assert_, sparse_exp = assert_false, 'False'

            if (issparse(example) or
                (hasattr(example, '__array__') and
                 np.asarray(example).ndim == 2 and
                 np.asarray(example).dtype.kind in 'biuf' and
                 np.asarray(example).shape[1] > 0)):
                examples_sparse = [sparse_matrix(example)
                                   for sparse_matrix in [coo_matrix,
                                                         csc_matrix,
                                                         csr_matrix,
                                                         dok_matrix,
                                                         lil_matrix]]
                for exmpl_sparse in examples_sparse:
                    sparse_assert_(is_multilabel(exmpl_sparse),
                                   msg=('is_multilabel(%r)'
                                   ' should be %s')
                                   % (exmpl_sparse, sparse_exp))

            # Densify sparse examples before testing
            if issparse(example):
                example = example.toarray()

            dense_assert_(is_multilabel(example),
                          msg='is_multilabel(%r) should be %s'
                          % (example, dense_exp))
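
Both versions of this test rebuild each eligible example in every SciPy sparse format before asserting. The behavior under test can be reproduced directly; a minimal sketch:

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.utils.multiclass import is_multilabel

dense = np.array([[1, 0], [0, 1]])
print(is_multilabel(dense))                 # True: 2-D binary indicator
print(is_multilabel(csr_matrix(dense)))     # True: sparse indicators count too
print(is_multilabel(np.array([[1], [0]])))  # False: needs more than one column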
Example #7
    def _prepare(self, X, Y):
        '''preprocess data before training'''
        check_classification_targets(Y)
        self.classes_ = np.unique(Y)
        if len(self.classes_) < 2:
            raise ValueError("The number of classes has to be at least 2; "
                             "got %d" % len(self.classes_))
        self.multiclass_ = len(self.classes_) > 2 or is_multilabel(Y)

        KL = process_list(
            X, self.generator)  # X can be a samples matrix or Kernels List
        self.KL, self.Y = check_KL_Y(KL, Y)
        self.n_kernels = len(self.KL)
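
The multiclass_ flag treats a problem as multiclass when there are more than two classes or when the targets are multilabel. A small illustration of that rule (needs_multiclass_strategy is a hypothetical name for the same test):

import numpy as np
from sklearn.utils.multiclass import is_multilabel

def needs_multiclass_strategy(Y):
    # Same rule as self.multiclass_ above.
    return len(np.unique(Y)) > 2 or is_multilabel(Y)

print(needs_multiclass_strategy(np.array([0, 1, 1, 0])))      # False: binary
print(needs_multiclass_strategy(np.array([0, 1, 2])))         # True: 3 classes
print(needs_multiclass_strategy(np.array([[1, 0], [0, 1]])))  # True: multilabel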
Example #8
    def fit(self, X, y):
        """Implementation of the fitting function for the uncertainty-aware classifier.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The training input samples.
        y : array-like, shape (n_samples,) or (n_samples, n_outputs)
            The class labels.

        Returns
        -------
        self : object
            Returns self.
        """
        self.random_state_ = check_random_state(self.random_state)
        # Check that X and y have correct shape
        X, y = check_X_y(X, y, multi_output=True)
        # Check whether base estimator supports probabilities
        if not hasattr(self.estimator, 'predict_proba'):
            raise NotFittedError("{0} does not support \
                    probabilistic predictions.".format(self.estimator))
        # Check if mc_sample_size is float
        if not isinstance(self.mc_sample_size, float):
            raise TypeError("Parameter mc_sample_size must be of type float.")
        # Check if n_mc_samples is integer
        if not isinstance(self.n_mc_samples, int):
            raise TypeError("Parameter n_mc_samples must be of type int.")
        # Check if n_jobs is integer
        if self.n_jobs is not None:
            if not isinstance(self.n_jobs, int):
                raise TypeError("Parameter n_jobs must be of type int.")
        # Store the number of outputs, classes for each output and complete data seen during fit
        if is_multilabel(y):
            self.n_outputs_ = y.shape[1]
        else:
            self.n_outputs_ = 1
        self.classes_ = unique_labels(y)
        self.X_ = X
        self.y_ = y
        # Now initialize and fit the ensemble
        self.n_samples_ = int(X.shape[0] * self.mc_sample_size)
        start_time = time.time()
        self.ensemble_ = p.fit(self)
        stop_time = time.time()
        if self.verbose >= 1:
            print(
                _message_with_time("UAClassifier", "fitting",
                                   stop_time - start_time))

        return self
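
A plausible usage sketch, assuming a constructor that accepts the attributes fit() validates (estimator, mc_sample_size, n_mc_samples); the constructor itself is not shown in the excerpt:

# Hypothetical usage; UAClassifier's constructor is assumed, not shown above.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=100, random_state=0)
clf = UAClassifier(estimator=RandomForestClassifier(),  # needs predict_proba
                   mc_sample_size=0.5,  # float: fraction of samples per draw
                   n_mc_samples=25)     # int: ensemble size
clf.fit(X, y)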
Example #9
def get_tag_counts(Y):
    """Compute the total number of documents that contain term t for all terms
    in the vocabulary.

    Parameters
    ----------
    Y : ndarray, shape = [n_samples, n_classes]
        The tags as a multilabel indicator array (or a list of lists).

    Returns
    -------
    tag_counts : ndarray, shape = [n_classes]
        The number of documents that contain each tag in the vocabulary.
    """
    if is_multilabel(Y):
        return Y.sum(axis=0)
    else:
        return pd.DataFrame(Y.apply(pd.Series).stack()).groupby(0).size()
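
For a binary indicator array the multilabel branch reduces to a column sum, for example:

import numpy as np

# Three documents, three tags; each column counts the documents with that tag.
Y = np.array([[1, 0, 1],
              [1, 1, 0],
              [1, 0, 0]])
print(Y.sum(axis=0))  # [3 1 1] -- what get_tag_counts(Y) returns here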
Example #10
def test_is_multilabel():
    assert_true(is_multilabel([[1], [2], [0, 1]]))
    assert_true(is_multilabel([[1], [2]]))
    assert_true(is_multilabel([[1], [2], []]))
    assert_true(is_multilabel([[1], [0, 2], []]))
    assert_true(is_multilabel(np.random.randint(2, size=(10, 10))))

    assert_false(is_multilabel(range(10)))
    assert_false(is_multilabel(np.arange(10)))
    assert_false(is_multilabel(np.reshape(np.arange(10), (1, -1))))
    assert_false(is_multilabel(np.reshape(np.arange(10), (-1, 1))))
    assert_false(is_multilabel(np.random.randint(2, size=(10, ))))
    assert_false(is_multilabel(np.random.randint(2, size=(10, 1))))
Example #11
def test_is_multilabel():
    assert_true(is_multilabel([[1], [2], [0, 1]]))
    assert_true(is_multilabel([[1], [2]]))
    assert_true(is_multilabel([[1], [2], []]))
    assert_true(is_multilabel([[1], [0, 2], []]))
    assert_true(is_multilabel(np.random.randint(2, size=(10, 10))))

    assert_false(is_multilabel(range(10)))
    assert_false(is_multilabel(np.arange(10)))
    assert_false(is_multilabel(np.reshape(np.arange(10), (1, -1))))
    assert_false(is_multilabel(np.reshape(np.arange(10), (-1, 1))))
    assert_false(is_multilabel(np.random.randint(2, size=(10, ))))
    assert_false(is_multilabel(np.random.randint(2, size=(10, 1))))
Example #12
    def _populate_result(self, result: EvaluationResult, predictions,
                         ground_truth, current_fold: int, num_folds: int):
        if is_multilabel(ground_truth):
            hamming_loss = metrics.hamming_loss(ground_truth, predictions)
            result.put(HAMMING_LOSS, hamming_loss, current_fold, num_folds)
            result.put(HAMMING_ACCURACY, 1 - hamming_loss, current_fold,
                       num_folds)
            subset_accuracy = metrics.accuracy_score(ground_truth, predictions)
            result.put(SUBSET_ACCURACY, subset_accuracy, current_fold,
                       num_folds)
            result.put(SUBSET_ZERO_ONE_LOSS, 1 - subset_accuracy, current_fold,
                       num_folds)
            result.put(
                MICRO_PRECISION,
                metrics.precision_score(ground_truth,
                                        predictions,
                                        average='micro',
                                        zero_division=1), current_fold,
                num_folds)
            result.put(
                MICRO_RECALL,
                metrics.recall_score(ground_truth,
                                     predictions,
                                     average='micro',
                                     zero_division=1), current_fold, num_folds)
            result.put(
                MICRO_F1,
                metrics.f1_score(ground_truth,
                                 predictions,
                                 average='micro',
                                 zero_division=1), current_fold, num_folds)
            result.put(
                MACRO_PRECISION,
                metrics.precision_score(ground_truth,
                                        predictions,
                                        average='macro',
                                        zero_division=1), current_fold,
                num_folds)
            result.put(
                MACRO_RECALL,
                metrics.recall_score(ground_truth,
                                     predictions,
                                     average='macro',
                                     zero_division=1), current_fold, num_folds)
            result.put(
                MACRO_F1,
                metrics.f1_score(ground_truth,
                                 predictions,
                                 average='macro',
                                 zero_division=1), current_fold, num_folds)
            result.put(
                EX_BASED_PRECISION,
                metrics.precision_score(ground_truth,
                                        predictions,
                                        average='samples',
                                        zero_division=1), current_fold,
                num_folds)
            result.put(
                EX_BASED_RECALL,
                metrics.recall_score(ground_truth,
                                     predictions,
                                     average='samples',
                                     zero_division=1), current_fold, num_folds)
            result.put(
                EX_BASED_F1,
                metrics.f1_score(ground_truth,
                                 predictions,
                                 average='samples',
                                 zero_division=1), current_fold, num_folds)
        else:
            predictions = np.ravel(
                enforce_dense(predictions, order='C', dtype=DTYPE_UINT8))
            ground_truth = np.ravel(
                enforce_dense(ground_truth, order='C', dtype=DTYPE_UINT8))
            accuracy = metrics.accuracy_score(ground_truth, predictions)
            result.put(ACCURACY, accuracy, current_fold, num_folds)
            result.put(ZERO_ONE_LOSS, 1 - accuracy, current_fold, num_folds)
            result.put(
                PRECISION,
                metrics.precision_score(ground_truth,
                                        predictions,
                                        zero_division=1), current_fold,
                num_folds)
            result.put(
                RECALL,
                metrics.recall_score(ground_truth,
                                     predictions,
                                     zero_division=1), current_fold, num_folds)
            result.put(
                F1, metrics.f1_score(ground_truth,
                                     predictions,
                                     zero_division=1), current_fold, num_folds)
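
The multilabel branch leans entirely on sklearn.metrics; a compact sketch of the metrics and averaging modes it exercises, on toy predictions:

import numpy as np
from sklearn import metrics

y_true = np.array([[1, 0, 1], [0, 1, 0]])
y_pred = np.array([[1, 0, 0], [0, 1, 0]])

print(metrics.hamming_loss(y_true, y_pred))    # fraction of wrong label slots
print(metrics.accuracy_score(y_true, y_pred))  # subset accuracy: exact rows only
for avg in ('micro', 'macro', 'samples'):
    # zero_division=1 matches the excerpt: undefined scores are counted as 1.0
    print(avg, metrics.f1_score(y_true, y_pred, average=avg, zero_division=1))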
Example #13
def dataset_for_train_test_split(X_train, X_test, Y_train, Y_test, threshold=1,
                                 multi_word_queries=False, scaler='standard'):
    """Make dataset from a train-test-split

    This function scales the input data and generates queries and query-weights
    from the training set vocabulary.

    Parameters
    ----------
    X_train : array-like, shape = [n_train_samples, n_features]
        Training set data

    X_test : array-like, shape = [n_test_samples, n_features]
        Test set data.

    Y_train : array-like, shape = [n_train_samples, n_classes]
        Training set labels.

    Y_test : array-like, shape = [n_test_samples, n_classes]
        Test set labels.

    threshold : int, default: 1
        The minimum number of relevant sounds a query must have in both the
        training and the test set to be kept for evaluation.

    multi_word_queries : bool, default: ``False``
        Generate multi-word queries from real-world user-queries for the
        Freesound dataset if set to ``True``. Ultimately calls
        :func:`cbar.preprocess.get_relevant_queries`

    scaler : str, 'standard' or 'robust', or None
        Use either :func:`sklearn.preprocessing.StandardScaler` or
        :func:`sklearn.preprocessing.RobustScaler` to scale the input data

    Returns
    -------
    X_train : array-like, shape = [n_train_samples, n_features]
        The scaled training data.

    X_test : array-like, shape = [n_test_samples, n_features]
        The scaled test data.

    Y_train_bin : array-like, shape = [n_train_samples, n_classes]
        The training labels in binary indicator format.

    Y_test_bin : array-like, shape = [n_test_samples, n_classes]
        The test labels in binary indicator format.

    Q_vec : array-like, shape = [n_queries, n_classes]
        The query vectors to evaluate.

    weights : array-like, shape = [n_queries]
        The weights used to weight the queries during evaluation. For one-word
        queries the weight for each query is the same. For multi-word queries
        the counts from the aggregated query-log of user-queries are used
        to weight the queries accordingly.
    """
    if scaler:
        scale = dict(standard=StandardScaler(),
                     robust=RobustScaler()).get(scaler)
        X_train = scale.fit_transform(X_train)
        X_test = scale.transform(X_test)

    mlb = MultiLabelBinarizer()

    if is_multilabel(Y_train):
        Y_train_bin = Y_train
        Y_test_bin = Y_test
    else:
        mlb.fit(np.append(Y_train, Y_test))
        Y_train_bin = mlb.transform(Y_train)
        Y_test_bin = mlb.transform(Y_test)

    n_classes = Y_train_bin.shape[1]

    if multi_word_queries:
        Q = load_freesound_queries(Y_train)
        Q_bin = mlb.transform(Q.q)
    else:
        Q_bin = np.eye(n_classes)

    Y_train_rel = make_relevance_matrix(Q_bin, Y_train_bin)
    Y_test_rel = make_relevance_matrix(Q_bin, Y_test_bin)

    # keep only queries with at least `threshold` relevant sounds in train and test
    mask = np.logical_and(Y_train_rel.sum(axis=1) >= threshold,
                          Y_test_rel.sum(axis=1) >= threshold)
    Q_bin_final = Q_bin[mask]

    if multi_word_queries:
        Q_reduced = Q[mask]
        weights = Q_reduced.cnt / Q_reduced.cnt.sum()
    else:
        weights = np.ones(Q_bin_final.shape[0]) / float(Q_bin_final.shape[0])

    idf = inverse_document_frequency(Y_train_bin)
    Q_vec = query_weights(Q_bin_final, idf)

    return (X_train, X_test,
            Y_train_bin, Y_test_bin,
            Q_vec, weights)
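
MultiLabelBinarizer's role here is simply to turn label lists into the binary indicator format that is_multilabel recognizes; a minimal sketch:

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.utils.multiclass import is_multilabel

mlb = MultiLabelBinarizer()
Y_bin = mlb.fit_transform([['rock'], ['rock', 'jazz'], ['jazz']])
print(mlb.classes_)          # ['jazz' 'rock']
print(Y_bin)                 # [[0 1] [1 1] [1 0]]
print(is_multilabel(Y_bin))  # True -- the multilabel branch above is taken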