def _daal_type_of_target(y): valid = ((isinstance(y, (Sequence, spmatrix)) or hasattr(y, '__array__')) and not isinstance(y, str)) if not valid: raise ValueError('Expected array-like (array or non-string sequence), ' 'got %r' % y) sparse_pandas = (y.__class__.__name__ in ['SparseSeries', 'SparseArray']) if sparse_pandas: raise ValueError("y cannot be class 'SparseSeries' or 'SparseArray'") if is_multilabel(y): return 'multilabel-indicator' try: y = np.asarray(y) except ValueError: # Known to fail in numpy 1.3 for array of arrays return 'unknown' # The old sequence of sequences format try: if (not hasattr(y[0], '__array__') and isinstance(y[0], Sequence) and not isinstance(y[0], str)): raise ValueError('You appear to be using a legacy multi-label data' ' representation. Sequence of sequences are no' ' longer supported; use a binary array or sparse' ' matrix instead - the MultiLabelBinarizer' ' transformer can convert to this format.') except IndexError: pass # Invalid inputs if y.ndim > 2 or (y.dtype == object and len(y) != 0 and not isinstance(y.flat[0], str)): return 'unknown' # [[[1, 2]]] or [obj_1] and not ["label_1"] if y.ndim == 2 and y.shape[1] == 0: return 'unknown' # [[]] if y.ndim == 2 and y.shape[1] > 1: suffix = "-multioutput" # [[1, 2], [1, 2]] else: suffix = "" # [1, 2, 3] or [[1], [2], [3]] # check float and contains non-integer float values if y.dtype.kind == 'f' and np.any(y != y.astype(int)): # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.] _daal_assert_all_finite(y) return 'continuous' + suffix unique = np.sort(pd.unique(y.ravel())) if pandas_is_imported else np.unique(y) if (len(unique) > 2) or (y.ndim >= 2 and len(y[0]) > 1): result = ('multiclass' + suffix, None) # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]] else: result = ('binary', unique) # [1, 2] or [["a"], ["b"]] return result
def test_is_multilabel():
    """Check ``is_multilabel`` against every example group in ``EXAMPLES``.

    Groups whose name starts with ``"multilabel"`` must be detected as
    multilabel; every other group must not.
    """
    for group, group_examples in iteritems(EXAMPLES):
        expect_multilabel = group.startswith("multilabel")
        assert_ = assert_true if expect_multilabel else assert_false
        exp = "True" if expect_multilabel else "False"

        for example in group_examples:
            assert_(is_multilabel(example),
                    msg="is_multilabel(%r) should be %s" % (example, exp))
def test_is_multilabel():
    """Verify ``is_multilabel`` for each group of targets in ``EXAMPLES``.

    Only groups named with the ``'multilabel'`` prefix are expected to be
    reported as multilabel.
    """
    for group, group_examples in iteritems(EXAMPLES):
        if not group.startswith('multilabel'):
            check, expected = assert_false, 'False'
        else:
            check, expected = assert_true, 'True'

        for sample in group_examples:
            outcome = is_multilabel(sample)
            check(outcome,
                  msg='is_multilabel(%r) should be %s' % (sample, expected))
def test_is_multilabel():
    """Check ``is_multilabel`` on dense and sparse forms of ``EXAMPLES``.

    Dense examples are expected to be multilabel only in the
    ``"multilabel-indicator"`` group. Sparse conversions are expected to be
    multilabel only when the example in that group was itself defined sparse.
    """
    for group, group_examples in EXAMPLES.items():
        dense_exp = group in ["multilabel-indicator"]

        for example in group_examples:
            example_is_sparse = issparse(example)

            # Only mark explicitly defined sparse examples as valid sparse
            # multilabel-indicators
            sparse_exp = bool(
                group == "multilabel-indicator" and example_is_sparse
            )

            # Sparse inputs, and non-empty numeric 2-D array-likes, are also
            # exercised in each sparse matrix format.
            convertible = example_is_sparse
            if not convertible and hasattr(example, "__array__"):
                as_array = np.asarray(example)
                convertible = (
                    as_array.ndim == 2
                    and as_array.dtype.kind in "biuf"
                    and as_array.shape[1] > 0
                )

            if convertible:
                sparse_variants = [
                    to_sparse(example)
                    for to_sparse in (
                        coo_matrix,
                        csc_matrix,
                        csr_matrix,
                        dok_matrix,
                        lil_matrix,
                    )
                ]
                for variant in sparse_variants:
                    assert sparse_exp == is_multilabel(
                        variant
                    ), "is_multilabel(%r) should be %s" % (variant, sparse_exp)

            # Densify sparse examples before testing
            dense_example = example.toarray() if example_is_sparse else example

            assert dense_exp == is_multilabel(
                dense_example
            ), "is_multilabel(%r) should be %s" % (dense_example, dense_exp)
def test_is_multilabel():
    """Exercise ``is_multilabel`` on dense and sparse versions of ``EXAMPLES``.

    The dense form must be multilabel only for the ``'multilabel-indicator'``
    group; sparse conversions must be multilabel only when the original
    example in that group was already sparse.
    """
    for group, group_examples in iteritems(EXAMPLES):
        in_indicator_group = group in ['multilabel-indicator']
        dense_assert_ = assert_true if in_indicator_group else assert_false
        dense_exp = 'True' if in_indicator_group else 'False'

        for example in group_examples:
            given_sparse = issparse(example)

            # Only mark explicitly defined sparse examples as valid sparse
            # multilabel-indicators
            if group == 'multilabel-indicator' and given_sparse:
                sparse_assert_, sparse_exp = assert_true, 'True'
            else:
                sparse_assert_, sparse_exp = assert_false, 'False'

            # Decide whether the example can be turned into sparse matrices:
            # already sparse, or a non-empty numeric 2-D array-like.
            try_sparse = given_sparse
            if not try_sparse and hasattr(example, '__array__'):
                arr = np.asarray(example)
                try_sparse = (arr.ndim == 2
                              and arr.dtype.kind in 'biuf'
                              and arr.shape[1] > 0)

            if try_sparse:
                sparse_classes = [coo_matrix, csc_matrix, csr_matrix,
                                  dok_matrix, lil_matrix]
                examples_sparse = [cls(example) for cls in sparse_classes]
                for exmpl_sparse in examples_sparse:
                    sparse_assert_(is_multilabel(exmpl_sparse),
                                   msg=('is_multilabel(%r)'
                                        ' should be %s')
                                   % (exmpl_sparse, sparse_exp))

            # Densify sparse examples before testing
            if given_sparse:
                example = example.toarray()

            dense_assert_(is_multilabel(example),
                          msg='is_multilabel(%r) should be %s'
                          % (example, dense_exp))
def _prepare(self, X, Y):
    """Validate the targets and build the kernel list before training.

    Sets ``self.classes_``, ``self.multiclass_``, ``self.KL``, ``self.Y``
    and ``self.n_kernels``.

    Parameters
    ----------
    X : samples matrix or list of kernels
        Passed to ``process_list`` together with ``self.generator``.
    Y : array-like
        Target labels; checked with ``check_classification_targets``.

    Raises
    ------
    ValueError
        If ``Y`` contains fewer than two distinct classes.
    """
    check_classification_targets(Y)
    self.classes_ = np.unique(Y)

    n_classes = len(self.classes_)
    if n_classes < 2:
        # A single formatted message: passing the count as a second
        # positional argument made the exception render as a tuple.
        raise ValueError(
            "The number of classes has to be at least 2; got %d" % n_classes)

    self.multiclass_ = n_classes > 2 or is_multilabel(Y)

    # X can be a samples matrix or Kernels List
    KL = process_list(X, self.generator)
    self.KL, self.Y = check_KL_Y(KL, Y)
    self.n_kernels = len(self.KL)
def fit(self, X, y):
    """Implementation of the fitting function for the uncertainty-aware
    classifier.

    Validates the inputs and hyper-parameters, stores the training data and
    label information on the instance, then fits the ensemble through
    ``p.fit(self)``.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        The training input samples.
    y : array-like, shape (n_samples,) or (n_samples, n_outputs)
        The class labels

    Returns
    -------
    self : object
        Returns self.

    Raises
    ------
    NotFittedError
        If ``self.estimator`` has no ``predict_proba`` attribute.
    TypeError
        If ``mc_sample_size`` is not a float, ``n_mc_samples`` is not an
        int, or ``n_jobs`` is neither ``None`` nor an int.
    """
    self.random_state_ = check_random_state(self.random_state)

    # Check that X and y have correct shape
    X, y = check_X_y(X, y, multi_output=True)

    # Check whether base estimator supports probabilities
    if not hasattr(self.estimator, 'predict_proba'):
        # The exception type is kept for callers that already catch it; the
        # message no longer contains the whitespace that the in-string
        # backslash continuation used to embed.
        raise NotFittedError(
            "{0} does not support "
            "probabilistic predictions.".format(self.estimator))

    # Check if mc_sample_size is float
    if not isinstance(self.mc_sample_size, float):
        raise TypeError("Parameter mc_sample_size must be of type float.")

    # Check if n_mc_samples is integer
    if not isinstance(self.n_mc_samples, int):
        raise TypeError("Parameter n_mc_samples must be of type int.")

    # Check if n_jobs is integer (None is allowed)
    if self.n_jobs is not None and not isinstance(self.n_jobs, int):
        raise TypeError("Parameter n_jobs must be of type int.")

    # Store the number of outputs, classes for each output and complete
    # data seen during fit
    self.n_outputs_ = y.shape[1] if is_multilabel(y) else 1
    self.classes_ = unique_labels(y)
    self.X_ = X
    self.y_ = y

    # Now initialize and fit the ensemble
    self.n_samples_ = int(X.shape[0] * self.mc_sample_size)
    start_time = time.time()
    self.ensemble_ = p.fit(self)
    stop_time = time.time()

    if self.verbose >= 1:
        print(_message_with_time("UAClassifier", "fitting",
                                 stop_time - start_time))

    return self
def get_tag_counts(Y):
    """Count how many samples carry each tag.

    Parameters
    ----------
    Y : array-like
        The tags. If ``is_multilabel(Y)`` is true, ``Y`` is treated as an
        indicator array of shape [n_samples, n_classes]. Otherwise ``Y`` must
        provide ``.apply`` (presumably a pandas Series whose entries are
        collections of tags -- verify against callers).

    Returns
    -------
    tag_counts
        For indicator input, the sum of ``Y`` along axis 0 (one count per
        column). Otherwise, the number of occurrences of each distinct tag
        value, as produced by a pandas ``groupby(...).size()``.
    """
    if not is_multilabel(Y):
        # Expand each entry into its own row of tags, flatten them into a
        # single column and count the occurrences of every tag value.
        expanded = Y.apply(pd.Series)
        flattened = pd.DataFrame(expanded.stack())
        return flattened.groupby(0).size()

    return Y.sum(axis=0)
def test_is_multilabel():
    """Check ``is_multilabel`` on hand-written positive and negative targets.

    The examples are produced lazily so that each one is built immediately
    before it is checked.
    """
    def multilabel_targets():
        yield [[1], [2], [0, 1]]
        yield [[1], [2]]
        yield [[1], [2], []]
        yield [[1], [0, 2], []]
        yield np.random.randint(2, size=(10, 10))

    def other_targets():
        yield range(10)
        yield np.arange(10)
        yield np.reshape(np.arange(10), (1, -1))
        yield np.reshape(np.arange(10), (-1, 1))
        yield np.random.randint(2, size=(10, ))
        yield np.random.randint(2, size=(10, 1))

    for target in multilabel_targets():
        assert_true(is_multilabel(target))

    for target in other_targets():
        assert_false(is_multilabel(target))
def _populate_result(self, result: EvaluationResult, predictions, ground_truth, current_fold: int,
                     num_folds: int):
    """Compute evaluation measures for one fold and store them in ``result``.

    If ``ground_truth`` is multilabel, Hamming loss/accuracy, subset
    accuracy/0-1 loss and micro-, macro- and example-based precision, recall
    and F1 are stored. Otherwise predictions and ground truth are densified
    and flattened, and accuracy, 0-1 loss, precision, recall and F1 are
    stored.

    :param result:          The `EvaluationResult` the scores are put into
    :param predictions:     The predictions to be evaluated
    :param ground_truth:    The true labels the predictions are compared to
    :param current_fold:    Passed through to `result.put` with every score
    :param num_folds:       Passed through to `result.put` with every score
    """

    def record(measure, score):
        result.put(measure, score, current_fold, num_folds)

    if is_multilabel(ground_truth):
        hamming_loss = metrics.hamming_loss(ground_truth, predictions)
        record(HAMMING_LOSS, hamming_loss)
        record(HAMMING_ACCURACY, 1 - hamming_loss)

        subset_accuracy = metrics.accuracy_score(ground_truth, predictions)
        record(SUBSET_ACCURACY, subset_accuracy)
        record(SUBSET_ZERO_ONE_LOSS, 1 - subset_accuracy)

        # (averaging mode, precision measure, recall measure, F1 measure)
        averaged_measures = (
            ('micro', MICRO_PRECISION, MICRO_RECALL, MICRO_F1),
            ('macro', MACRO_PRECISION, MACRO_RECALL, MACRO_F1),
            ('samples', EX_BASED_PRECISION, EX_BASED_RECALL, EX_BASED_F1),
        )
        for average, precision_measure, recall_measure, f1_measure in averaged_measures:
            record(precision_measure,
                   metrics.precision_score(ground_truth, predictions, average=average, zero_division=1))
            record(recall_measure,
                   metrics.recall_score(ground_truth, predictions, average=average, zero_division=1))
            record(f1_measure,
                   metrics.f1_score(ground_truth, predictions, average=average, zero_division=1))
        return

    # Not multilabel: flatten both inputs to dense 1-D arrays first.
    predictions = np.ravel(enforce_dense(predictions, order='C', dtype=DTYPE_UINT8))
    ground_truth = np.ravel(enforce_dense(ground_truth, order='C', dtype=DTYPE_UINT8))

    accuracy = metrics.accuracy_score(ground_truth, predictions)
    record(ACCURACY, accuracy)
    record(ZERO_ONE_LOSS, 1 - accuracy)
    record(PRECISION, metrics.precision_score(ground_truth, predictions, zero_division=1))
    record(RECALL, metrics.recall_score(ground_truth, predictions, zero_division=1))
    record(F1, metrics.f1_score(ground_truth, predictions, zero_division=1))
def dataset_for_train_test_split(X_train, X_test, Y_train, Y_test,
                                 threshold=1, multi_word_queries=False,
                                 scaler='standard'):
    """Make dataset from a train-test-split

    This function scales the input data and generates queries and
    query-weights from the training set vocabulary.

    Parameters
    ----------
    X_train : array-like, shape = [n_train_samples, n_features]
        Training set data
    X_test : array-like, shape = [n_test_samples, n_features]
        Test set data.
    Y_train : array-like, shape = [n_train_samples, n_classes]
        Training set labels.
    Y_test : array-like, shape = [n_test_samples, n_classes]
        Test set labels.
    threshold : int, default: 1
        A query is kept only if its row sum in the relevance matrix is at
        least ``threshold`` in both the training set and the test set.
    multi_word_queries : bool, default: ``False``
        Generate multi-word queries from real-world user-queries for the
        Freesound dataset if set to ``True``. Ultimately calls
        :func:`cbar.preprocess.get_relevant_queries`
    scaler : str, 'standard' or 'robust', or None
        Use either :func:`sklearn.preprocessing.StandardScaler` or
        :func:`sklearn.preprocessing.RobustScaler` to scale the input data.
        Any falsy value (e.g. ``None``) disables scaling.

    Returns
    -------
    X_train : array-like, shape = [n_train_samples, n_features]
        The scaled training data.
    X_test : array-like, shape = [n_test_samples, n_features]
        The scaled test data.
    Y_train_bin : array-like, shape = [n_train_samples, n_classes]
        The training labels in binary indicator format.
    Y_test_bin : array-like, shape = [n_test_samples, n_classes]
        The test labels in binary indicator format.
    Q_vec : array-like, shape = [n_queries, n_classes]
        The query vectors to evaluate
    weights : array-like, shape = [n_queries]
        The weights used to weight the queries during evaluation. For
        one-word queries the weight for each query is the same. For
        multi-word queries the counts from the aggregated query-log of
        user-queries are used to weight the queries accordingly.

    Raises
    ------
    ValueError
        If ``scaler`` is truthy but is neither ``'standard'`` nor
        ``'robust'``.
    """
    if scaler:
        # Fail early with a clear message; an unknown name used to surface
        # as an AttributeError on ``None`` when the scaler was first used.
        if scaler not in ('standard', 'robust'):
            raise ValueError(
                "scaler must be 'standard', 'robust' or None; got %r"
                % (scaler,))
        scale = StandardScaler() if scaler == 'standard' else RobustScaler()
        X_train = scale.fit_transform(X_train)
        X_test = scale.transform(X_test)

    mlb = MultiLabelBinarizer()
    if is_multilabel(Y_train):
        # Labels are already in indicator format; use them as they are.
        Y_train_bin = Y_train
        Y_test_bin = Y_test
    else:
        # Fit on train and test labels together so both share one vocabulary.
        mlb.fit(np.append(Y_train, Y_test))
        Y_train_bin = mlb.transform(Y_train)
        Y_test_bin = mlb.transform(Y_test)

    n_classes = Y_train_bin.shape[1]

    if multi_word_queries:
        Q = load_freesound_queries(Y_train)
        # NOTE(review): ``mlb`` is only fitted in the non-indicator branch
        # above; with indicator labels this transform runs on an unfitted
        # binarizer -- confirm that combination is never used.
        Q_bin = mlb.transform(Q.q)
    else:
        # One single-class query per class.
        Q_bin = np.eye(n_classes)

    Y_train_rel = make_relevance_matrix(Q_bin, Y_train_bin)
    Y_test_rel = make_relevance_matrix(Q_bin, Y_test_bin)

    # only keep queries that have at least X relevant sounds in train and test
    mask = np.logical_and(Y_train_rel.sum(axis=1) >= threshold,
                          Y_test_rel.sum(axis=1) >= threshold)

    Q_bin_final = Q_bin[mask]

    if multi_word_queries:
        Q_reduced = Q[mask]
        weights = Q_reduced.cnt / Q_reduced.cnt.sum()
    else:
        # Uniform weights over the remaining queries.
        weights = np.ones(Q_bin_final.shape[0]) / float(Q_bin_final.shape[0])

    idf = inverse_document_frequency(Y_train_bin)
    Q_vec = query_weights(Q_bin_final, idf)

    return (X_train, X_test, Y_train_bin, Y_test_bin, Q_vec, weights)