def _transform_selected(X, transform, selected="all", copy=True): """Apply a transform function to portion of selected features Parameters ---------- X : {array-like, sparse matrix}, shape [n_samples, n_features] Dense array or sparse matrix. transform : callable A callable transform(X) -> X_transformed copy : boolean, optional Copy X even if it could be avoided. selected: "all" or array of indices or mask Specify which features to apply the transform to. Returns ------- X : array or sparse matrix, shape=(n_samples, n_features_new) """ if isinstance(selected, str) and selected == "all": return transform(X) X = check_array(X, accept_sparse='csc', copy=copy, dtype=FLOAT_DTYPES) if len(selected) == 0: return X n_features = X.shape[1] ind = np.arange(n_features) sel = np.zeros(n_features, dtype=bool) sel[np.asarray(selected)] = True not_sel = np.logical_not(sel) n_selected = np.sum(sel) if n_selected == 0: # No features selected. return X elif n_selected == n_features: # All features selected. return transform(X) else: X_sel = transform(X[:, ind[sel]]) X_not_sel = X[:, ind[not_sel]] if sparse.issparse(X_sel) or sparse.issparse(X_not_sel): return sparse.hstack((X_sel, X_not_sel)) else: return np.hstack((X_sel, X_not_sel))
def _fit_transform(self, X): """Assumes X contains only categorical features.""" X = check_array(X, dtype=np.int) if np.any(X < 0): raise ValueError("X needs to contain only non-negative integers.") n_samples, n_features = X.shape if (isinstance(self.n_values, str) and self.n_values == 'auto'): n_values = np.max(X, axis=0) + 1 elif isinstance(self.n_values, numbers.Integral): if (np.max(X, axis=0) >= self.n_values).any(): raise ValueError("Feature out of bounds for n_values=%d" % self.n_values) n_values = np.empty(n_features, dtype=np.int) n_values.fill(self.n_values) else: try: n_values = np.asarray(self.n_values, dtype=int) except (ValueError, TypeError): raise TypeError("Wrong type for parameter `n_values`. Expected" " 'auto', int or array of ints, got %r" % type(X)) if n_values.ndim < 1 or n_values.shape[0] != X.shape[1]: raise ValueError("Shape mismatch: if n_values is an array," " it has to be of shape (n_features,).") self.n_values_ = n_values n_values = np.hstack([[0], n_values]) indices = np.cumsum(n_values) self.feature_indices_ = indices column_indices = (X + indices[:-1]).ravel() row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), n_features) data = np.ones(n_samples * n_features) out = sparse.coo_matrix((data, (row_indices, column_indices)), shape=(n_samples, indices[-1]), dtype=self.dtype).tocsr() if (isinstance(self.n_values, str) and self.n_values == 'auto'): mask = np.array(out.sum(axis=0)).ravel() != 0 active_features = np.where(mask)[0] out = out[:, active_features] self.active_features_ = active_features return out if self.sparse else out.toarray()
def _validate_X_predict(self, X, check_input): """Validate X whenever one tries to predict, apply, predict_proba""" if self.tree_ is None: raise NotFittedError("Estimator not fitted, " "call `fit` before exploiting the model.") if check_input: X = check_array(X, dtype=DTYPE, accept_sparse="csr") if issparse(X) and (X.indices.dtype != np.intc or X.indptr.dtype != np.intc): raise ValueError("No support for np.int64 index based " "sparse matrices") n_features = X.shape[1] if self.n_features_ != n_features: raise ValueError("Number of features of the model must " "match the input. Model n_features is %s and " "input n_features is %s " % (self.n_features_, n_features)) return X
def _transform(self, X): """Assumes X contains only categorical features.""" X = check_array(X, dtype=np.int) if np.any(X < 0): raise ValueError("X needs to contain only non-negative integers.") n_samples, n_features = X.shape indices = self.feature_indices_ if n_features != indices.shape[0] - 1: raise ValueError("X has different shape than during fitting." " Expected %d, got %d." % (indices.shape[0] - 1, n_features)) # We use only those categorical features of X that are known using fit. # i.e lesser than n_values_ using mask. # This means, if self.handle_unknown is "ignore", the row_indices and # col_indices corresponding to the unknown categorical feature are # ignored. mask = (X < self.n_values_).ravel() if np.any(~mask): if self.handle_unknown not in ['error', 'ignore']: raise ValueError("handle_unknown should be either error or " "unknown got %s" % self.handle_unknown) if self.handle_unknown == 'error': raise ValueError("unknown categorical feature present %s " "during transform." % X.ravel()[~mask]) column_indices = (X + indices[:-1]).ravel()[mask] row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), n_features)[mask] data = np.ones(np.sum(mask)) out = sparse.coo_matrix((data, (row_indices, column_indices)), shape=(n_samples, indices[-1]), dtype=self.dtype).tocsr() if (isinstance(self.n_values, str) and self.n_values == 'auto'): out = out[:, self.active_features_] return out if self.sparse else out.toarray()
def fit(self, X, y, group, sample_weight=None, check_input=True, X_idx_sorted=None): """Build a decision tree from the training set (X, y, group). Parameters ---------- X : array-like or sparse matrix, shape = [n_samples, n_features] The training input samples. Internally, it will be converted to ``dtype=np.float32`` and if a sparse matrix is provided to a sparse ``csc_matrix``. y : array-like, shape = [n_samples] or [n_samples, n_outputs] The target values (class labels in classification, real numbers in regression). In the regression case, use ``dtype=np.float64`` and ``order='C'`` for maximum efficiency. group : array-like, shape = [n_samples] or [n_samples, n_outputs] The group values, 0 for control, 1 for target. sample_weight : array-like, shape = [n_samples] or None Sample weights. If None, then samples are equally weighted. Splits that would create child nodes with net zero or negative weight are ignored while searching for a split in each node. In the case of classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. check_input : boolean, (default=True) Allow to bypass several input checking. Don't use this parameter unless you know what you do. X_idx_sorted : array-like, shape = [n_samples, n_features], optional The indexes of the sorted training input samples. If many tree are grown on the same dataset, this allows the ordering to be cached between trees. If None, the data will be sorted here. Don't use this parameter unless you know what to do. Returns ------- self : object Returns self. """ random_state = check_random_state(self.random_state) if check_input: X = check_array(X, dtype=DTYPE, accept_sparse="csc") y = check_array(y, ensure_2d=False, dtype=None) group = check_array(group, ensure_2d=False, dtype=None) if issparse(X): X.sort_indices() if X.indices.dtype != np.intc or X.indptr.dtype != np.intc: raise ValueError("No support for np.int64 index based " "sparse matrices") # Determine output settings n_samples, self.n_features_ = X.shape is_classification = isinstance(self, ClassifierMixin) y = np.atleast_1d(y) group = np.atleast_1d(group) expanded_class_weight = None if y.ndim == 1: y = np.reshape(y, (-1, 1)) if group.ndim == 1: group = np.reshape(group, (-1, 1)) self.n_outputs_ = y.shape[1] if is_classification: check_classification_targets(y) # Encode y & group together before passing to the builder. y = np.copy(2*group + y) self.classes_ = [] self.n_classes_ = [] if self.class_weight is not None: y_original = np.copy(y) y_encoded = np.zeros(y.shape, dtype=np.int) for k in range(self.n_outputs_): classes_k, y_encoded[:, k] = np.unique(y[:, k], return_inverse=True) self.classes_.append(classes_k) self.n_classes_.append(classes_k.shape[0]) y = y_encoded # TODO check if binary if self.class_weight is not None: expanded_class_weight = compute_sample_weight( self.class_weight, y_original) else: self.classes_ = [None] * self.n_outputs_ self.n_classes_ = [1] * self.n_outputs_ # TODO encode group and check if binary self.n_classes_ = np.array(self.n_classes_, dtype=np.intp) if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: y = np.ascontiguousarray(y, dtype=DOUBLE) # Check parameters max_depth = ((2 ** 31) - 1 if self.max_depth is None else self.max_depth) max_leaf_nodes = (-1 if self.max_leaf_nodes is None else self.max_leaf_nodes) if isinstance(self.min_samples_leaf, (numbers.Integral, np.integer)): min_samples_leaf = self.min_samples_leaf else: # float min_samples_leaf = int(ceil(self.min_samples_leaf * n_samples)) if isinstance(self.min_samples_split, (numbers.Integral, np.integer)): min_samples_split = self.min_samples_split else: # float min_samples_split = int(ceil(self.min_samples_split * n_samples)) min_samples_split = max(2, min_samples_split) min_samples_split = max(min_samples_split, 2 * min_samples_leaf) if isinstance(self.max_features, str): if self.max_features == "auto": if is_classification: max_features = max(1, int(np.sqrt(self.n_features_))) else: max_features = self.n_features_ elif self.max_features == "sqrt": max_features = max(1, int(np.sqrt(self.n_features_))) elif self.max_features == "log2": max_features = max(1, int(np.log2(self.n_features_))) else: raise ValueError( 'Invalid value for max_features. Allowed string ' 'values are "auto", "sqrt" or "log2".') elif self.max_features is None: max_features = self.n_features_ elif isinstance(self.max_features, (numbers.Integral, np.integer)): max_features = self.max_features else: # float if self.max_features > 0.0: max_features = max(1, int(self.max_features * self.n_features_)) else: max_features = 0 self.max_features_ = max_features if len(y) != n_samples: raise ValueError("Number of labels=%d does not match " "number of samples=%d" % (len(y), n_samples)) if not (0. < self.min_samples_split <= 1. or 2 <= self.min_samples_split): raise ValueError("min_samples_split must be in at least 2" " or in (0, 1], got %s" % min_samples_split) if not (0. < self.min_samples_leaf <= 0.5 or 1 <= self.min_samples_leaf): raise ValueError("min_samples_leaf must be at least than 1 " "or in (0, 0.5], got %s" % min_samples_leaf) if not 0 <= self.min_weight_fraction_leaf <= 0.5: raise ValueError("min_weight_fraction_leaf must in [0, 0.5]") if max_depth <= 0: raise ValueError("max_depth must be greater than zero. ") if not (0 < max_features <= self.n_features_): raise ValueError("max_features must be in (0, n_features]") if not isinstance(max_leaf_nodes, (numbers.Integral, np.integer)): raise ValueError("max_leaf_nodes must be integral number but was " "%r" % max_leaf_nodes) if -1 < max_leaf_nodes < 2: raise ValueError(("max_leaf_nodes {0} must be either smaller than " "0 or larger than 1").format(max_leaf_nodes)) if sample_weight is not None: if (getattr(sample_weight, "dtype", None) != DOUBLE or not sample_weight.flags.contiguous): sample_weight = np.ascontiguousarray( sample_weight, dtype=DOUBLE) if len(sample_weight.shape) > 1: raise ValueError("Sample weights array has more " "than one dimension: %d" % len(sample_weight.shape)) if len(sample_weight) != n_samples: raise ValueError("Number of weights=%d does not match " "number of samples=%d" % (len(sample_weight), n_samples)) if expanded_class_weight is not None: if sample_weight is not None: sample_weight = sample_weight * expanded_class_weight else: sample_weight = expanded_class_weight # Set min_weight_leaf from min_weight_fraction_leaf if self.min_weight_fraction_leaf != 0. and sample_weight is not None: min_weight_leaf = (self.min_weight_fraction_leaf * np.sum(sample_weight)) else: min_weight_leaf = 0. presort = self.presort # Allow presort to be 'auto', which means True if the dataset is dense, # otherwise it will be False. if self.presort == 'auto' and issparse(X): presort = False elif self.presort == 'auto': presort = True if presort is True and issparse(X): raise ValueError("Presorting is not supported for sparse " "matrices.") # If multiple trees are built on the same dataset, we only want to # presort once. Splitters now can accept presorted indices if desired, # but do not handle any presorting themselves. Ensemble algorithms # which desire presorting must do presorting themselves and pass that # matrix into each tree. if X_idx_sorted is None and presort: X_idx_sorted = np.asfortranarray(np.argsort(X, axis=0), dtype=np.int32) if presort and X_idx_sorted.shape != X.shape: raise ValueError("The shape of X (X.shape = {}) doesn't match " "the shape of X_idx_sorted (X_idx_sorted" ".shape = {})".format(X.shape, X_idx_sorted.shape)) # Build tree criterion = self.criterion if not isinstance(criterion, Criterion): if is_classification: criterion = CRITERIA_CLF[self.criterion](self.n_outputs_, self.n_classes_) else: criterion = CRITERIA_REG[self.criterion](self.n_outputs_) SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS splitter = self.splitter if not isinstance(self.splitter, Splitter): splitter = SPLITTERS[self.splitter](criterion, self.max_features_, min_samples_leaf, min_weight_leaf, random_state, self.presort) self.tree_ = Tree(self.n_features_, self.n_classes_, self.n_outputs_) # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise if max_leaf_nodes < 0: builder = DepthFirstTreeBuilder(splitter, min_samples_split, min_samples_leaf, min_weight_leaf, max_depth) else: builder = BestFirstTreeBuilder(splitter, min_samples_split, min_samples_leaf, min_weight_leaf, max_depth, max_leaf_nodes) builder.build(self.tree_, X, y, sample_weight, X_idx_sorted) if self.n_outputs_ == 1: self.n_classes_ = self.n_classes_[0] self.classes_ = self.classes_[0] return self
def fit(self, X, y, group, sample_weight=None, check_input=True, X_idx_sorted=None): """Build a decision tree from the training set (X, y, group). Parameters ---------- X : array-like or sparse matrix, shape = [n_samples, n_features] The training input samples. Internally, it will be converted to ``dtype=np.float32`` and if a sparse matrix is provided to a sparse ``csc_matrix``. y : array-like, shape = [n_samples] or [n_samples, n_outputs] The target values (class labels in classification, real numbers in regression). In the regression case, use ``dtype=np.float64`` and ``order='C'`` for maximum efficiency. group : array-like, shape = [n_samples] or [n_samples, n_outputs] The group values, 0 for control, 1 for target. sample_weight : array-like, shape = [n_samples] or None Sample weights. If None, then samples are equally weighted. Splits that would create child nodes with net zero or negative weight are ignored while searching for a split in each node. In the case of classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. check_input : boolean, (default=True) Allow to bypass several input checking. Don't use this parameter unless you know what you do. X_idx_sorted : array-like, shape = [n_samples, n_features], optional The indexes of the sorted training input samples. If many tree are grown on the same dataset, this allows the ordering to be cached between trees. If None, the data will be sorted here. Don't use this parameter unless you know what to do. Returns ------- self : object Returns self. """ random_state = check_random_state(self.random_state) if check_input: X = check_array(X, dtype=DTYPE, accept_sparse="csc") y = check_array(y, ensure_2d=False, dtype=None) group = check_array(group, ensure_2d=False, dtype=None) if issparse(X): X.sort_indices() if X.indices.dtype != np.intc or X.indptr.dtype != np.intc: raise ValueError("No support for np.int64 index based " "sparse matrices") # Determine output settings n_samples, self.n_features_ = X.shape is_classification = isinstance(self, ClassifierMixin) y = np.atleast_1d(y) group = np.atleast_1d(group) expanded_class_weight = None if y.ndim == 1: y = np.reshape(y, (-1, 1)) if group.ndim == 1: group = np.reshape(group, (-1, 1)) self.n_outputs_ = y.shape[1] if is_classification: check_classification_targets(y) # Encode y & group together before passing to the builder. y = np.copy(2 * group + y) self.classes_ = [] self.n_classes_ = [] if self.class_weight is not None: y_original = np.copy(y) y_encoded = np.zeros(y.shape, dtype=np.int) for k in range(self.n_outputs_): classes_k, y_encoded[:, k] = np.unique(y[:, k], return_inverse=True) self.classes_.append(classes_k) self.n_classes_.append(classes_k.shape[0]) y = y_encoded # TODO check if binary if self.class_weight is not None: expanded_class_weight = compute_sample_weight( self.class_weight, y_original) else: self.classes_ = [None] * self.n_outputs_ self.n_classes_ = [1] * self.n_outputs_ # TODO encode group and check if binary self.n_classes_ = np.array(self.n_classes_, dtype=np.intp) if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: y = np.ascontiguousarray(y, dtype=DOUBLE) # Check parameters max_depth = ((2**31) - 1 if self.max_depth is None else self.max_depth) max_leaf_nodes = (-1 if self.max_leaf_nodes is None else self.max_leaf_nodes) if isinstance(self.min_samples_leaf, (numbers.Integral, np.integer)): min_samples_leaf = self.min_samples_leaf else: # float min_samples_leaf = int(ceil(self.min_samples_leaf * n_samples)) if isinstance(self.min_samples_split, (numbers.Integral, np.integer)): min_samples_split = self.min_samples_split else: # float min_samples_split = int(ceil(self.min_samples_split * n_samples)) min_samples_split = max(2, min_samples_split) min_samples_split = max(min_samples_split, 2 * min_samples_leaf) if isinstance(self.max_features, str): if self.max_features == "auto": if is_classification: max_features = max(1, int(np.sqrt(self.n_features_))) else: max_features = self.n_features_ elif self.max_features == "sqrt": max_features = max(1, int(np.sqrt(self.n_features_))) elif self.max_features == "log2": max_features = max(1, int(np.log2(self.n_features_))) else: raise ValueError( 'Invalid value for max_features. Allowed string ' 'values are "auto", "sqrt" or "log2".') elif self.max_features is None: max_features = self.n_features_ elif isinstance(self.max_features, (numbers.Integral, np.integer)): max_features = self.max_features else: # float if self.max_features > 0.0: max_features = max(1, int(self.max_features * self.n_features_)) else: max_features = 0 self.max_features_ = max_features if len(y) != n_samples: raise ValueError("Number of labels=%d does not match " "number of samples=%d" % (len(y), n_samples)) if not (0. < self.min_samples_split <= 1. or 2 <= self.min_samples_split): raise ValueError("min_samples_split must be in at least 2" " or in (0, 1], got %s" % min_samples_split) if not (0. < self.min_samples_leaf <= 0.5 or 1 <= self.min_samples_leaf): raise ValueError("min_samples_leaf must be at least than 1 " "or in (0, 0.5], got %s" % min_samples_leaf) if not 0 <= self.min_weight_fraction_leaf <= 0.5: raise ValueError("min_weight_fraction_leaf must in [0, 0.5]") if max_depth <= 0: raise ValueError("max_depth must be greater than zero. ") if not (0 < max_features <= self.n_features_): raise ValueError("max_features must be in (0, n_features]") if not isinstance(max_leaf_nodes, (numbers.Integral, np.integer)): raise ValueError("max_leaf_nodes must be integral number but was " "%r" % max_leaf_nodes) if -1 < max_leaf_nodes < 2: raise ValueError(("max_leaf_nodes {0} must be either smaller than " "0 or larger than 1").format(max_leaf_nodes)) if sample_weight is not None: if (getattr(sample_weight, "dtype", None) != DOUBLE or not sample_weight.flags.contiguous): sample_weight = np.ascontiguousarray(sample_weight, dtype=DOUBLE) if len(sample_weight.shape) > 1: raise ValueError("Sample weights array has more " "than one dimension: %d" % len(sample_weight.shape)) if len(sample_weight) != n_samples: raise ValueError("Number of weights=%d does not match " "number of samples=%d" % (len(sample_weight), n_samples)) if expanded_class_weight is not None: if sample_weight is not None: sample_weight = sample_weight * expanded_class_weight else: sample_weight = expanded_class_weight # Set min_weight_leaf from min_weight_fraction_leaf if self.min_weight_fraction_leaf != 0. and sample_weight is not None: min_weight_leaf = (self.min_weight_fraction_leaf * np.sum(sample_weight)) else: min_weight_leaf = 0. presort = self.presort # Allow presort to be 'auto', which means True if the dataset is dense, # otherwise it will be False. if self.presort == 'auto' and issparse(X): presort = False elif self.presort == 'auto': presort = True if presort is True and issparse(X): raise ValueError("Presorting is not supported for sparse " "matrices.") # If multiple trees are built on the same dataset, we only want to # presort once. Splitters now can accept presorted indices if desired, # but do not handle any presorting themselves. Ensemble algorithms # which desire presorting must do presorting themselves and pass that # matrix into each tree. if X_idx_sorted is None and presort: X_idx_sorted = np.asfortranarray(np.argsort(X, axis=0), dtype=np.int32) if presort and X_idx_sorted.shape != X.shape: raise ValueError("The shape of X (X.shape = {}) doesn't match " "the shape of X_idx_sorted (X_idx_sorted" ".shape = {})".format(X.shape, X_idx_sorted.shape)) # Build tree criterion = self.criterion if not isinstance(criterion, Criterion): if is_classification: criterion = CRITERIA_CLF[self.criterion](self.n_outputs_, self.n_classes_) else: criterion = CRITERIA_REG[self.criterion](self.n_outputs_) SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS splitter = self.splitter if not isinstance(self.splitter, Splitter): splitter = SPLITTERS[self.splitter](criterion, self.max_features_, min_samples_leaf, min_weight_leaf, random_state, self.presort) self.tree_ = Tree(self.n_features_, self.n_classes_, self.n_outputs_) # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise if max_leaf_nodes < 0: builder = DepthFirstTreeBuilder(splitter, min_samples_split, min_samples_leaf, min_weight_leaf, max_depth) else: builder = BestFirstTreeBuilder(splitter, min_samples_split, min_samples_leaf, min_weight_leaf, max_depth, max_leaf_nodes) builder.build(self.tree_, X, y, sample_weight, X_idx_sorted) if self.n_outputs_ == 1: self.n_classes_ = self.n_classes_[0] self.classes_ = self.classes_[0] return self
def unique_labels(*ys): """Extract an ordered array of unique labels We don't allow: - mix of multilabel and multiclass (single label) targets - mix of label indicator matrix and anything else, because there are no explicit labels) - mix of label indicator matrices of different sizes - mix of string and integer labels At the moment, we also don't allow "multiclass-multioutput" input type. Parameters ---------- *ys : array-likes, Returns ------- out : numpy array of shape [n_unique_labels] An ordered array of unique labels. Examples -------- >>> from sklearn.utils.multiclass import unique_labels >>> unique_labels([3, 5, 5, 5, 7, 7]) array([3, 5, 7]) >>> unique_labels([1, 2, 3, 4], [2, 2, 3, 4]) array([1, 2, 3, 4]) >>> unique_labels([1, 2, 10], [5, 11]) array([ 1, 2, 5, 10, 11]) """ if not ys: raise ValueError('No argument has been passed.') # Check that we don't mix label format ys_types = set(type_of_target(x) for x in ys) if ys_types == set(["binary", "multiclass"]): ys_types = set(["multiclass"]) if len(ys_types) > 1: raise ValueError("Mix type of y not allowed, got types %s" % ys_types) label_type = ys_types.pop() # Check consistency for the indicator format if (label_type == "multilabel-indicator" and len( set(check_array(y, ['csr', 'csc', 'coo']).shape[1] for y in ys)) > 1): raise ValueError("Multi-label binary indicator input with " "different numbers of labels") # Get the unique set of labels _unique_labels = _FN_UNIQUE_LABELS.get(label_type, None) if not _unique_labels: raise ValueError("Unknown label type: %s" % repr(ys)) ys_labels = set(chain.from_iterable(_unique_labels(y) for y in ys)) # Check that we don't mix string type with number type if len(set(isinstance(label, str) for label in ys_labels)) > 1: raise ValueError("Mix of label input types (string and number)") return np.array(sorted(ys_labels))
def _unique_indicator(y): return np.arange(check_array(y, ['csr', 'csc', 'coo']).shape[1])
def unique_labels(*ys): """Extract an ordered array of unique labels We don't allow: - mix of multilabel and multiclass (single label) targets - mix of label indicator matrix and anything else, because there are no explicit labels) - mix of label indicator matrices of different sizes - mix of string and integer labels At the moment, we also don't allow "multiclass-multioutput" input type. Parameters ---------- *ys : array-likes, Returns ------- out : numpy array of shape [n_unique_labels] An ordered array of unique labels. Examples -------- >>> from sklearn.utils.multiclass import unique_labels >>> unique_labels([3, 5, 5, 5, 7, 7]) array([3, 5, 7]) >>> unique_labels([1, 2, 3, 4], [2, 2, 3, 4]) array([1, 2, 3, 4]) >>> unique_labels([1, 2, 10], [5, 11]) array([ 1, 2, 5, 10, 11]) """ if not ys: raise ValueError('No argument has been passed.') # Check that we don't mix label format ys_types = set(type_of_target(x) for x in ys) if ys_types == set(["binary", "multiclass"]): ys_types = set(["multiclass"]) if len(ys_types) > 1: raise ValueError("Mix type of y not allowed, got types %s" % ys_types) label_type = ys_types.pop() # Check consistency for the indicator format if (label_type == "multilabel-indicator" and len(set(check_array(y, ['csr', 'csc', 'coo']).shape[1] for y in ys)) > 1): raise ValueError("Multi-label binary indicator input with " "different numbers of labels") # Get the unique set of labels _unique_labels = _FN_UNIQUE_LABELS.get(label_type, None) if not _unique_labels: raise ValueError("Unknown label type: %s" % repr(ys)) ys_labels = set(chain.from_iterable(_unique_labels(y) for y in ys)) # Check that we don't mix string type with number type if len(set(isinstance(label, str) for label in ys_labels)) > 1: raise ValueError("Mix of label input types (string and number)") return np.array(sorted(ys_labels))