def test_label_binarize_with_class_order():
    """Check that output columns follow the order given in ``classes``."""
    cases = (
        # (classes, expected indicator rows for y = [1, 6])
        ([1, 2, 4, 6], [[1, 0, 0, 0], [0, 0, 0, 1]]),
        # Modified class order
        ([1, 6, 4, 2], [[1, 0, 0, 0], [0, 1, 0, 0]]),
    )
    for class_order, rows in cases:
        binarized = label_binarize([1, 6], classes=class_order)
        assert_array_equal(binarized, np.array(rows))
def test_label_binarizer_errors():
    """Check that invalid arguments yield ValueError.

    Covers LabelBinarizer (construction, fit, transform, inverse_transform),
    the private ``_inverse_binarize_thresholding`` helper and
    ``label_binarize``.
    """
    one_class = np.array([0, 0, 0, 0])
    lb = LabelBinarizer().fit(one_class)

    # Sequence-of-tuples input given to a binarizer fitted on a 1d target
    multi_label = [(2, 3), (0, ), (0, 2)]
    with pytest.raises(ValueError):
        lb.transform(multi_label)

    # Binarizer that has not been fitted, called with empty input
    lb = LabelBinarizer()
    with pytest.raises(ValueError):
        lb.transform([])
    with pytest.raises(ValueError):
        lb.inverse_transform([])

    # Label/sparsity argument combinations expected to be rejected
    with pytest.raises(ValueError):
        LabelBinarizer(neg_label=2, pos_label=1)
    with pytest.raises(ValueError):
        LabelBinarizer(neg_label=2, pos_label=2)
    with pytest.raises(ValueError):
        LabelBinarizer(neg_label=1, pos_label=2, sparse_output=True)

    # Fail on y_type
    with pytest.raises(ValueError):
        _inverse_binarize_thresholding(
            y=csr_matrix([[1, 2], [2, 1]]),
            output_type="foo",
            classes=[1, 2],
            threshold=0,
        )

    # Sequence of seq type should raise ValueError
    y_seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]]
    with pytest.raises(ValueError):
        LabelBinarizer().fit_transform(y_seq_of_seqs)

    # Fail on the number of classes
    with pytest.raises(ValueError):
        _inverse_binarize_thresholding(
            y=csr_matrix([[1, 2], [2, 1]]),
            output_type="foo",
            classes=[1, 2, 3],
            threshold=0,
        )

    # Fail on the dimension of 'binary'
    with pytest.raises(ValueError):
        _inverse_binarize_thresholding(
            y=np.array([[1, 2, 3], [2, 1, 3]]),
            output_type="binary",
            classes=[1, 2, 3],
            threshold=0,
        )

    # Fail on multioutput data
    with pytest.raises(ValueError):
        LabelBinarizer().fit(np.array([[1, 3], [2, 1]]))
    with pytest.raises(ValueError):
        # ``classes`` is passed by keyword: a positional ``classes`` is
        # rejected with TypeError by scikit-learn releases where the argument
        # is keyword-only, which would mask the ValueError under test.
        label_binarize(np.array([[1, 3], [2, 1]]), classes=[1, 2, 3])
def test_label_binarize_multiclass():
    """Check binarization of a three-class target with pos_label=2.

    Runs the shared ``check_binarized_results`` checks, then verifies that
    requesting sparse output together with ``neg_label=-1`` raises
    ValueError.
    """
    y = [0, 1, 2]
    classes = [0, 1, 2]
    pos_label = 2
    neg_label = 0
    # One row per sample, with pos_label on the diagonal
    expected = 2 * np.eye(3)

    check_binarized_results(y, classes, pos_label, neg_label, expected)

    with pytest.raises(ValueError):
        # ``classes`` by keyword so the call is valid on scikit-learn
        # releases where it is keyword-only.
        label_binarize(y, classes=classes, neg_label=-1,
                       pos_label=pos_label, sparse_output=True)
def test_label_binarize_with_multilabel_indicator():
    """Check that passing a binary indicator matrix is not noop"""
    neg_label, pos_label = -1, 2
    classes = np.arange(3)
    y = np.array([[0, 1, 0],
                  [1, 1, 1]])
    # Every 0 becomes neg_label and every 1 becomes pos_label
    expected = np.array([[-1, 2, -1],
                         [2, 2, 2]])

    # With label binarize
    assert_array_equal(
        label_binarize(y, classes, multilabel=True,
                       neg_label=neg_label, pos_label=pos_label),
        expected,
    )

    # With the transformer: fit_transform first, then fit followed by transform
    lb = LabelBinarizer(pos_label=pos_label, neg_label=neg_label)
    assert_array_equal(lb.fit_transform(y), expected)
    assert_array_equal(lb.fit(y).transform(y), expected)
def load_kaggle_mnist_train(filen, nrows=None, zero_to_negone=False):
    """
    Reads in the Kaggle MNIST training dataset and returns X (input) and
    Y (label) data.

    Dataset should be a .csv or .csv.gz file
    (NOBSERVATIONS+1 x NDIMENSIONS+1), with first row = header, and
    first column = labels.

    Parameters:
        filen: path to the file; a name ending in '.gz' is read as gzip.
        nrows: number of data rows to read, or None to read them all.
        zero_to_negone: if true, every 0 in X is replaced by -1.

    Returns:
        (x, y): x is the NOBSERVATIONS x 784 input matrix, y is the
        NOBSERVATIONS x 10 one-vs-all label matrix.

    Raises:
        AssertionError: if the loaded data does not have the expected shape
        or the labels do not span exactly 0..9.
    """
    # print() with one pre-formatted argument behaves the same on Python 2
    # and Python 3 (the original print statements are Python 2 only).
    print('Loading %s Kaggle MNIST train patterns from %s' %
          (str(nrows) if nrows else 'all', filen))
    t = Stopwatch()
    # The header row is skipped explicitly, hence header=None + skiprows=1.
    panda = pd.read_csv(filen,
                        delimiter=',',
                        dtype=int,
                        header=None,
                        nrows=nrows,
                        skiprows=1,
                        compression=('gzip' if filen.endswith('.gz') else None))
    data = panda.values  # numpy array
    x = data[:, 1:]
    y_vec = data[:, 0]
    assert x.shape[0] == y_vec.shape[0]
    assert x.shape[1] == 784
    assert len(y_vec.shape) == 1
    assert np.min(y_vec) == 0 and np.max(y_vec) == 9

    # turn labels from vector (with values from 0-9) to
    # NOBSERVATIONSx10 matrix, with a single 1 in each row (i.e. 1-vs-all)
    y = label_binarize(y_vec, classes=range(10))
    assert y.shape == (x.shape[0], 10)
    assert all(np.sum(y, axis=1) == 1)

    if zero_to_negone:
        x[x == 0] = -1
    if nrows is not None:
        assert x.shape[0] == nrows

    print('done: %r in %is' % (x.shape, t.finish(milli=False)))
    return x, y
def load_dataset(self):
    """Load the ARFF file at ``self.file_name`` as a float feature matrix.

    The attribute named ``self.label_attribute`` is split off as the labels;
    when that name is None it is set (on ``self``) to the name of the last
    attribute declared in the file.  Every other attribute becomes one or
    more float64 columns:

    * nominal attributes (declared with a list of values) are one-hot
      encoded with ``label_binarize`` when ``self.binarize`` is true,
      otherwise integer-encoded with a ``LabelEncoder``; missing values are
      represented as "?";
    * any other attribute is cast to float64 and its NaN entries are
      replaced by the mean of the non-NaN entries.

    Attributes with no data at all are dropped, and so are result columns
    with zero variance.

    Returns
    -------
    instances : numpy.ndarray
        The horizontally stacked feature columns.
    labels : numpy.ndarray or None
        The label attribute's values as loaded (no conversion applied), or
        None if no attribute matched ``self.label_attribute``.
    """
    with open(self.file_name) as f:
        dataset = arff.load(f)

    if self.label_attribute is None:
        self.label_attribute = dataset["attributes"][-1][0]

    # Transposing yields one array per attribute instead of one per instance.
    data = list(numpy.asarray(dataset["data"]).transpose())
    labels = None
    row = 0
    for attribute_name, attribute_type in dataset["attributes"]:
        if attribute_name == self.label_attribute:
            # Labels found!
            labels = data.pop(row)
            continue

        if isinstance(attribute_type, list):
            # Nominal attribute
            column = data[row]
            # Convert None in '?' for next check and to make
            # label_binarize work
            for j, value in enumerate(column):
                if value is None:
                    column[j] = "?"
            if numpy.all(column == "?"):
                # If no data is present, just remove the row
                data.pop(row)
                continue
            if self.binarize:
                # ``classes`` is passed by keyword: newer scikit-learn
                # releases reject it as a positional argument.
                data[row] = numpy.asarray(
                    label_binarize(column, classes=attribute_type),
                    dtype=numpy.float64)
            else:
                # The encoder is not fitted; its classes are taken directly
                # from the declared nominal values, with "?" added in front
                # when it is not already one of them.
                encoder = LabelEncoder()
                encoder.classes_ = attribute_type
                if "?" not in encoder.classes_:
                    encoder.classes_.insert(0, "?")
                data[row] = encoder.transform(column).reshape(
                    (len(column), 1)).astype(numpy.float64)
        else:
            # Numeric attributes: check for nan values
            data[row] = data[row].astype(numpy.float64)
            nans = numpy.isnan(data[row])
            if numpy.all(nans):
                # If everything is nan, remove the feature
                data.pop(row)
                continue
            if numpy.any(nans):
                valid = numpy.invert(nans)
                mean = data[row][valid].sum() / valid.sum()
                data[row][nans] = mean
            # Reshape to do hstack later
            data[row] = data[row].reshape((len(data[row]), 1))

        # Go to next row only if we have NOT removed the current one
        row += 1

    instances = numpy.hstack(tuple(data))
    # Columns that are constant over all instances carry no information.
    useless_indices = numpy.where(instances.var(axis=0) == 0)
    instances = numpy.delete(instances, useless_indices, axis=1)
    return instances, labels
def otto_dataset(params):
    """Read ``train.csv.gz`` and return ``(features, labels)``.

    ``params.get('n_rows')`` limits how many rows are read (None reads all).
    Labels are the ``target`` column with its first six characters removed
    (presumably a ``Class_`` prefix -- confirm against the data), converted
    to int16 and shifted down by one.  When ``params['est']`` is ``'keras'``
    the labels are returned as an indicator matrix built by
    ``label_binarize`` instead.
    """
    frame = pd.read_csv('train.csv.gz', index_col='id',
                        nrows=params.get('n_rows'))

    def _drop_prefix(raw):
        # Keep everything after the first six characters
        return raw[6:]

    features = frame.drop(['target'], axis=1)
    labels = frame.target.apply(_drop_prefix).astype(np.int16) - 1

    if params['est'] != 'keras':
        return features, labels

    distinct = sorted(set(labels))
    return features, label_binarize(labels, classes=distinct)
def check_binarized_results(y, classes, pos_label, neg_label, expected):
    """Check label_binarize, its inverse helpers and LabelBinarizer on ``y``.

    For both sparse and dense output this verifies that:

    * ``label_binarize`` returns ``expected`` with the requested sparsity;
    * the matching private inverse helper recovers ``y``;
    * ``LabelBinarizer.fit_transform`` / ``inverse_transform`` round-trip
      ``y`` and the inverse has the same sparsity as ``y``.

    When sparse output is requested with ``pos_label == 0`` or
    ``neg_label != 0``, ``label_binarize`` is instead expected to raise
    ValueError and the remaining checks are skipped for that setting.
    """
    for sparse_output in [True, False]:
        if (pos_label == 0 or neg_label != 0) and sparse_output:
            with pytest.raises(ValueError):
                # ``classes`` by keyword: newer scikit-learn releases reject
                # it positionally with TypeError, not the ValueError under
                # test.
                label_binarize(y, classes=classes, neg_label=neg_label,
                               pos_label=pos_label,
                               sparse_output=sparse_output)
            continue

        # check label_binarize
        binarized = label_binarize(y, classes=classes, neg_label=neg_label,
                                   pos_label=pos_label,
                                   sparse_output=sparse_output)
        assert_array_equal(toarray(binarized), expected)
        assert issparse(binarized) == sparse_output

        # check inverse
        y_type = type_of_target(y)
        if y_type == "multiclass":
            inversed = _inverse_binarize_multiclass(binarized,
                                                    classes=classes)
        else:
            # Threshold halfway between the two label values
            inversed = _inverse_binarize_thresholding(
                binarized,
                output_type=y_type,
                classes=classes,
                threshold=((neg_label + pos_label) / 2.))
        assert_array_equal(toarray(inversed), toarray(y))

        # Check label binarizer
        lb = LabelBinarizer(neg_label=neg_label, pos_label=pos_label,
                            sparse_output=sparse_output)
        binarized = lb.fit_transform(y)
        assert_array_equal(toarray(binarized), expected)
        assert issparse(binarized) == sparse_output
        inverse_output = lb.inverse_transform(binarized)
        assert_array_equal(toarray(inverse_output), toarray(y))
        assert issparse(inverse_output) == issparse(y)
def load_dataset(self):
    """Load the ARFF file at ``self.file_name`` as a float feature matrix.

    The attribute named ``self.label_attribute`` is split off as the labels;
    when that name is None it is set (on ``self``) to the name of the last
    attribute declared in the file.  Every other attribute becomes one or
    more float64 columns:

    * nominal attributes (declared with a list of values) are one-hot
      encoded with ``label_binarize`` when ``self.binarize`` is true,
      otherwise integer-encoded with a ``LabelEncoder``; missing values are
      represented as "?";
    * any other attribute is cast to float64 and its NaN entries are
      replaced by the mean of the non-NaN entries.

    Attributes with no data at all are dropped, and so are result columns
    with zero variance.

    Returns
    -------
    instances : numpy.ndarray
        The horizontally stacked feature columns.
    labels : numpy.ndarray or None
        The label attribute's values as loaded (no conversion applied), or
        None if no attribute matched ``self.label_attribute``.
    """
    with open(self.file_name) as f:
        dataset = arff.load(f)

    if self.label_attribute is None:
        self.label_attribute = dataset["attributes"][-1][0]

    # Transposing yields one array per attribute instead of one per instance.
    data = list(numpy.asarray(dataset["data"]).transpose())
    labels = None
    row = 0
    for attribute_name, attribute_type in dataset["attributes"]:
        if attribute_name == self.label_attribute:
            # Labels found!
            labels = data.pop(row)
            continue

        if isinstance(attribute_type, list):
            # Nominal attribute
            column = data[row]
            # Convert None in '?' for next check and to make
            # label_binarize work
            for j, value in enumerate(column):
                if value is None:
                    column[j] = "?"
            if numpy.all(column == "?"):
                # If no data is present, just remove the row
                data.pop(row)
                continue
            if self.binarize:
                # ``classes`` is passed by keyword: newer scikit-learn
                # releases reject it as a positional argument.
                data[row] = numpy.asarray(
                    label_binarize(column, classes=attribute_type),
                    dtype=numpy.float64)
            else:
                # The encoder is not fitted; its classes are taken directly
                # from the declared nominal values, with "?" added in front
                # when it is not already one of them.
                encoder = LabelEncoder()
                encoder.classes_ = attribute_type
                if "?" not in encoder.classes_:
                    encoder.classes_.insert(0, "?")
                data[row] = encoder.transform(column).reshape(
                    (len(column), 1)).astype(numpy.float64)
        else:
            # Numeric attributes: check for nan values
            data[row] = data[row].astype(numpy.float64)
            nans = numpy.isnan(data[row])
            if numpy.all(nans):
                # If everything is nan, remove the feature
                data.pop(row)
                continue
            if numpy.any(nans):
                valid = numpy.invert(nans)
                mean = data[row][valid].sum() / valid.sum()
                data[row][nans] = mean
            # Reshape to do hstack later
            data[row] = data[row].reshape((len(data[row]), 1))

        # Go to next row only if we have NOT removed the current one
        row += 1

    instances = numpy.hstack(tuple(data))
    # Columns that are constant over all instances carry no information.
    useless_indices = numpy.where(instances.var(axis=0) == 0)
    instances = numpy.delete(instances, useless_indices, axis=1)
    return instances, labels
def test_label_binarize_multilabel():
    """Check binarization of a multilabel indicator matrix with pos_label=2.

    The shared ``check_binarized_results`` checks are run on the dense
    indicator matrix and on the same matrix in each scipy sparse format
    listed below.  Finally, sparse output with ``neg_label=-1`` must raise
    ValueError.
    """
    y_ind = np.array([[0, 1, 0], [1, 1, 1], [0, 0, 0]])
    classes = [0, 1, 2]
    pos_label = 2
    neg_label = 0
    # Every 1 of the indicator matrix becomes pos_label
    expected = pos_label * y_ind
    y_sparse = [
        sparse_matrix(y_ind)
        for sparse_matrix in [coo_matrix, csc_matrix, csr_matrix,
                              dok_matrix, lil_matrix]
    ]

    for y in [y_ind] + y_sparse:
        check_binarized_results(y, classes, pos_label, neg_label, expected)

    # Uses the last ``y`` of the loop above.  ``classes`` is passed by
    # keyword so the call is valid on scikit-learn releases where it is
    # keyword-only.
    with pytest.raises(ValueError):
        label_binarize(y, classes=classes, neg_label=-1,
                       pos_label=pos_label, sparse_output=True)
def check_binarized_results(y, classes, pos_label, neg_label, expected):
    """Check label_binarize, its inverse helpers and LabelBinarizer on ``y``.

    For both sparse and dense output this verifies that:

    * ``label_binarize`` returns ``expected`` with the requested sparsity;
    * the matching private inverse helper recovers ``y``;
    * ``LabelBinarizer.fit_transform`` / ``inverse_transform`` round-trip
      ``y`` and the inverse has the same sparsity as ``y``.

    When sparse output is requested with ``pos_label == 0`` or
    ``neg_label != 0``, ``label_binarize`` is instead expected to raise
    ValueError and the remaining checks are skipped for that setting.
    """
    for sparse_output in [True, False]:
        if (pos_label == 0 or neg_label != 0) and sparse_output:
            # ``classes`` by keyword: newer scikit-learn releases reject it
            # positionally with TypeError, not the ValueError under test.
            assert_raises(ValueError, label_binarize, y, classes=classes,
                          neg_label=neg_label, pos_label=pos_label,
                          sparse_output=sparse_output)
            continue

        # check label_binarize
        binarized = label_binarize(y, classes=classes, neg_label=neg_label,
                                   pos_label=pos_label,
                                   sparse_output=sparse_output)
        assert_array_equal(toarray(binarized), expected)
        assert_equal(issparse(binarized), sparse_output)

        # check inverse
        y_type = type_of_target(y)
        if y_type == "multiclass":
            inversed = _inverse_binarize_multiclass(binarized,
                                                    classes=classes)
        else:
            # Threshold halfway between the two label values
            inversed = _inverse_binarize_thresholding(
                binarized,
                output_type=y_type,
                classes=classes,
                threshold=((neg_label + pos_label) / 2.))
        assert_array_equal(toarray(inversed), toarray(y))

        # Check label binarizer
        lb = LabelBinarizer(neg_label=neg_label, pos_label=pos_label,
                            sparse_output=sparse_output)
        binarized = lb.fit_transform(y)
        assert_array_equal(toarray(binarized), expected)
        assert_equal(issparse(binarized), sparse_output)
        inverse_output = lb.inverse_transform(binarized)
        assert_array_equal(toarray(inverse_output), toarray(y))
        assert_equal(issparse(inverse_output), issparse(y))
def roc_auc_avg_score(y_true, y_score):
    """Return ``roc_auc_score`` computed on binarized ``y_true``.

    ``y_true`` is expanded with ``label_binarize`` using its own distinct
    values, in sorted order, as the classes; the result is scored against
    ``y_score`` with ``roc_auc_score`` called with its default options.
    """
    distinct_labels = sorted(set(y_true))
    indicator = label_binarize(y_true, classes=distinct_labels)
    return roc_auc_score(indicator, y_score)
def test_invalid_input_label_binarize():
    """label_binarize must raise ValueError for pos_label=0 with neg_label=1."""
    bad_options = dict(classes=[0, 2], pos_label=0, neg_label=1)
    with pytest.raises(ValueError):
        label_binarize([0, 2], **bad_options)