def test_multilabel_binarizer_empty_sample():
    """A sample with no labels must become an all-zero indicator row."""
    samples = [[1, 2], [1], []]
    expected = np.array([[1, 1],
                         [1, 0],
                         [0, 0]])
    binarizer = MultiLabelBinarizer()
    assert_array_equal(binarizer.fit_transform(samples), expected)
def test_multilabel_binarizer_non_integer_labels():
    """Check string and tuple labels through both fitting paths.

    Every case must yield the same indicator matrix, expose the expected
    ``classes_`` and round-trip through ``inverse_transform``.
    """
    expected = np.array([[0, 1, 1],
                         [1, 0, 0],
                         [1, 1, 0]])

    # Object array holding one tuple per slot.
    expected_tuple_classes = np.empty(3, dtype=object)
    expected_tuple_classes[:] = [(1,), (2,), (3,)]

    cases = [
        ([('2', '3'), ('1',), ('1', '2')], ['1', '2', '3']),
        ([('b', 'c'), ('a',), ('a', 'b')], ['a', 'b', 'c']),
        ([((2,), (3,)), ((1,),), ((1,), (2,))], expected_tuple_classes),
    ]

    for labels, expected_classes in cases:
        # One-step path: fit_transform()
        one_step = MultiLabelBinarizer()
        assert_array_equal(one_step.fit_transform(labels), expected)
        assert_array_equal(one_step.classes_, expected_classes)
        assert_array_equal(one_step.inverse_transform(expected), labels)

        # Two-step path: fit().transform()
        two_step = MultiLabelBinarizer()
        assert_array_equal(two_step.fit(labels).transform(labels), expected)
        assert_array_equal(two_step.classes_, expected_classes)
        assert_array_equal(two_step.inverse_transform(expected), labels)

    # Dict labels must be rejected with a TypeError.
    rejecting = MultiLabelBinarizer()
    assert_raises(TypeError, rejecting.fit_transform,
                  [({}), ({}, {'a': 'b'})])
class MultihotEncoder(BaseEstimator, TransformerMixin):
    """
    Wraps `MultiLabelBinarizer` in a pipeline safe transformer

    `fit` and `transform` accept an optional `y` argument that is ignored.

    Args:
        sparse_output (bool): convert output to sparse matrix
    """

    def __init__(self, sparse_output=False):
        self.transformer = MultiLabelBinarizer()
        self.sparse_output = sparse_output

    def fit(self, X, y=None):
        """
        Fit the wrapped MultiLabelBinarizer on `X` (`y` is ignored).

        Returns:
            self
        """
        self.transformer.fit(X)
        return self

    def transform(self, X, y=None):
        """
        Transform `X` with the wrapped MultiLabelBinarizer (`y` is ignored).

        Returns:
            The indicator matrix, wrapped in `sparse.csr_matrix` when
            `sparse_output` is truthy.
        """
        # Silence only the unseen-label notice, which is a UserWarning;
        # a blanket "ignore" would also hide unrelated warnings such as
        # deprecations.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=UserWarning)
            X_t = self.transformer.transform(X)
        if self.sparse_output:
            return sparse.csr_matrix(X_t)
        return X_t
def test_multilabel_binarizer_unknown_class():
    """Transforming a label outside the known classes raises KeyError."""
    known = [[1, 2]]

    # Classes learned from the data.
    learned = MultiLabelBinarizer()
    fitted_transform = learned.fit(known).transform
    assert_raises(KeyError, fitted_transform, [[0]])

    # Classes supplied explicitly.
    preset = MultiLabelBinarizer(classes=[1, 2])
    assert_raises(KeyError, preset.fit_transform, [[0]])
def test_multilabel_binarizer_empty_sample():
    """A sample with no labels must become an all-zero indicator row."""
    samples = [[1, 2], [1], []]
    expected = np.array([[1, 1],
                         [1, 0],
                         [0, 0]])
    binarizer = MultiLabelBinarizer()
    assert_array_equal(binarizer.fit_transform(samples), expected)
def test_multilabel_binarizer_unknown_class():
    """Transforming a label outside the known classes raises KeyError."""
    known = [[1, 2]]

    # Classes learned from the data.
    learned = MultiLabelBinarizer()
    fitted_transform = learned.fit(known).transform
    assert_raises(KeyError, fitted_transform, [[0]])

    # Classes supplied explicitly.
    preset = MultiLabelBinarizer(classes=[1, 2])
    assert_raises(KeyError, preset.fit_transform, [[0]])
def test_multilabel_binarizer_given_classes():
    """Exercise MultiLabelBinarizer with a caller-supplied ``classes`` list.

    The column order of the output must follow the given class order, not
    the sorted order of the labels.
    """
    samples = [(2, 3), (1,), (1, 2)]
    expected = np.array([[0, 1, 1],
                         [1, 0, 0],
                         [1, 0, 1]])

    # fit_transform()
    one_step = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(one_step.fit_transform(samples), expected)
    assert_array_equal(one_step.classes_, [1, 3, 2])

    # fit().transform()
    two_step = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(two_step.fit(samples).transform(samples), expected)
    assert_array_equal(two_step.classes_, [1, 3, 2])

    # An extra leading class (4) adds an all-zero first column.
    with_extra = MultiLabelBinarizer(classes=[4, 1, 3, 2])
    zero_column = [[0], [0], [0]]
    assert_array_equal(with_extra.fit_transform(samples),
                       np.hstack((zero_column, expected)))
    assert_array_equal(with_extra.classes_, [4, 1, 3, 2])

    # ensure fit is no-op as iterable is not consumed
    one_shot = iter(samples)
    from_iterator = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(from_iterator.fit(one_shot).transform(one_shot),
                       expected)

    # ensure a ValueError is thrown if given duplicate classes
    duplicate_message = (
        "The classes argument contains duplicate classes. Remove "
        "these duplicates before passing them to MultiLabelBinarizer."
    )
    with_duplicates = MultiLabelBinarizer(classes=[1, 3, 2, 3])
    assert_raise_message(ValueError, duplicate_message,
                         with_duplicates.fit, one_shot)
def test_multilabel_binarizer_multiple_calls():
    """Changing ``classes`` between fit_transform calls must take effect."""
    samples = [(2, 3), (1,), (1, 2)]
    binarizer = MultiLabelBinarizer(classes=[1, 3, 2])

    # First call: columns follow the order 1, 3, 2.
    expected_first = np.array([[0, 1, 1],
                               [1, 0, 0],
                               [1, 0, 1]])
    assert_array_equal(binarizer.fit_transform(samples), expected_first)

    # Second call on the same instance after switching to the order 1, 2, 3.
    binarizer.classes = [1, 2, 3]
    expected_second = np.array([[0, 1, 1],
                                [1, 0, 0],
                                [1, 1, 0]])
    assert_array_equal(binarizer.fit_transform(samples), expected_second)
def test_sparse_output_multilabel_binarizer():
    """Check dense and sparse output for several iterable-of-iterables inputs."""
    expected = np.array([[0, 1, 1],
                         [1, 0, 0],
                         [1, 1, 0]])
    # Zero-argument factories: every call produces a fresh input, which is
    # needed for the iterator-based case.
    factories = [
        lambda: [(2, 3), (1,), (1, 2)],
        lambda: ({2, 3}, {1}, {1, 2}),
        lambda: iter([iter((2, 3)), iter((1,)), {1, 2}]),
    ]
    round_trip = factories[0]()

    def check(binarizer, output, want_sparse):
        # Shared assertions for one fitted binarizer and its output.
        assert_equal(issparse(output), want_sparse)
        dense = output.toarray() if want_sparse else output
        assert_array_equal(expected, dense)
        assert_array_equal([1, 2, 3], binarizer.classes_)
        assert_equal(binarizer.inverse_transform(dense), round_trip)

    for sparse_output in (True, False):
        for make_input in factories:
            # With fit_transform
            binarizer = MultiLabelBinarizer(sparse_output=sparse_output)
            check(binarizer, binarizer.fit_transform(make_input()),
                  sparse_output)

            # With fit
            binarizer = MultiLabelBinarizer(sparse_output=sparse_output)
            fitted = binarizer.fit(make_input())
            check(binarizer, fitted.transform(make_input()), sparse_output)

    # A sparse matrix containing a value other than 0/1 must be rejected.
    non_binary = csr_matrix(np.array([[0, 1, 1],
                                      [2, 0, 0],
                                      [1, 1, 0]]))
    assert_raises(ValueError, binarizer.inverse_transform, non_binary)
def test_multilabel_binarizer_unknown_class():
    """Unknown labels are left out of the output and reported by a UserWarning."""
    train = [[1, 2]]
    message = 'unknown class(es) [0, 4] will be ignored'

    # Classes inferred from the training labels.
    inferred = MultiLabelBinarizer()
    result = assert_warns_message(UserWarning, message,
                                  inferred.fit(train).transform,
                                  [[4, 1], [2, 0]])
    assert_array_equal(result, np.array([[1, 0],
                                         [0, 1]]))

    # Classes fixed up front, including one (3) absent from the data.
    preset = MultiLabelBinarizer(classes=[1, 2, 3])
    result = assert_warns_message(UserWarning, message,
                                  preset.fit(train).transform,
                                  [[4, 1], [2, 0]])
    assert_array_equal(result, np.array([[1, 0, 0],
                                         [0, 1, 0]]))
def test_multilabel_binarizer():
    """Check lists, sets and iterators as iterable-of-iterables input."""
    expected = np.array([[0, 1, 1],
                         [1, 0, 0],
                         [1, 1, 0]])
    # Zero-argument factories: every call produces a fresh input, which is
    # needed for the iterator-based case.
    factories = [
        lambda: [(2, 3), (1,), (1, 2)],
        lambda: ({2, 3}, {1}, {1, 2}),
        lambda: iter([iter((2, 3)), iter((1,)), {1, 2}]),
    ]
    round_trip = factories[0]()

    def check(binarizer, got):
        # Shared assertions for one fitted binarizer and its output.
        assert_array_equal(expected, got)
        assert_array_equal([1, 2, 3], binarizer.classes_)
        assert_equal(binarizer.inverse_transform(got), round_trip)

    for make_input in factories:
        # With fit_transform
        one_step = MultiLabelBinarizer()
        check(one_step, one_step.fit_transform(make_input()))

        # With fit
        two_step = MultiLabelBinarizer()
        fitted = two_step.fit(make_input())
        check(two_step, fitted.transform(make_input()))
def fit(self, X, y=None):
    """Build a fresh ``SKLModel`` from the stored hyperparameters and fit it.

    ``y`` is forwarded to the model only when it is not ``None``.
    Returns ``self``.
    """
    model = SKLModel(**self._hyperparams)
    self._sklearn_model = model
    fit_args = (X,) if y is None else (X, y)
    model.fit(*fit_args)
    return self
def fit_binarizers(all_values):
    """Fit one binarizer per categorical feature position.

    Args:
        all_values: sequence of contexts, each indexable by feature position.
            The first context determines the number of features, and the
            first value of each feature determines how it is handled.

    Returns:
        dict mapping feature index to a fitted ``LabelBinarizer`` (string
        features) or ``MultiLabelBinarizer`` (list features). Features of any
        other type get no entry. Empty input gives an empty dict.
    """
    binarizers = {}
    if not all_values:
        # Nothing to inspect; avoids an IndexError on all_values[0].
        return binarizers

    # `unicode` only exists on Python 2; fall back to `str` alone elsewhere
    # instead of raising NameError on the first non-str feature.
    try:
        string_types = (str, unicode)  # noqa: F821
    except NameError:
        string_types = (str,)

    for f in range(len(all_values[0])):
        cur_features = [context[f] for context in all_values]
        first = cur_features[0]
        # only categorical values need to be binarized, ints/floats are left
        # as they are
        if isinstance(first, string_types):
            lb = LabelBinarizer()
            lb.fit(cur_features)
            binarizers[f] = lb
        elif isinstance(first, list):
            mlb = MultiLabelBinarizer()
            # default feature for unknown values
            cur_features.append(("__unk__",))
            mlb.fit([tuple(x) for x in cur_features])
            binarizers[f] = mlb
    return binarizers
def test_multilabel_binarizer_multiple_calls():
    """Changing ``classes`` between fit_transform calls must take effect."""
    samples = [(2, 3), (1,), (1, 2)]
    binarizer = MultiLabelBinarizer(classes=[1, 3, 2])

    # First call: columns follow the order 1, 3, 2.
    expected_first = np.array([[0, 1, 1],
                               [1, 0, 0],
                               [1, 0, 1]])
    assert_array_equal(binarizer.fit_transform(samples), expected_first)

    # Second call on the same instance after switching to the order 1, 2, 3.
    binarizer.classes = [1, 2, 3]
    expected_second = np.array([[0, 1, 1],
                                [1, 0, 0],
                                [1, 1, 0]])
    assert_array_equal(binarizer.fit_transform(samples), expected_second)
def fit_binarizers(all_values):
    """Fit one binarizer per categorical feature position.

    Args:
        all_values: sequence of contexts, each indexable by feature position.
            The first context determines the number of features, and the
            first value of each feature determines how it is handled.

    Returns:
        dict mapping feature index to a fitted ``LabelBinarizer`` (string
        features) or ``MultiLabelBinarizer`` (list features). Features of any
        other type get no entry. Empty input gives an empty dict.
    """
    binarizers = {}
    if not all_values:
        # Nothing to inspect; avoids an IndexError on all_values[0].
        return binarizers

    # `unicode` only exists on Python 2; fall back to `str` alone elsewhere
    # instead of raising NameError on the first non-str feature.
    try:
        string_types = (str, unicode)  # noqa: F821
    except NameError:
        string_types = (str,)

    for f in range(len(all_values[0])):
        cur_features = [context[f] for context in all_values]
        first = cur_features[0]
        # only categorical values need to be binarized, ints/floats are left
        # as they are
        if isinstance(first, string_types):
            lb = LabelBinarizer()
            lb.fit(cur_features)
            binarizers[f] = lb
        elif isinstance(first, list):
            mlb = MultiLabelBinarizer()
            # default feature for unknown values
            cur_features.append(("__unk__",))
            mlb.fit([tuple(x) for x in cur_features])
            binarizers[f] = mlb
    return binarizers
def classif_missing_kws():
    """Train a keyword classifier on publication texts and print predictions.

    Steps, as performed below:
      1. Load keywords via ``get_keywords(min=20)`` and sample up to
         ``npubs`` (publication id, keyword labels) pairs at random.
      2. Load title + abstract text for the sampled publications.
      3. Vectorize texts with unigram TF-IDF (English stop words) and
         binarize the keyword labels.
      4. Fit a one-vs-rest logistic regression (``n_jobs=2``).
      5. Print the keywords predicted for a few hard-coded sample titles.

    Output is written with single-argument ``print(...)`` calls so the text
    is identical on Python 2 and Python 3.
    """
    npubs = 10000

    kws, unique_kws = get_keywords(min=20)
    print("Total pubs: %s" % len(kws))
    print("Unique keywords: %s" % len(unique_kws))

    # random.sample needs a real sequence (a dict view is not one on
    # Python 3) and raises ValueError when asked for more items than exist,
    # so materialize the items and clamp the sample size.
    candidates = list(kws.items())
    sample_size = min(npubs, len(candidates))
    pub_ids, labels = zip(*random.sample(candidates, sample_size))

    pubs = PubTexts()
    texts = pubs.texts(pub_ids, use_title=True, use_abs=True, use_body=False)
    print("Texts loaded")

    tfidf = TfidfVectorizer(ngram_range=(1, 1), stop_words="english")
    binarizer = MultiLabelBinarizer()

    x = tfidf.fit_transform(texts)
    y = binarizer.fit_transform(labels)
    print("TfIdf and labels calculated.")

    # Drop the raw inputs before training; only x and y are needed from here.
    del texts, labels, pub_ids

    clf = OneVsRestClassifier(LogisticRegression(), n_jobs=2)
    clf.fit(x, y)

    test = [
        "The Case for Wireless Overlay Networks",
        "The Cost of Adaptivity and Virtual Lanes in a Wormhole Router",
        "Robust Monte Carlo Localization for Mobile Robots",
        "Generating Finite-State Transducers For Semi-Structured Data Extraction From The Web"
    ]

    test_x = tfidf.transform(test)
    test_y = clf.predict(test_x)

    print(binarizer.inverse_transform(test_y))
class MultiLabelBinarizerImpl:
    """Wrapper that builds an ``SKLModel`` from stored hyperparameters on fit."""

    def __init__(self, classes=None, sparse_output=False):
        """Record the hyperparameters; no model is created here."""
        self._hyperparams = dict(classes=classes, sparse_output=sparse_output)

    def fit(self, X, y=None):
        """Create a fresh ``SKLModel`` and fit it.

        ``y`` is forwarded to the model only when it is not ``None``.
        Returns ``self``.
        """
        model = SKLModel(**self._hyperparams)
        self._sklearn_model = model
        fit_args = (X,) if y is None else (X, y)
        model.fit(*fit_args)
        return self

    def transform(self, X):
        """Delegate to the fitted model's ``transform``."""
        fitted = self._sklearn_model
        return fitted.transform(X)
def test_multilabel_binarizer_non_integer_labels():
    """Check string and tuple labels through both fitting paths.

    Every case must yield the same indicator matrix, expose the expected
    ``classes_`` and round-trip through ``inverse_transform``.
    """
    expected = np.array([[0, 1, 1],
                         [1, 0, 0],
                         [1, 1, 0]])

    # Object array holding one tuple per slot.
    expected_tuple_classes = np.empty(3, dtype=object)
    expected_tuple_classes[:] = [(1,), (2,), (3,)]

    cases = [
        ([('2', '3'), ('1',), ('1', '2')], ['1', '2', '3']),
        ([('b', 'c'), ('a',), ('a', 'b')], ['a', 'b', 'c']),
        ([((2,), (3,)), ((1,),), ((1,), (2,))], expected_tuple_classes),
    ]

    for labels, expected_classes in cases:
        # One-step path: fit_transform()
        one_step = MultiLabelBinarizer()
        assert_array_equal(one_step.fit_transform(labels), expected)
        assert_array_equal(one_step.classes_, expected_classes)
        assert_array_equal(one_step.inverse_transform(expected), labels)

        # Two-step path: fit().transform()
        two_step = MultiLabelBinarizer()
        assert_array_equal(two_step.fit(labels).transform(labels), expected)
        assert_array_equal(two_step.classes_, expected_classes)
        assert_array_equal(two_step.inverse_transform(expected), labels)

    # Dict labels must be rejected with a TypeError.
    rejecting = MultiLabelBinarizer()
    assert_raises(TypeError, rejecting.fit_transform,
                  [({}), ({}, {'a': 'b'})])
def test_multilabel_binarizer_same_length_sequence():
    """Equal-length label sequences must not be read as a 2-d array."""
    samples = [[1], [0], [2]]
    expected = np.array([[0, 1, 0],
                         [1, 0, 0],
                         [0, 0, 1]])

    # fit_transform()
    one_step = MultiLabelBinarizer()
    assert_array_equal(one_step.fit_transform(samples), expected)
    assert_array_equal(one_step.inverse_transform(expected), samples)

    # fit().transform()
    two_step = MultiLabelBinarizer()
    fitted = two_step.fit(samples)
    assert_array_equal(fitted.transform(samples), expected)
    assert_array_equal(two_step.inverse_transform(expected), samples)
def test_multilabel_binarizer_inverse_validation():
    """``inverse_transform`` must validate both values and shape."""
    binarizer = MultiLabelBinarizer()
    binarizer.fit_transform([(1, 1, 1, 0)])
    invert = binarizer.inverse_transform

    # Not binary
    assert_raises(ValueError, invert, np.array([[1, 3]]))

    # The following binary cases are fine, however
    for row in ([0, 0], [1, 1], [1, 0]):
        invert(np.array([row]))

    # Wrong shape: too few, then too many columns
    for row in ([1], [1, 1, 1]):
        assert_raises(ValueError, invert, np.array([row]))
def test_sparse_output_multilabel_binarizer():
    """Check dense and sparse output for several iterable-of-iterables inputs."""
    expected = np.array([[0, 1, 1],
                         [1, 0, 0],
                         [1, 1, 0]])
    # Zero-argument factories: every call produces a fresh input, which is
    # needed for the iterator-based case.
    factories = [
        lambda: [(2, 3), (1,), (1, 2)],
        lambda: ({2, 3}, {1}, {1, 2}),
        lambda: iter([iter((2, 3)), iter((1,)), {1, 2}]),
    ]
    round_trip = factories[0]()

    def check(binarizer, output, want_sparse):
        # Shared assertions for one fitted binarizer and its output.
        assert_equal(issparse(output), want_sparse)
        dense = output.toarray() if want_sparse else output
        assert_array_equal(expected, dense)
        assert_array_equal([1, 2, 3], binarizer.classes_)
        assert_equal(binarizer.inverse_transform(dense), round_trip)

    for sparse_output in (True, False):
        for make_input in factories:
            # With fit_transform
            binarizer = MultiLabelBinarizer(sparse_output=sparse_output)
            check(binarizer, binarizer.fit_transform(make_input()),
                  sparse_output)

            # With fit
            binarizer = MultiLabelBinarizer(sparse_output=sparse_output)
            fitted = binarizer.fit(make_input())
            check(binarizer, fitted.transform(make_input()), sparse_output)

    # A sparse matrix containing a value other than 0/1 must be rejected.
    non_binary = csr_matrix(np.array([[0, 1, 1],
                                      [2, 0, 0],
                                      [1, 1, 0]]))
    assert_raises(ValueError, binarizer.inverse_transform, non_binary)
def test_multilabel_binarizer():
    """Check lists, sets and iterators as iterable-of-iterables input."""
    expected = np.array([[0, 1, 1],
                         [1, 0, 0],
                         [1, 1, 0]])
    # Zero-argument factories: every call produces a fresh input, which is
    # needed for the iterator-based case.
    factories = [
        lambda: [(2, 3), (1,), (1, 2)],
        lambda: ({2, 3}, {1}, {1, 2}),
        lambda: iter([iter((2, 3)), iter((1,)), {1, 2}]),
    ]
    round_trip = factories[0]()

    def check(binarizer, got):
        # Shared assertions for one fitted binarizer and its output.
        assert_array_equal(expected, got)
        assert_array_equal([1, 2, 3], binarizer.classes_)
        assert_equal(binarizer.inverse_transform(got), round_trip)

    for make_input in factories:
        # With fit_transform
        one_step = MultiLabelBinarizer()
        check(one_step, one_step.fit_transform(make_input()))

        # With fit
        two_step = MultiLabelBinarizer()
        fitted = two_step.fit(make_input())
        check(two_step, fitted.transform(make_input()))
def test_sparse_output_multilabel_binarizer():
    """Check dense and sparse output for several iterable-of-iterables inputs."""
    expected = np.array([[0, 1, 1],
                         [1, 0, 0],
                         [1, 1, 0]])
    # Zero-argument factories: every call produces a fresh input, which is
    # needed for the iterator-based case.
    factories = [
        lambda: [(2, 3), (1,), (1, 2)],
        lambda: ({2, 3}, {1}, {1, 2}),
        lambda: iter([iter((2, 3)), iter((1,)), {1, 2}]),
    ]
    round_trip = factories[0]()

    def check(binarizer, output, want_sparse):
        # Shared assertions for one fitted binarizer and its output.
        assert issparse(output) == want_sparse
        if want_sparse:
            # verify CSR assumption that indices and indptr have same dtype
            assert output.indices.dtype == output.indptr.dtype
            output = output.toarray()
        assert_array_equal(expected, output)
        assert_array_equal([1, 2, 3], binarizer.classes_)
        assert binarizer.inverse_transform(output) == round_trip

    for sparse_output in (True, False):
        for make_input in factories:
            # With fit_transform
            binarizer = MultiLabelBinarizer(sparse_output=sparse_output)
            check(binarizer, binarizer.fit_transform(make_input()),
                  sparse_output)

            # With fit
            binarizer = MultiLabelBinarizer(sparse_output=sparse_output)
            fitted = binarizer.fit(make_input())
            check(binarizer, fitted.transform(make_input()), sparse_output)

    # A sparse matrix containing a value other than 0/1 must be rejected.
    non_binary = csr_matrix(np.array([[0, 1, 1],
                                      [2, 0, 0],
                                      [1, 1, 0]]))
    with pytest.raises(ValueError):
        binarizer.inverse_transform(non_binary)
def test_multilabel_binarizer_inverse_validation():
    """``inverse_transform`` must validate both values and shape."""
    binarizer = MultiLabelBinarizer()
    binarizer.fit_transform([(1, 1, 1, 0)])
    invert = binarizer.inverse_transform

    # Not binary
    assert_raises(ValueError, invert, np.array([[1, 3]]))

    # The following binary cases are fine, however
    for row in ([0, 0], [1, 1], [1, 0]):
        invert(np.array([row]))

    # Wrong shape: too few, then too many columns
    for row in ([1], [1, 1, 1]):
        assert_raises(ValueError, invert, np.array([row]))
def test_multilabel_binarizer_same_length_sequence():
    """Equal-length label sequences must not be read as a 2-d array."""
    samples = [[1], [0], [2]]
    expected = np.array([[0, 1, 0],
                         [1, 0, 0],
                         [0, 0, 1]])

    # fit_transform()
    one_step = MultiLabelBinarizer()
    assert_array_equal(one_step.fit_transform(samples), expected)
    assert_array_equal(one_step.inverse_transform(expected), samples)

    # fit().transform()
    two_step = MultiLabelBinarizer()
    fitted = two_step.fit(samples)
    assert_array_equal(fitted.transform(samples), expected)
    assert_array_equal(two_step.inverse_transform(expected), samples)
def test_multilabel_binarizer_unknown_class():
    """Unknown labels are left out of the output and reported by a UserWarning."""
    train = [[1, 2]]
    message = 'unknown class(es) [0, 4] will be ignored'

    # Classes inferred from the training labels.
    inferred = MultiLabelBinarizer()
    result = assert_warns_message(UserWarning, message,
                                  inferred.fit(train).transform,
                                  [[4, 1], [2, 0]])
    assert_array_equal(result, np.array([[1, 0],
                                         [0, 1]]))

    # Classes fixed up front, including one (3) absent from the data.
    preset = MultiLabelBinarizer(classes=[1, 2, 3])
    result = assert_warns_message(UserWarning, message,
                                  preset.fit(train).transform,
                                  [[4, 1], [2, 0]])
    assert_array_equal(result, np.array([[1, 0, 0],
                                         [0, 1, 0]]))
def test_sparse_output_multilabel_binarizer():
    """Check dense and sparse output for several iterable-of-iterables inputs."""
    expected = np.array([[0, 1, 1],
                         [1, 0, 0],
                         [1, 1, 0]])
    # Zero-argument factories: every call produces a fresh input, which is
    # needed for the iterator-based case.
    factories = [
        lambda: [(2, 3), (1,), (1, 2)],
        lambda: ({2, 3}, {1}, {1, 2}),
        lambda: iter([iter((2, 3)), iter((1,)), {1, 2}]),
    ]
    round_trip = factories[0]()

    def check(binarizer, output, want_sparse):
        # Shared assertions for one fitted binarizer and its output.
        assert_equal(issparse(output), want_sparse)
        if want_sparse:
            # verify CSR assumption that indices and indptr have same dtype
            assert_equal(output.indices.dtype, output.indptr.dtype)
            output = output.toarray()
        assert_array_equal(expected, output)
        assert_array_equal([1, 2, 3], binarizer.classes_)
        assert_equal(binarizer.inverse_transform(output), round_trip)

    for sparse_output in (True, False):
        for make_input in factories:
            # With fit_transform
            binarizer = MultiLabelBinarizer(sparse_output=sparse_output)
            check(binarizer, binarizer.fit_transform(make_input()),
                  sparse_output)

            # With fit
            binarizer = MultiLabelBinarizer(sparse_output=sparse_output)
            fitted = binarizer.fit(make_input())
            check(binarizer, fitted.transform(make_input()), sparse_output)

    # A sparse matrix containing a value other than 0/1 must be rejected.
    non_binary = csr_matrix(np.array([[0, 1, 1],
                                      [2, 0, 0],
                                      [1, 1, 0]]))
    assert_raises(ValueError, binarizer.inverse_transform, non_binary)
def test_multilabel_binarizer_non_integer_labels():
    """Check string and tuple labels through both fitting paths.

    Every case must yield the same indicator matrix, expose the expected
    ``classes_`` and round-trip through ``inverse_transform``.
    """
    expected = np.array([[0, 1, 1],
                         [1, 0, 0],
                         [1, 1, 0]])

    # Object array holding one tuple per slot.
    expected_tuple_classes = np.empty(3, dtype=object)
    expected_tuple_classes[:] = [(1,), (2,), (3,)]

    cases = [
        ([("2", "3"), ("1",), ("1", "2")], ["1", "2", "3"]),
        ([("b", "c"), ("a",), ("a", "b")], ["a", "b", "c"]),
        ([((2,), (3,)), ((1,),), ((1,), (2,))], expected_tuple_classes),
    ]

    for labels, expected_classes in cases:
        # One-step path: fit_transform()
        one_step = MultiLabelBinarizer()
        assert_array_equal(one_step.fit_transform(labels), expected)
        assert_array_equal(one_step.classes_, expected_classes)
        assert_array_equal(one_step.inverse_transform(expected), labels)

        # Two-step path: fit().transform()
        two_step = MultiLabelBinarizer()
        assert_array_equal(two_step.fit(labels).transform(labels), expected)
        assert_array_equal(two_step.classes_, expected_classes)
        assert_array_equal(two_step.inverse_transform(expected), labels)

    # Dict labels must be rejected with a TypeError.
    rejecting = MultiLabelBinarizer()
    assert_raises(TypeError, rejecting.fit_transform,
                  [({}), ({}, {"a": "b"})])
def test_multilabel_binarizer_non_unique():
    """Repeated labels within one sample collapse to a single indicator."""
    samples = [(1, 1, 1, 0)]
    expected = np.array([[1, 1]])
    binarizer = MultiLabelBinarizer()
    result = binarizer.fit_transform(samples)
    assert_array_equal(result, expected)
def __init__(self, classes=None, sparse_output=False):
    """Store the hyperparameters and build the wrapped ``Op`` from them."""
    hyperparams = dict(classes=classes, sparse_output=sparse_output)
    self._hyperparams = hyperparams
    self._wrapped_model = Op(**hyperparams)
def __init__(self, sparse_output=False):
    """Create the wrapped binarizer and remember the output format flag."""
    wrapped = MultiLabelBinarizer()
    self.transformer = wrapped
    self.sparse_output = sparse_output
nltk.download('punkt')

classifier_model_location = 'model/classifier-model.bin'
doc2vec_model_location = 'model/doc2vec-model.bin'

# Use the doc2vec model created in reuters-classifier-train.py
doc2vec = Doc2Vec.load(doc2vec_model_location)

# Collect the test articles (raw text plus categories) ...
test_articles = [
    {'raw': reuters.raw(fileId), 'categories': reuters.categories(fileId)}
    for fileId in reuters.fileids()
    if fileId.startswith('test/')
]
# ... and infer one document vector per article
test_data = [
    doc2vec.infer_vector(word_tokenize(article['raw']))
    for article in test_articles
]

# Load the trained classifier
model = load_model(classifier_model_location)

# Make predictions
predictions = model.predict(numpy.asarray(test_data))

# The model outputs values between 0 and 1; threshold them at 0.5 so every
# entry becomes exactly 0 or 1
predictions[predictions >= 0.5] = 1
predictions[predictions < 0.5] = 0

# Convert predicted classes back to category names, using a binarizer
# fitted on the categories of every file in the corpus
labelBinarizer = MultiLabelBinarizer()
labelBinarizer.fit(
    [reuters.categories(fileId) for fileId in reuters.fileids()])
predicted_labels = labelBinarizer.inverse_transform(predictions)

# Report the first line of each article with predicted vs. actual categories
for predicted_label, test_article in zip(predicted_labels, test_articles):
    print('title: {}'.format(test_article['raw'].splitlines()[0]))
    print('predicted: {} - actual: {}'.format(
        list(predicted_label), test_article['categories']))
    print('')
from nltk.corpus import reuters
from nltk.tokenize import word_tokenize
# Import from the public package; `sklearn.preprocessing.label` is a private
# module path and is not available in newer scikit-learn releases.
from sklearn.preprocessing import MultiLabelBinarizer

nltk.download('reuters')
nltk.download('punkt')

google_news_word2vec_model_location = 'data/GoogleNews-vectors-negative300.bin.gz'
doc2vec_model_location = 'model/doc2vec-model.bin'
doc2vec_dimensions = 300
classifier_model_location = 'model/classifier-model.bin'

doc2vec = Doc2Vec.load(doc2vec_model_location)

# Fit the label binarizer on the categories of every file in the corpus so
# training and test labels share one set of indicator columns
labelBinarizer = MultiLabelBinarizer()
labelBinarizer.fit([reuters.categories(fileId) for fileId in reuters.fileids()])

# Load the articles with their corresponding categories
train_articles = [{'raw': reuters.raw(fileId), 'categories': reuters.categories(fileId)}
                  for fileId in reuters.fileids() if fileId.startswith('training/')]
test_articles = [{'raw': reuters.raw(fileId), 'categories': reuters.categories(fileId)}
                 for fileId in reuters.fileids() if fileId.startswith('test/')]
shuffle(train_articles)
shuffle(test_articles)

# Convert the articles to document vectors using the doc2vec model
train_data = [doc2vec.infer_vector(word_tokenize(article['raw'])) for article in train_articles]
test_data = [doc2vec.infer_vector(word_tokenize(article['raw'])) for article in test_articles]

# Convert each article's categories to an indicator row
train_labels = labelBinarizer.transform([article['categories'] for article in train_articles])
test_labels = labelBinarizer.transform([article['categories'] for article in test_articles])

train_data, test_data, train_labels, test_labels = numpy.asarray(train_data), numpy.asarray(test_data), numpy.asarray(train_labels), numpy.asarray(test_labels)
def test_multilabel_binarizer_given_classes():
    """Exercise MultiLabelBinarizer with a caller-supplied ``classes`` list.

    The column order of the output must follow the given class order, not
    the sorted order of the labels.
    """
    samples = [(2, 3), (1,), (1, 2)]
    expected = np.array([[0, 1, 1],
                         [1, 0, 0],
                         [1, 0, 1]])

    # fit_transform()
    one_step = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(one_step.fit_transform(samples), expected)
    assert_array_equal(one_step.classes_, [1, 3, 2])

    # fit().transform()
    two_step = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(two_step.fit(samples).transform(samples), expected)
    assert_array_equal(two_step.classes_, [1, 3, 2])

    # An extra leading class (4) adds an all-zero first column.
    with_extra = MultiLabelBinarizer(classes=[4, 1, 3, 2])
    zero_column = [[0], [0], [0]]
    assert_array_equal(with_extra.fit_transform(samples),
                       np.hstack((zero_column, expected)))
    assert_array_equal(with_extra.classes_, [4, 1, 3, 2])

    # ensure fit is no-op as iterable is not consumed
    one_shot = iter(samples)
    from_iterator = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(from_iterator.fit(one_shot).transform(one_shot),
                       expected)

    # ensure a ValueError is thrown if given duplicate classes
    duplicate_message = (
        "The classes argument contains duplicate classes. Remove "
        "these duplicates before passing them to MultiLabelBinarizer."
    )
    with_duplicates = MultiLabelBinarizer(classes=[1, 3, 2, 3])
    assert_raise_message(ValueError, duplicate_message,
                         with_duplicates.fit, one_shot)
from nltk.corpus import reuters
from nltk.tokenize import word_tokenize
# Import from the public package; `sklearn.preprocessing.label` is a private
# module path and is not available in newer scikit-learn releases.
from sklearn.preprocessing import MultiLabelBinarizer

nltk.download('reuters')
nltk.download('punkt')

google_news_word2vec_model_location = 'data/GoogleNews-vectors-negative300.bin.gz'
doc2vec_model_location = 'model/doc2vec-model.bin'
doc2vec_dimensions = 300
classifier_model_location = 'model/classifier-model.bin'

doc2vec = Doc2Vec.load(doc2vec_model_location)

# Fit the label binarizer on the categories of every file in the corpus
labelBinarizer = MultiLabelBinarizer()
labelBinarizer.fit(
    [reuters.categories(fileId) for fileId in reuters.fileids()])

# Load the articles with their corresponding categories
train_articles = [{
    'raw': reuters.raw(fileId),
    'categories': reuters.categories(fileId)
} for fileId in reuters.fileids() if fileId.startswith('training/')]
test_articles = [{
    'raw': reuters.raw(fileId),
    'categories': reuters.categories(fileId)
} for fileId in reuters.fileids() if fileId.startswith('test/')]
shuffle(train_articles)
shuffle(test_articles)
def test_multilabel_binarizer_given_classes():
    """Exercise MultiLabelBinarizer with a caller-supplied ``classes`` list.

    The column order of the output must follow the given class order, not
    the sorted order of the labels.
    """
    samples = [(2, 3), (1,), (1, 2)]
    expected = np.array([[0, 1, 1],
                         [1, 0, 0],
                         [1, 0, 1]])

    # fit_transform()
    one_step = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(one_step.fit_transform(samples), expected)
    assert_array_equal(one_step.classes_, [1, 3, 2])

    # fit().transform()
    two_step = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(two_step.fit(samples).transform(samples), expected)
    assert_array_equal(two_step.classes_, [1, 3, 2])

    # An extra leading class (4) adds an all-zero first column.
    with_extra = MultiLabelBinarizer(classes=[4, 1, 3, 2])
    zero_column = [[0], [0], [0]]
    assert_array_equal(with_extra.fit_transform(samples),
                       np.hstack((zero_column, expected)))
    assert_array_equal(with_extra.classes_, [4, 1, 3, 2])

    # ensure fit is no-op as iterable is not consumed
    one_shot = iter(samples)
    from_iterator = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(from_iterator.fit(one_shot).transform(one_shot),
                       expected)
def test_multilabel_binarizer_non_unique():
    """Repeated labels within one sample collapse to a single indicator."""
    samples = [(1, 1, 1, 0)]
    expected = np.array([[1, 1]])
    binarizer = MultiLabelBinarizer()
    result = binarizer.fit_transform(samples)
    assert_array_equal(result, expected)