def fit(self, X, y, y_class=None):
    """Learn vocabulary from the training set.

    Args:
        X: iterable. An iterable which yields either str, unicode or file objects.
        y: iterable. Label sequences.
        y_class: iterable or None. One collection of label documents per
            extra label category; a separate vocabulary is built for each.

    Returns:
        self: IndexTransformer.
    """
    self._word_vocab.add_documents(X)
    self._label_vocab.add_documents(y)
    if self._use_char:
        for doc in X:
            self._char_vocab.add_documents(doc)

    if y_class is not None:
        for cat in y_class:
            cat_vocab = Vocabulary(lower=False, unk_token='UNK')
            cat_vocab.add_documents(cat)
            cat_vocab.build()
            self._label_class_vocab.append(cat_vocab)

    self._word_vocab.build()
    self._char_vocab.build()
    self._label_vocab.build()

    return self
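# A minimal usage sketch for the extended fit above, on hypothetical toy data
# (the sentences, tags, and category labels here are invented for
# illustration). It shows the expected shapes: one label sequence per sentence
# in y, and one collection of label documents per extra category in y_class,
# each of which gets its own Vocabulary.
#
#   X = [['EU', 'rejects', 'German', 'call'], ['Peter', 'Blackburn']]
#   y = [['B-ORG', 'O', 'B-MISC', 'O'], ['B-PER', 'I-PER']]
#   y_class = [[['NEWS'], ['NEWS']], [['en'], ['en']]]
#
#   p = IndexTransformer()
#   p.fit(X, y, y_class=y_class)  # builds one extra vocabulary per category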
def test_add_documents(self):
    # word vocabulary.
    docs = [['a'], ['a', 'b'], ['a', 'b', 'c']]

    token2id = {'<pad>': 0, 'a': 1, 'b': 2, 'c': 3, '<unk>': 4}
    vocab = Vocabulary()
    vocab.add_documents(docs)
    vocab.build()
    self.assertEqual(vocab._token2id, token2id)

    token2id = {'<pad>': 0, 'a': 1, 'b': 2, 'c': 3}
    vocab = Vocabulary(unk_token=False)
    vocab.add_documents(docs)
    vocab.build()
    self.assertEqual(vocab._token2id, token2id)

    token2id = {'<pad>': 0, '<s>': 1, 'a': 2, 'b': 3, 'c': 4}
    vocab = Vocabulary(unk_token=False, specials=('<pad>', '<s>'))
    vocab.add_documents(docs)
    vocab.build()
    self.assertEqual(vocab._token2id, token2id)

    token2id = {'a': 0, 'b': 1, 'c': 2}
    vocab = Vocabulary(unk_token=False, specials=())
    vocab.add_documents(docs)
    vocab.build()
    self.assertEqual(vocab._token2id, token2id)

    # char vocabulary.
    docs = ['hoge', 'fuga', 'bar']
    vocab = Vocabulary()
    vocab.add_documents(docs)
    vocab.build()
    num_chars = len(set(''.join(docs))) + 2  # unique chars + <pad> + <unk>
    self.assertEqual(len(vocab._token2id), num_chars)
def test_id2doc(self):
    # label ids: doc2id and id2doc should be inverse mappings.
    docs = [['B-PSN'], ['B-ORG', 'I-ORG'], ['B-LOC', 'I-LOC', 'O']]
    vocab = Vocabulary(unk_token=False, lower=False)
    vocab.add_documents(docs)
    vocab.build()

    true_doc = ['O', 'B-LOC', 'O', 'O']
    doc_ids = vocab.doc2id(true_doc)
    pred_doc = vocab.id2doc(doc_ids)
    self.assertEqual(pred_doc, true_doc)
def test_doc2id(self):
    # word ids: the unseen token 'd' maps to the <unk> id.
    docs = [['a'], ['a', 'b'], ['a', 'b', 'c']]
    vocab = Vocabulary()
    vocab.add_documents(docs)
    vocab.build()

    another_doc = ['a', 'b', 'c', 'd']
    doc_ids = vocab.doc2id(another_doc)
    self.assertEqual(doc_ids, [1, 2, 3, 4])

    # char ids.
    docs = ['hoge', 'fuga', 'bar']
    vocab = Vocabulary()
    vocab.add_documents(docs)
    vocab.build()
    doc_ids = vocab.doc2id(docs[0])
    correct = [vocab.token_to_id(c) for c in docs[0]]
    self.assertEqual(doc_ids, correct)
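# The tests above pin down the Vocabulary contract: specials first, then
# tokens in descending frequency order, then an optional unknown token, with
# doc2id/id2doc as inverse mappings. What follows is a minimal sketch of an
# implementation satisfying those tests; the real class may differ in details
# such as tie-breaking or how it stores a custom unk_token string.
from collections import Counter

class Vocabulary:
    def __init__(self, lower=True, unk_token=True, specials=('<pad>',)):
        self._token_count = Counter()
        self._token2id = {}
        self._id2token = []
        self._lower = lower
        self._unk = unk_token
        self._specials = specials

    def add_documents(self, docs):
        # Each document is an iterable of tokens: a list of words, or a
        # string iterated as characters.
        for doc in docs:
            self._token_count.update(t.lower() if self._lower else t for t in doc)

    def build(self):
        # Specials first, then tokens by frequency, then the unknown token.
        self._id2token = list(self._specials)
        self._id2token.extend(t for t, _ in self._token_count.most_common())
        if self._unk:
            self._id2token.append(self._unk if isinstance(self._unk, str) else '<unk>')
        self._token2id = {t: i for i, t in enumerate(self._id2token)}

    def token_to_id(self, token):
        token = token.lower() if self._lower else token
        # Unseen tokens fall back to the last id, which is the unknown
        # token when unk_token is enabled.
        return self._token2id.get(token, len(self._token2id) - 1)

    def doc2id(self, doc):
        return [self.token_to_id(t) for t in doc]

    def id2doc(self, ids):
        return [self._id2token[i] for i in ids]

    def __len__(self):
        return len(self._token2id)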
# Assumed imports for this module; Vocabulary and pad_nested_sequences are
# defined alongside this class.
import joblib
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical


class IndexTransformer(BaseEstimator, TransformerMixin):
    """Convert a collection of raw documents to a document id matrix.

    Attributes:
        _use_char: boolean. Whether to use char features.
        _num_norm: boolean. Whether to normalize numbers in the text.
        _word_vocab: Vocabulary. A mapping of words to feature indices.
        _char_vocab: Vocabulary. A mapping of chars to feature indices.
        _label_vocab: Vocabulary. A mapping of labels to feature indices.
    """

    def __init__(self, lower=True, num_norm=True, use_char=True, initial_vocab=None):
        """Create a preprocessor object.

        Args:
            lower: boolean. Whether to convert the texts to lowercase.
            num_norm: boolean. Whether to normalize numbers in the text.
            use_char: boolean. Whether to use char features.
            initial_vocab: iterable. Initial vocabulary for expanding word_vocab.
        """
        self._num_norm = num_norm
        self._use_char = use_char
        self._word_vocab = Vocabulary(lower=lower)
        self._char_vocab = Vocabulary(lower=False)
        self._label_vocab = Vocabulary(lower=False, unk_token=False)

        if initial_vocab:
            self._word_vocab.add_documents([initial_vocab])
            self._char_vocab.add_documents(initial_vocab)

    def fit(self, X, y):
        """Learn vocabulary from the training set.

        Args:
            X: iterable. An iterable which yields either str, unicode or file objects.
            y: iterable. Label sequences.

        Returns:
            self: IndexTransformer.
        """
        self._word_vocab.add_documents(X)
        self._label_vocab.add_documents(y)
        if self._use_char:
            for doc in X:
                self._char_vocab.add_documents(doc)

        self._word_vocab.build()
        self._char_vocab.build()
        self._label_vocab.build()

        return self

    def transform(self, X, y=None):
        """Transform documents to document ids.

        Uses the vocabulary learned by fit.

        Args:
            X: iterable. An iterable which yields either str, unicode or file objects.
            y: iterable. Label strings.

        Returns:
            features: document id matrix.
            y: label id matrix.
        """
        word_ids = [self._word_vocab.doc2id(doc) for doc in X]
        word_ids = pad_sequences(word_ids, padding='post')

        if self._use_char:
            char_ids = [[self._char_vocab.doc2id(w) for w in doc] for doc in X]
            char_ids = pad_nested_sequences(char_ids)
            features = [word_ids, char_ids]
        else:
            features = word_ids

        if y is not None:
            y = [self._label_vocab.doc2id(doc) for doc in y]
            y = pad_sequences(y, padding='post')
            y = to_categorical(y, self.label_size).astype(int)
            # As of 2018/06/01, to_categorical squeezes singleton sequences:
            # >>> to_categorical([[1, 3]], num_classes=4).shape
            # (1, 2, 4)
            # >>> to_categorical([[1]], num_classes=4).shape
            # (1, 4)
            # So expand the dimensions when len(y.shape) == 2.
            y = y if len(y.shape) == 3 else np.expand_dims(y, axis=0)
            return features, y
        else:
            return features

    def fit_transform(self, X, y=None, **params):
        """Learn vocabulary and return document id matrix.

        This is equivalent to fit followed by transform.

        Args:
            X: iterable. An iterable which yields either str, unicode or file objects.
            y: iterable. Label strings.

        Returns:
            features: document id matrix.
            y: label id matrix.
        """
        return self.fit(X, y).transform(X, y)

    def inverse_transform(self, y, lengths=None):
        """Return label strings.

        Args:
            y: label id matrix.
            lengths: sentence lengths.

        Returns:
            list: list of list of strings.
        """
        y = np.argmax(y, -1)
        inverse_y = [self._label_vocab.id2doc(ids) for ids in y]
        if lengths is not None:
            inverse_y = [iy[:l] for iy, l in zip(inverse_y, lengths)]

        return inverse_y

    @property
    def word_vocab_size(self):
        return len(self._word_vocab)

    @property
    def char_vocab_size(self):
        return len(self._char_vocab)

    @property
    def label_size(self):
        return len(self._label_vocab)

    def save(self, file_path):
        joblib.dump(self, file_path)

    @classmethod
    def load(cls, file_path):
        p = joblib.load(file_path)
        return p
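# A minimal end-to-end sketch of the IndexTransformer API above, on a
# hypothetical toy dataset; the sentences, tags, and file path are invented
# for illustration and only show the expected input/output shapes.
#
#   X = [['EU', 'rejects', 'German', 'call'], ['Peter', 'Blackburn']]
#   y = [['B-ORG', 'O', 'B-MISC', 'O'], ['B-PER', 'I-PER']]
#
#   p = IndexTransformer(use_char=True)
#   features, y_ids = p.fit_transform(X, y)
#   word_ids, char_ids = features  # padded word and char id matrices
#
#   # Map one-hot label matrices back to tag strings, trimming the padding
#   # with the true sentence lengths.
#   tags = p.inverse_transform(y_ids, lengths=[4, 2])
#
#   p.save('preprocessor.pkl')
#   p = IndexTransformer.load('preprocessor.pkl')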