Example #1
    def test_id2doc(self):
        # Tag ids should round-trip back to the original tag strings.
        docs = [['B-PSN'], ['B-ORG', 'I-ORG'], ['B-LOC', 'I-LOC', 'O']]
        vocab = Vocabulary(unk_token=False, lower=False)
        vocab.add_documents(docs)
        vocab.build()
        true_doc = ['O', 'B-LOC', 'O', 'O']
        doc_ids = vocab.doc2id(true_doc)
        pred_doc = vocab.id2doc(doc_ids)
        self.assertEqual(pred_doc, true_doc)
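Because the vocabulary is built with unk_token=False, the tag-to-id mapping is a bijection and id2doc is an exact inverse of doc2id. A minimal stand-alone sketch of that round-trip property, under the same Vocabulary API assumptions as the test:

# Minimal round-trip sketch (assumes the Vocabulary API used in the test).
vocab = Vocabulary(unk_token=False, lower=False)
vocab.add_documents([['B-PER', 'I-PER', 'O']])
vocab.build()
tags = ['O', 'B-PER', 'I-PER']
assert vocab.id2doc(vocab.doc2id(tags)) == tags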
Example #2
    def fit(self, X, y, y_class=None):
        """Learn vocabulary from the training set.

        Args:
            X : iterable. An iterable which yields either str, unicode or file objects.
            y : iterable. Label strings.
            y_class : iterable, optional. Per-category collections of label
                sequences; a separate label vocabulary is built for each category.

        Returns:
            self : IndexTransformer.
        """
        self._word_vocab.add_documents(X)
        self._label_vocab.add_documents(y)
        if self._use_char:
            for doc in X:
                self._char_vocab.add_documents(doc)
                
        if y_class is not None:
            # Build one label vocabulary per auxiliary category.
            for cat in y_class:
                cat_vocab = Vocabulary(lower=False, unk_token='UNK')
                cat_vocab.add_documents(cat)
                cat_vocab.build()
                self._label_class_vocab.append(cat_vocab)

        self._word_vocab.build()
        self._char_vocab.build()
        self._label_vocab.build()

        return self
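A hedged usage sketch for this variant: y_class is read as an iterable of per-category label collections, each itself a list of tag sequences. The sketch assumes a matching __init__ (not shown here) that creates self._label_class_vocab = []; all sentence and tag names below are invented for illustration.

# Hypothetical call of the fit() variant above (toy data, invented tags).
X = [['John', 'lives', 'in', 'Paris']]
y = [['B-PER', 'O', 'O', 'B-LOC']]
y_class = [
    [['PERSON', 'O', 'O', 'O']],     # category 1: tag sequences aligned with X
    [['O', 'O', 'O', 'LOCATION']],   # category 2
]

p = IndexTransformer()
p._label_class_vocab = []  # assumed to be set up by this variant's __init__
p.fit(X, y, y_class=y_class)
print(len(p._label_class_vocab))  # -> 2, one Vocabulary per category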
Example #3
    def __init__(self,
                 lower=True,
                 num_norm=True,
                 use_char=True,
                 initial_vocab=None,
                 max_size=None,
                 min_freq=1,
                 alphanumerical=False,
                 max_sent_len=100,
                 max_word_len=20):
        """Create a preprocessor object.

        Args:
            lower: boolean. Whether to convert the texts to lowercase.
            use_char: boolean. Whether to use char feature.
            num_norm: boolean. Whether to normalize text.
            initial_vocab: Iterable. Initial vocabulary for expanding word_vocab.
        """
        self._num_norm = num_norm
        self._use_char = use_char
        self._word_vocab = Vocabulary(lower=lower,
                                      max_size=max_size,
                                      min_freq=min_freq,
                                      alphanumerical=alphanumerical)
        self._char_vocab = Vocabulary(lower=False)
        self._label_vocab = Vocabulary(lower=False, unk_token=False)

        # currently not implemented
        self._max_sent_len = max_sent_len
        self._max_word_len = max_word_len

        if initial_vocab:
            self._word_vocab.add_documents([initial_vocab])
            self._char_vocab.add_documents(initial_vocab)
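The extra constructor knobs in this variant can be exercised as follows; a hedged construction sketch with invented values (the class name IndexTransformer is assumed from the surrounding examples):

# Hypothetical construction of this extended variant (values invented).
p = IndexTransformer(lower=True,
                     use_char=True,
                     max_size=50000,    # cap the word vocabulary
                     min_freq=2,        # drop rare tokens
                     alphanumerical=False,
                     max_sent_len=100,  # currently not implemented
                     max_word_len=20)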
Example #4
# Imports assumed for this snippet (module paths are a best guess and may
# need adjusting for your project layout):
import joblib
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

from .utils import Vocabulary, pad_nested_sequences


class IndexTransformer(BaseEstimator, TransformerMixin):
    """Convert a collection of raw documents to a document id matrix.

    Attributes:
        _use_char: boolean. Whether to use char feature.
        _num_norm: boolean. Whether to normalize text.
        _word_vocab: Vocabulary. A mapping of words to feature indices.
        _char_vocab: Vocabulary. A mapping of chars to feature indices.
        _label_vocab: Vocabulary. A mapping of labels to feature indices.
    """

    def __init__(self, lower=True, num_norm=True,
                 use_char=True, initial_vocab=None):
        """Create a preprocessor object.

        Args:
            lower: boolean. Whether to convert the texts to lowercase.
            use_char: boolean. Whether to use char feature.
            num_norm: boolean. Whether to normalize text.
            initial_vocab: Iterable. Initial vocabulary for expanding word_vocab.
        """
        self._num_norm = num_norm
        self._use_char = use_char
        self._word_vocab = Vocabulary(lower=lower)
        self._char_vocab = Vocabulary(lower=False)
        self._label_vocab = Vocabulary(lower=False, unk_token=False)

        if initial_vocab:
            self._word_vocab.add_documents([initial_vocab])
            self._char_vocab.add_documents(initial_vocab)

    def fit(self, X, y):
        """Learn vocabulary from training set.

        Args:
            X : iterable. An iterable which yields either str, unicode or file objects.

        Returns:
            self : IndexTransformer.
        """
        self._word_vocab.add_documents(X)
        self._label_vocab.add_documents(y)
        if self._use_char:
            for doc in X:
                self._char_vocab.add_documents(doc)

        self._word_vocab.build()
        self._char_vocab.build()
        self._label_vocab.build()

        return self

    def transform(self, X, y=None):
        """Transform documents to document ids.

        Uses the vocabulary learned by fit.

        Args:
            X : iterable. An iterable which yields either str, unicode or file objects.
            y : iterable. Label strings.

        Returns:
            features: document id matrix.
            y: label id matrix.
        """
        word_ids = [self._word_vocab.doc2id(doc) for doc in X]
        word_ids = pad_sequences(word_ids, padding='post')

        if self._use_char:
            char_ids = [[self._char_vocab.doc2id(w) for w in doc] for doc in X]
            char_ids = pad_nested_sequences(char_ids)
            features = [word_ids, char_ids]
        else:
            features = word_ids

        if y is not None:
            y = [self._label_vocab.doc2id(doc) for doc in y]
            y = pad_sequences(y, padding='post')
            y = to_categorical(y, self.label_size).astype(int)
            # As of 2018-06-01, to_categorical drops the time dimension for
            # length-one sequences:
            # >>> to_categorical([[1, 3]], num_classes=4).shape
            # (1, 2, 4)
            # >>> to_categorical([[1]], num_classes=4).shape
            # (1, 4)
            # So the dimensions are expanded when len(y.shape) == 2.
            y = y if len(y.shape) == 3 else np.expand_dims(y, axis=0)
            return features, y
        else:
            return features

    def fit_transform(self, X, y=None, **params):
        """Learn vocabulary and return document id matrix.

        This is equivalent to fit followed by transform.

        Args:
            X : iterable. An iterable which yields either str, unicode or file objects.
            y : iterable. Label strings.

        Returns:
            list: document id matrix.
            list: label id matrix.
        """
        return self.fit(X, y).transform(X, y)

    def inverse_transform(self, y, lengths=None):
        """Return label strings.

        Args:
            y: label id matrix.
            lengths: sentence lengths, used to strip padding.

        Returns:
            list: list of list of strings.
        """
        y = np.argmax(y, -1)
        inverse_y = [self._label_vocab.id2doc(ids) for ids in y]
        if lengths is not None:
            inverse_y = [iy[:l] for iy, l in zip(inverse_y, lengths)]

        return inverse_y

    @property
    def word_vocab_size(self):
        return len(self._word_vocab)

    @property
    def char_vocab_size(self):
        return len(self._char_vocab)

    @property
    def label_size(self):
        return len(self._label_vocab)

    def save(self, file_path):
        joblib.dump(self, file_path)

    @classmethod
    def load(cls, file_path):
        return joblib.load(file_path)
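A hedged end-to-end sketch of the class above on toy NER data; sentences and tags are invented, and the shape comments assume the post-padding behaviour shown in transform:

# Hypothetical end-to-end usage (toy data, invented tags).
X = [['EU', 'rejects', 'German', 'call'],
     ['Peter', 'Blackburn']]
y = [['B-ORG', 'O', 'B-MISC', 'O'],
     ['B-PER', 'I-PER']]

p = IndexTransformer(use_char=True)
features, labels = p.fit_transform(X, y)
word_ids, char_ids = features
# word_ids: (2, 4) padded word-id matrix; char_ids adds a char dimension.
# labels: (2, 4, label_size) one-hot label tensor.

# Recover tag strings from one-hot output, trimming the padding.
tags = p.inverse_transform(labels, lengths=[len(seq) for seq in y])
assert tags == y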
Example #5
    def test_doc2id(self):
        # word ids.
        docs = [['a'], ['a', 'b'], ['a', 'b', 'c']]
        vocab = Vocabulary()
        vocab.add_documents(docs)
        vocab.build()
        another_doc = ['a', 'b', 'c', 'd']
        doc_ids = vocab.doc2id(another_doc)
        self.assertEqual(doc_ids, [1, 2, 3, 4])

        # char_ids.
        docs = ['hoge', 'fuga', 'bar']
        vocab = Vocabulary()
        vocab.add_documents(docs)
        vocab.build()
        doc_ids = vocab.doc2id(docs[0])
        correct = [vocab.token_to_id(c) for c in docs[0]]
        self.assertEqual(doc_ids, correct)
Example #6
    def test_add_documents(self):
        # word vocabulary.
        docs = [['a'], ['a', 'b'], ['a', 'b', 'c']]
        token2id = {'<pad>': 0, 'a': 1, 'b': 2, 'c': 3, '<unk>': 4}
        vocab = Vocabulary()
        vocab.add_documents(docs)
        vocab.build()
        self.assertEqual(vocab._token2id, token2id)

        token2id = {'<pad>': 0, 'a': 1, 'b': 2, 'c': 3}
        vocab = Vocabulary(unk_token=False)
        vocab.add_documents(docs)
        vocab.build()
        self.assertEqual(vocab._token2id, token2id)

        token2id = {'<pad>': 0, '<s>': 1, 'a': 2, 'b': 3, 'c': 4}
        vocab = Vocabulary(unk_token=False, specials=('<pad>', '<s>'))
        vocab.add_documents(docs)
        vocab.build()
        self.assertEqual(vocab._token2id, token2id)

        token2id = {'a': 0, 'b': 1, 'c': 2}
        vocab = Vocabulary(unk_token=False, specials=())
        vocab.add_documents(docs)
        vocab.build()
        self.assertEqual(vocab._token2id, token2id)

        # char vocabulary.
        docs = ['hoge', 'fuga', 'bar']
        vocab = Vocabulary()
        vocab.add_documents(docs)
        vocab.build()
        num_chars = len(set(''.join(docs))) + 2
        self.assertEqual(len(vocab._token2id), num_chars)
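The Vocabulary class that all of these examples depend on is never shown. Below is a minimal reconstruction consistent with the tests above; it is an assumption-based sketch (the real implementation, e.g. anago's, differs in detail) and covers only the behaviour the tests exercise (min_freq and alphanumerical from Example #3 are omitted):

from collections import Counter

class Vocabulary:
    """Minimal sketch of the Vocabulary API used above (a reconstruction,
    not the library's actual code)."""

    def __init__(self, max_size=None, lower=True, unk_token=True,
                 specials=('<pad>',)):
        self._max_size = max_size
        self._lower = lower
        self._unk = unk_token  # True, False, or a custom token string
        self._token2id = {token: i for i, token in enumerate(specials)}
        self._id2token = list(specials)
        self._token_count = Counter()

    def add_documents(self, docs):
        # Each document is a sequence of tokens; a plain string counts
        # as a sequence of characters.
        for doc in docs:
            doc = map(str.lower, doc) if self._lower else doc
            self._token_count.update(doc)

    def build(self):
        # Most frequent tokens first; the unknown token, if any, goes last.
        for token, _ in self._token_count.most_common(self._max_size):
            self._token2id[token] = len(self._token2id)
            self._id2token.append(token)
        if self._unk:
            unk = '<unk>' if self._unk is True else self._unk
            self._token2id[unk] = len(self._token2id)
            self._id2token.append(unk)

    @property
    def vocab(self):
        return self._token2id

    def token_to_id(self, token):
        # Unknown tokens fall back to the last id (the unk token, if present).
        token = token.lower() if self._lower else token
        return self._token2id.get(token, len(self._token2id) - 1)

    def doc2id(self, doc):
        return [self.token_to_id(t) for t in doc]

    def id2doc(self, ids):
        return [self._id2token[i] for i in ids]

    def __len__(self):
        return len(self._token2id)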