Beispiel #1
0
    def __init__(self, n, token_vocab, *args, **kwargs):
        """Set up counts, the character sub-vocab, bucketing and embeddings.

        Args:
            n: stored as ``self._n``.
            token_vocab: stored as ``self._token_vocab``.
            *args: forwarded to the parent initializer.
            **kwargs: forwarded to the parent initializer after the two
                options below have been removed.

        Keyword options (consumed here, never forwarded):
            recount: when true, ``self.count()`` is always called and the
                vocab file is neither loaded nor written. Defaults to False.
            initialize_zero: when true, ``self.embeddings`` starts as an
                all-zero array; otherwise it is filled by
                ``np.random.randn``. Defaults to False.
        """

        # Strip our own options so the parent initializer never sees them.
        force_recount = kwargs.pop('recount', False)
        zero_init = kwargs.pop('initialize_zero', False)
        # NOTE(review): super() is anchored at TokenVocab, so method lookup
        # starts *after* TokenVocab in the MRO -- confirm this is intended.
        super(TokenVocab, self).__init__(*args, **kwargs)

        self._n = n
        self._token_vocab = token_vocab
        self._token_counts = Counter()
        self._subtoken_vocab = CharVocab.from_vocab(self.token_vocab)
        self._multibucket = Multibucket.from_configurable(
            self, embed_model=self.embed_model, name=self.name)

        # Populate the vocab: recount on request, otherwise reuse the file
        # at self.filename if it exists, otherwise count and write it out.
        if force_recount:
            self.count()
        elif os.path.isfile(self.filename):
            self.load()
        else:
            self.count()
            self.dump()
        self.index_vocab()

        # One embedding row per vocab entry, self.embed_size columns.
        shape = (len(self), self.embed_size)
        self.embeddings = (
            np.zeros(shape) if zero_init else np.random.randn(*shape))
Beispiel #2
0
    def __setattr__(self, key, value):
        """Set an attribute, deriving ``_conll_idx`` when ``_vocabs`` is set.

        When ``key`` is ``'_vocabs'``, ``value`` is iterated and the
        ``conll_idx`` of every element that has one is collected. Exactly
        one distinct value must result (checked with ``assert``); that value
        is stored as ``self._conll_idx`` before ``_vocabs`` itself is set.
        Every other key goes straight to the parent ``__setattr__``.
        """
        if key == '_vocabs':
            # Elements without a conll_idx attribute are simply ignored.
            distinct_idxs = {
                vocab.conll_idx for vocab in value
                if hasattr(vocab, 'conll_idx')
            }
            assert len(distinct_idxs) == 1
            # This assignment re-enters __setattr__ with key '_conll_idx',
            # which takes the plain pass-through path below.
            self._conll_idx = tuple(distinct_idxs)[0]
        super(Multivocab, self).__setattr__(key, value)


#***************************************************************
if __name__ == '__main__':
    # Smoke test: build a Multivocab from three component vocabs, index the
    # validation files, and print the indices of a few sample tokens.

    from parser.vocabs import PretrainedVocab, WordVocab, CharVocab, Multivocab

    config = Configurable()
    word_vocab = WordVocab.from_configurable(config)
    pretrained = PretrainedVocab.from_vocab(word_vocab)
    char_vocab = CharVocab.from_vocab(word_vocab)

    vocabs = [pretrained, word_vocab, char_vocab]
    multi = Multivocab.from_configurable(config, vocabs)
    multi.add_files(config.valid_files)
    multi.index_tokens()

    # str() is applied before %-formatting so that a tuple result (if
    # index() returns one) is printed whole instead of being unpacked as
    # format arguments.
    print("Indices for '<PAD>': %s" % str(multi.index('<PAD>')))
    print("Indices for 'the': %s" % str(multi.index('the')))
    print("Indices for 'The': %s" % str(multi.index('The')))
    print('Multivocab passes')