def __init__(self, n, token_vocab, *args, **kwargs):
    """Build this token vocab: gather counts, index tokens, and init embeddings.

    Args:
      n: integer id/position of this vocab (stored as ``self._n``; its exact
        semantics are defined by the caller — TODO confirm).
      token_vocab: base token vocabulary this vocab is derived from.
      recount (keyword-only, popped from kwargs): if True, always recount
        tokens from the data, ignoring any cached count file.
      initialize_zero (keyword-only, popped from kwargs): if True, initialize
        the embedding matrix to zeros instead of standard-normal noise.
    """
    recount = kwargs.pop('recount', False)
    initialize_zero = kwargs.pop('initialize_zero', False)
    super(TokenVocab, self).__init__(*args, **kwargs)

    self._n = n
    self._token_vocab = token_vocab
    self._token_counts = Counter()
    self._subtoken_vocab = CharVocab.from_vocab(self.token_vocab)
    self._multibucket = Multibucket.from_configurable(
        self, embed_model=self.embed_model, name=self.name)

    if recount:
        self.count()
    else:
        # Reuse cached counts when the dump file exists; otherwise count
        # from scratch and cache the result for the next run.
        if os.path.isfile(self.filename):
            self.load()
        else:
            self.count()
            self.dump()
    self.index_vocab()

    embed_dims = [len(self), self.embed_size]
    if initialize_zero:
        self.embeddings = np.zeros(embed_dims)
    else:
        self.embeddings = np.random.randn(*embed_dims)
def __setattr__(self, key, value):
    """Intercept assignment of '_vocabs' to derive the shared CoNLL column.

    Every sub-vocab that exposes a ``conll_idx`` must agree on it; the common
    value is cached on ``self._conll_idx`` before the attribute is stored.

    Raises:
      AssertionError: if the sub-vocabs disagree on (or none define) conll_idx.
    """
    if key == '_vocabs':
        conll_idxs = {vocab.conll_idx for vocab in value
                      if hasattr(vocab, 'conll_idx')}
        # All sub-vocabs must read from the same CoNLL column.
        assert len(conll_idxs) == 1, (
            'Sub-vocabs must share a single conll_idx, got %r' % (conll_idxs,))
        self._conll_idx = next(iter(conll_idxs))
    super(Multivocab, self).__setattr__(key, value)

#***************************************************************
if __name__ == '__main__':
    # Smoke test: build a Multivocab from word/pretrained/char vocabs,
    # index the validation files, and spot-check a few token lookups.
    from parser.vocabs import PretrainedVocab, WordVocab, CharVocab, Multivocab

    configurable = Configurable()
    token_vocab = WordVocab.from_configurable(configurable)
    pretrained_vocab = PretrainedVocab.from_vocab(token_vocab)
    subtoken_vocab = CharVocab.from_vocab(token_vocab)
    multivocab = Multivocab.from_configurable(
        configurable, [pretrained_vocab, token_vocab, subtoken_vocab])
    multivocab.add_files(configurable.valid_files)
    multivocab.index_tokens()
    print("Indices for '<PAD>': %s" % str(multivocab.index('<PAD>')))
    print("Indices for 'the': %s" % str(multivocab.index('the')))
    print("Indices for 'The': %s" % str(multivocab.index('The')))
    print('Multivocab passes')