def __init__(self, vocabs, *args, **kwargs): """""" # nlp_model = Parser nlp_model = kwargs.pop('nlp_model', None) #print ("---dataset.py---\n",nlp_model) super(Dataset, self).__init__(*args, **kwargs) self._vocabs = vocabs self._multibuckets = [Multibucket.from_configurable(vocab, name='%s-%s'%(self.name, vocab.name)) for vocab in self.vocabs] if nlp_model is not None: self._nlp_model = nlp_model.from_configurable(self, name=self.name) else: self._nlp_model = None #print ("---dataset.py---after\n",nlp_model) with Bucketer.from_configurable(self, self.n_buckets, name='bucketer-%s'%self.name) as bucketer: splits = bucketer.compute_splits(len(sent) for sent in self.iterfiles()) for i in xrange(len(splits)): splits[i] += 1 for multibucket, vocab in self.iteritems(): multibucket.open(splits, depth=vocab.depth) for sent in self.iterfiles(): for multibucket, vocab in self.iteritems(): tokens = [line[vocab.conll_idx] for line in sent] idxs = [vocab.ROOT] + [vocab.index(token) for token in tokens] multibucket.add(idxs, tokens) for multibucket in self: multibucket.close() self._multibucket = Multibucket.from_dataset(self) return
def __init__(self, token_vocab, *args, **kwargs):
    """Build the token vocabulary and its embedding matrix.

    Pops 'recount' and 'initialize_zero' from kwargs. Token counts are
    loaded from the cache file when present (unless a recount is forced),
    otherwise computed and cached. The embedding matrix is initialized to
    zeros or to standard-normal noise depending on 'initialize_zero'.
    """
    recount = kwargs.pop('recount', False)
    initialize_zero = kwargs.pop('initialize_zero', False)
    super(TokenVocab, self).__init__(*args, **kwargs)

    self._token_vocab = token_vocab
    self._token_counts = Counter()
    self._multibucket = Multibucket.from_configurable(
        self, embed_model=self.embed_model, name=self.name)
    self._tok2idx = {}

    # Prefer the cached counts on disk unless a recount was requested.
    if not recount and os.path.isfile(self.filename):
        self.load()
    else:
        self.count()
        if not recount:
            # Counts were computed because no cache existed; write one.
            self.dump()
    self.index_vocab()

    # len(self) is only meaningful after index_vocab().
    embed_dims = [len(self), self.embed_size]
    self.embeddings = np.zeros(embed_dims) if initialize_zero else np.random.randn(*embed_dims)
    return
def __init__(self, token_vocab, *args, **kwargs):
    """Wrap a token vocab with one NgramVocab per n-gram order up to max_n.

    token_vocab -- the underlying token-level vocab shared by every
                   constituent NgramVocab.
    """
    super(BaseVocab, self).__init__(*args, **kwargs)
    # NOTE(review): reads `cased` via super(BaseVocab, self), deliberately
    # skipping any override between this class and BaseVocab in the MRO —
    # confirm which class's property this resolves to.
    self._cased = super(BaseVocab, self).cased
    # Explicit SubtokenVocab.__setattr__ call — presumably to bypass an
    # overridden __setattr__ on this class; TODO confirm.
    SubtokenVocab.__setattr__(self, '_token_vocab', token_vocab)
    self._multibucket = Multibucket.from_configurable(self, embed_model=self.embed_model, name=self.name)
    # One NgramVocab for each order 1..max_n, all sharing token_vocab and
    # this vocab's casing.
    self._vocabs = [NgramVocab.from_vocab(self.token_vocab, i+1, cased=self.cased) for i in xrange(self.max_n)]
    # Same MRO-skipping read as `cased` above.
    self._special_tokens = super(BaseVocab, self).special_tokens
    self._special_tokens_set = set(self._special_tokens)
    # Borrow SubtokenVocab's special-token setup rather than this class's.
    SubtokenVocab._set_special_tokens(self)
    self._tok2idx = {}
    # Sanity check: every constituent vocab must share the exact same
    # underlying token vocab (identity, not equality).
    for vocab in self:
        assert vocab.token_vocab is self.token_vocab
    return