def reinit(self, vocabs, parse_files):
    """Re-bind vocabs and parse files, then rebuild every multibucket from scratch.

    Mirrors the bucketing phase of ``__init__``: split points are computed from
    sentence lengths, each (multibucket, vocab) pair is filled column-by-column
    from the CoNLL files, and a merged multibucket is derived at the end.
    """
    self.preopen_parse_file = parse_files
    self._vocabs = vocabs
    self._multibuckets = []
    for vocab in self.vocabs:
        bucket = Multibucket.from_configurable(vocab, name='%s-%s' % (self.name, vocab.name))
        self._multibuckets.append(bucket)
    self._metadata = []
    # Derive bucket split points from sentence lengths; the +1 makes room for
    # the ROOT token prepended to every index sequence below.
    with Bucketer.from_configurable(self, self.n_buckets, name='bucketer-%s' % self.name) as bucketer:
        splits = bucketer.compute_splits(len(sent) for sent, metadata in self.iterfiles())
        for pos, split in enumerate(splits):
            splits[pos] = split + 1
    for bucket, vocab in self.iteritems():
        bucket.open(splits, depth=vocab.depth)
    for sent, metadata in self.iterfiles():
        self._metadata.append(metadata)
        for bucket, vocab in self.iteritems():
            # One column of the CoNLL sentence, as selected by this vocab.
            column = [line[vocab.conll_idx] for line in sent]
            bucket.add([vocab.ROOT] + [vocab.index(token) for token in column], column)
    for bucket in self:
        bucket.close()
    self._multibucket = Multibucket.from_dataset(self)
    return
def index_tokens(self):
    """Assign every known token a multibucket position, keyed in ``self.tok2idx``.

    Each token is expanded to per-vocab subtoken index sequences, tokens are
    bucketed by the length of the first vocab's sequence, and the bucket
    position returned by ``multibucket.add`` is recorded per token.
    """
    # NOTE: the original assigned an unused local ``n_buckets = self.n_buckets``;
    # it is dropped here.
    tok2idxs = {token: [vocab.subtoken_indices(token) for vocab in self]
                for token in self.token_vocab.counts}
    with Bucketer.from_configurable(self, self.n_buckets, name='bucketer-%s' % self.name) as bucketer:
        # No need to materialize .values() into a list just to iterate it once.
        splits = bucketer.compute_splits(len(indices[0]) for indices in tok2idxs.values())
        bucketer.plot()
    with self.multibucket.open(splits, depth=len(self)):
        # Special tokens get one row whose entries repeat their enum position.
        for index, special_token in enumerate(self.special_tokens):
            self.tok2idx[special_token] = self.multibucket.add([[index] * len(self)])
        for token, _ in self.sorted_counts(self.token_vocab.counts):
            indices = tok2idxs[token]
            # Transpose [vocab][position] -> [position][vocab], truncated to the
            # first vocab's sequence length; shorter sequences simply drop out
            # of the later rows (same semantics as the original comprehension).
            sequence = [[seq[j] for seq in indices if j < len(seq)]
                        for j in range(len(indices[0]))]
            self.tok2idx[token] = self.multibucket.add(sequence)
    return
def __init__(self, vocabs, *args, **kwargs):
    """Configure the dataset, then read and bucket every sentence in its files."""
    nlp_model = kwargs.pop('nlp_model', None)
    # SPECIAL CASE - PARSING StringIO: an in-memory parse file does not play
    # well with the configparser machinery, so capture it before the super call.
    if isinstance(kwargs.get("parse_files"), io.StringIO):
        self.preopen_parse_file = kwargs.pop("parse_files")
    else:
        self.preopen_parse_file = None
    super(Dataset, self).__init__(*args, **kwargs)
    self._vocabs = vocabs
    self._metadata = []
    self._multibuckets = [
        Multibucket.from_configurable(vocab, name='%s-%s' % (self.name, vocab.name))
        for vocab in self.vocabs
    ]
    if nlp_model is None:
        self._nlp_model = None
    else:
        self._nlp_model = nlp_model.from_configurable(self, name=self.name)
    # Derive bucket split points from sentence lengths; the +1 makes room for
    # the ROOT token prepended to every index sequence below.
    with Bucketer.from_configurable(self, self.n_buckets, name='bucketer-%s' % self.name) as bucketer:
        splits = bucketer.compute_splits(len(sent) for sent, metadata in self.iterfiles())
        for pos, split in enumerate(splits):
            splits[pos] = split + 1
    for bucket, vocab in self.iteritems():
        bucket.open(splits, depth=vocab.depth)
    for sent, metadata in self.iterfiles():
        self._metadata.append(metadata)
        for bucket, vocab in self.iteritems():
            # One column of the CoNLL sentence, as selected by this vocab.
            column = [line[vocab.conll_idx] for line in sent]
            bucket.add([vocab.ROOT] + [vocab.index(token) for token in column], column)
    for bucket in self:
        bucket.close()
    self._multibucket = Multibucket.from_dataset(self)
    return
def index_tokens(self):
    """Assign every known token a multibucket position.

    Tokens are bucketed by subtoken-sequence length; the resulting positions
    populate ``self.tok2idx`` and its inverse ``self._idx2tok``.
    """
    self._tok2idx = {}
    tok2idxs = {token: self.subtoken_indices(token)
                for token in self.token_vocab.counts}
    with Bucketer.from_configurable(self, self.n_buckets, name='bucketer-%s' % self.name) as bucketer:
        # No need to materialize .values() into a list just to iterate it once.
        splits = bucketer.compute_splits(len(indices) for indices in tok2idxs.values())
    with self.multibucket.open(splits):
        for index, special_token in enumerate(self.token_vocab.special_tokens):
            # UNK is remapped to META_UNK; every other special token keeps its
            # enum position. (The original rebound the loop variable in place.)
            row = self.META_UNK if index == self.token_vocab.UNK else index
            self.tok2idx[special_token] = self.multibucket.add([row])
        for token, _ in self.sorted_counts(self.token_vocab.counts):
            self.tok2idx[token] = self.multibucket.add(tok2idxs[token])
    self._idx2tok = {idx: tok for tok, idx in self.tok2idx.items()}
    # Index 0 always resolves to the PAD entry.
    self._idx2tok[0] = self[self.PAD]
    return
return self._splits != value def __enter__(self): self._len2cnt = {} self._lengths = [] self._counts = [] self._splits = [] self._lidxs = [] return self def __exit__(self, exception_type, exception_value, traceback): if exception_type is not None: raise exception_type(exception_value) return True #*************************************************************** if __name__ == '__main__': """ """ from nparser import Configurable from nparser.misc.bucketer import Bucketer from scipy.stats import truncnorm with Bucketer(5) as bucketer: print(bucketer.compute_splits([[0] * np.int(truncnorm(0, 10, scale=5).rvs()) for _ in range(1000)]), file=sys.stderr) bucketer.plot()