Example 1
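This method rebuilds the dataset in place: it stores the new parse files and vocabs, creates one Multibucket per vocab, uses a Bucketer to choose length-based splits over the input sentences (each split is widened by one to make room for the prepended ROOT symbol), and then indexes every sentence of every file into each multibucket before closing them.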
    def reinit(self, vocabs, parse_files):
        """ """

        self.preopen_parse_file = parse_files

        self._vocabs = vocabs
        self._multibuckets = [
            Multibucket.from_configurable(vocab,
                                          name='%s-%s' %
                                          (self.name, vocab.name))
            for vocab in self.vocabs
        ]
        self._metadata = []

        with Bucketer.from_configurable(self,
                                        self.n_buckets,
                                        name='bucketer-%s' %
                                        self.name) as bucketer:
            splits = bucketer.compute_splits(
                len(sent) for sent, metadata in self.iterfiles())
            for i in range(len(splits)):
                splits[i] += 1
        for multibucket, vocab in self.iteritems():
            multibucket.open(splits, depth=vocab.depth)
        for sent, metadata in self.iterfiles():
            self._metadata.append(metadata)
            for multibucket, vocab in self.iteritems():
                tokens = [line[vocab.conll_idx] for line in sent]
                idxs = [vocab.ROOT] + [vocab.index(token) for token in tokens]
                multibucket.add(idxs, tokens)
        for multibucket in self:
            multibucket.close()
        self._multibucket = Multibucket.from_dataset(self)

        return
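Below is a minimal, dependency-free sketch of the length-bucketing idea that the Bucketer encapsulates above; compute_splits and bucket_sentences here are illustrative stand-ins written for this example, not the library's API.

import bisect

def compute_splits(lengths, n_buckets):
    # Pick bucket boundary lengths at roughly evenly spaced quantiles.
    ordered = sorted(lengths)
    return sorted({ordered[(i + 1) * len(ordered) // n_buckets - 1]
                   for i in range(n_buckets)})

def bucket_sentences(sentences, n_buckets=2):
    # Assign each sentence to the smallest bucket whose boundary fits it.
    # The boundaries come from the same lengths, so every sentence fits.
    splits = compute_splits([len(sent) for sent in sentences], n_buckets)
    buckets = {split: [] for split in splits}
    for sent in sentences:
        buckets[splits[bisect.bisect_left(splits, len(sent))]].append(sent)
    return buckets

sents = [['a'], ['a', 'b'], ['a', 'b', 'c', 'd'], ['a', 'b', 'c', 'd', 'e', 'f']]
print(bucket_sentences(sents))
# {2: [['a'], ['a', 'b']], 6: [['a', 'b', 'c', 'd'], ['a', 'b', 'c', 'd', 'e', 'f']]}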
Example 2
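Here every token of the token vocab is mapped to its per-vocab subtoken indices, tokens are bucketed by the length of the first vocab's index list, and each token's bucket position is recorded in tok2idx, with the special tokens added first.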
    def index_tokens(self):
        """ """

        n_buckets = self.n_buckets
        tok2idxs = {
            token: [vocab.subtoken_indices(token) for vocab in self]
            for token in self.token_vocab.counts
        }
        with Bucketer.from_configurable(self,
                                        self.n_buckets,
                                        name='bucketer-%s' %
                                        self.name) as bucketer:
            splits = bucketer.compute_splits(
                len(indices[0]) for indices in list(tok2idxs.values()))
            bucketer.plot()
        with self.multibucket.open(splits, depth=len(self)):
            for index, special_token in enumerate(self.special_tokens):
                self.tok2idx[special_token] = self.multibucket.add([[index] *
                                                                    len(self)])
            for token, _ in self.sorted_counts(self.token_vocab.counts):
                indices = tok2idxs[token]
                sequence = [[
                    indices[i][j] for i in range(len(indices))
                    if j < len(indices[i])
                ] for j in range(len(indices[0]))]
                self.tok2idx[token] = self.multibucket.add(sequence)
        return
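The nested comprehension that builds sequence above transposes per-vocab subtoken index lists into per-position lists, skipping any vocab whose list is shorter than the current position. A stand-alone illustration of that reshaping (the function name is illustrative):

def transpose_ragged(indices):
    # indices[i][j] is the index of subtoken j under vocab i; output[j] lists
    # the indices at position j across all vocabs long enough to reach it.
    return [[indices[i][j] for i in range(len(indices)) if j < len(indices[i])]
            for j in range(len(indices[0]))]

print(transpose_ragged([[10, 11, 12, 13], [20, 21]]))
# -> [[10, 20], [11, 21], [12], [13]]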
Example 3
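The constructor performs the same bucketing as reinit above, with two additions: parse_files may be passed as an in-memory io.StringIO (kept in preopen_parse_file, since it does not fit the configparser-driven file handling), and an optional nlp_model popped from the keyword arguments is instantiated via from_configurable.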
    def __init__(self, vocabs, *args, **kwargs):
        """ """

        nlp_model = kwargs.pop('nlp_model', None)
        if "parse_files" in kwargs and isinstance(
                kwargs["parse_files"],
                io.StringIO):  ### SPECIAL CASE - PARSING StringIO
            self.preopen_parse_file = kwargs.pop(
                "parse_files"
            )  #This doesn't really play well with the configparser thing
        else:
            self.preopen_parse_file = None
        super(Dataset, self).__init__(*args, **kwargs)

        self._vocabs = vocabs
        self._multibuckets = [
            Multibucket.from_configurable(vocab,
                                          name='%s-%s' %
                                          (self.name, vocab.name))
            for vocab in self.vocabs
        ]
        self._metadata = []

        if nlp_model is not None:
            self._nlp_model = nlp_model.from_configurable(self, name=self.name)
        else:
            self._nlp_model = None

        with Bucketer.from_configurable(self,
                                        self.n_buckets,
                                        name='bucketer-%s' %
                                        self.name) as bucketer:
            splits = bucketer.compute_splits(
                len(sent) for sent, metadata in self.iterfiles())
            for i in range(len(splits)):
                splits[i] += 1
        for multibucket, vocab in self.iteritems():
            multibucket.open(splits, depth=vocab.depth)
        for sent, metadata in self.iterfiles():
            self._metadata.append(metadata)
            for multibucket, vocab in self.iteritems():
                tokens = [line[vocab.conll_idx] for line in sent]
                idxs = [vocab.ROOT] + [vocab.index(token) for token in tokens]
                multibucket.add(idxs, tokens)
        for multibucket in self:
            multibucket.close()
        self._multibucket = Multibucket.from_dataset(self)

        return
Example 4
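A simpler single-vocab variant of index_tokens: tokens are bucketed by the length of one subtoken index list, the UNK special token is remapped to META_UNK, and an inverse idx2tok mapping is built, with index 0 reserved for the padding token.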
    def index_tokens(self):
        """Bucket every token by its subtoken indices and build the
        token/bucket-index mappings."""

        self._tok2idx = {}
        tok2idxs = {
            token: self.subtoken_indices(token)
            for token in self.token_vocab.counts
        }
        with Bucketer.from_configurable(self,
                                        self.n_buckets,
                                        name='bucketer-%s' %
                                        self.name) as bucketer:
            splits = bucketer.compute_splits(
                len(indices) for indices in list(tok2idxs.values()))
        with self.multibucket.open(splits):
            for index, special_token in enumerate(self.token_vocab.special_tokens):
                # Map the UNK special token to its meta-level counterpart
                index = index if index != self.token_vocab.UNK else self.META_UNK
                self.tok2idx[special_token] = self.multibucket.add([index])
            for token, _ in self.sorted_counts(self.token_vocab.counts):
                self.tok2idx[token] = self.multibucket.add(tok2idxs[token])
        self._idx2tok = {idx: tok for tok, idx in self.tok2idx.items()}
        self._idx2tok[0] = self[self.PAD]
        return