Example #1
    def index_tokens(self):
        """Bucket each token's subtoken indices and build the tok2idx / idx2tok maps."""

        self._tok2idx = {}
        # {"apple":[12,6,6,2,8],"banana":[...]}
        # the index of each char in a word
        tok2idxs = {
            token: self.subtoken_indices(token)
            for token in self.token_vocab.counts
        }
        with Bucketer.from_configurable(self,
                                        self.n_buckets,
                                        name='bucketer-%s' %
                                        self.name) as bucketer:
            splits = bucketer.compute_splits(
                len(indices) for indices in tok2idxs.values())
        with self.multibucket.open(splits):
            for index, special_token in enumerate(
                    self.token_vocab.special_tokens):
                index = index if index != self.token_vocab.UNK else self.META_UNK
                self.tok2idx[special_token] = self.multibucket.add([index])
            for token, _ in self.sorted_counts(self.token_vocab.counts):
                self.tok2idx[token] = self.multibucket.add(tok2idxs[token])
        self._idx2tok = {idx: tok for tok, idx in self.tok2idx.iteritems()}
        self._idx2tok[0] = self[self.PAD]
        return
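
A minimal, self-contained sketch of the bucketing idea behind the snippet above: tokens are grouped by the length of their subtoken-index sequence so that each bucket only needs to be padded to a single maximum length. `toy_compute_splits` and `toy_bucket` are illustrative stand-ins (quantile splits, plain dicts), not the `Bucketer` / `Multibucket` API used in the code.

def toy_compute_splits(lengths, n_buckets):
    # Pick bucket boundaries at evenly spaced quantiles of the length
    # distribution (the real Bucketer optimizes its splits instead).
    ordered = sorted(lengths)
    splits = set()
    for i in range(1, n_buckets + 1):
        idx = min(len(ordered) - 1, (i * len(ordered)) // n_buckets - 1)
        splits.add(ordered[idx])
    return sorted(splits)

def toy_bucket(tok2idxs, splits):
    # Place each token in the smallest bucket whose boundary fits its length.
    buckets = {split: [] for split in splits}
    for token, idxs in tok2idxs.items():
        for split in splits:
            if len(idxs) <= split:
                buckets[split].append((token, idxs))
                break
    return buckets

tok2idxs = {'a': [3], 'apple': [3, 17, 17, 9, 5], 'banana': [4, 3, 6, 3, 6, 3]}
splits = toy_compute_splits([len(v) for v in tok2idxs.values()], 2)
print(toy_bucket(tok2idxs, splits))
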
Example #2
    def index_tokens(self):
        """Bucket each token's per-subvocabulary subtoken indices into the multibucket."""

        n_buckets = self.n_buckets
        tok2idxs = {
            token: [vocab.subtoken_indices(token) for vocab in self]
            for token in self.token_vocab.counts
        }
        with Bucketer.from_configurable(self,
                                        self.n_buckets,
                                        name='bucketer-%s' %
                                        self.name) as bucketer:
            splits = bucketer.compute_splits(
                len(indices[0]) for indices in tok2idxs.values())
            bucketer.plot()
        with self.multibucket.open(splits, depth=len(self)):
            for index, special_token in enumerate(self.special_tokens):
                self.tok2idx[special_token] = self.multibucket.add([[index] *
                                                                    len(self)])
            for token, _ in self.sorted_counts(self.token_vocab.counts):
                indices = tok2idxs[token]
                sequence = [[
                    indices[i][j] for i in xrange(len(indices))
                    if j < len(indices[i])
                ] for j in xrange(len(indices[0]))]
                self.tok2idx[token] = self.multibucket.add(sequence)
        return
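
The nested comprehension at the end of this variant transposes a ragged list of per-subvocabulary index lists into per-position groups, dropping any subvocabulary whose sequence is shorter. A standalone illustration with made-up index values (the numbers are arbitrary):

# indices[i] holds the subtoken indices that subvocabulary i assigns to one token
indices = [[10, 11, 12],   # subvocabulary 0
           [20, 21]]       # subvocabulary 1 (shorter sequence)

sequence = [[indices[i][j] for i in range(len(indices))
             if j < len(indices[i])]
            for j in range(len(indices[0]))]

print(sequence)  # [[10, 20], [11, 21], [12]]
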
Example #3
    def index_tokens(self):
        """Associate an index with each token."""
        self._tok2idx = {}
        tok2idxs = {
            token: self.subtoken_indices(token)
            for token in self.token_vocab.counts
        }

        # Compute split points for n buckets based on word length
        with Bucketer.from_configurable(self,
                                        self.n_buckets,
                                        name='bucketer-%s' %
                                        self.name) as bucketer:
            #print("subtok.py - idx tok - " , (len(indices) for indices in tok2idxs.values()))
            splits = bucketer.compute_splits(
                len(indices) for indices in tok2idxs.values())

        # Fill the n buckets with tokens according to their length and produce the lookup tables
        with self.multibucket.open(splits):
            for index, special_token in enumerate(
                    self.token_vocab.special_tokens):
                index = index if index != self.token_vocab.UNK else self.META_UNK
                self.tok2idx[special_token] = self.multibucket.add([index])
            for token, _ in self.sorted_counts(self.token_vocab.counts):
                self.tok2idx[token] = self.multibucket.add(tok2idxs[token])
        self._idx2tok = {idx: tok for tok, idx in self.tok2idx.iteritems()}
        self._idx2tok[0] = self[self.PAD]
        return
Example #4
 def __init__(self, vocabs, *args, **kwargs):
   """Build per-vocab multibuckets, bucket every sentence by length, and index all tokens."""
   # nlp_model = Parser
   nlp_model = kwargs.pop('nlp_model', None)
   #print ("---dataset.py---\n",nlp_model)
   super(Dataset, self).__init__(*args, **kwargs)
   
   self._vocabs = vocabs
   self._multibuckets = [Multibucket.from_configurable(vocab, name='%s-%s'%(self.name, vocab.name)) for vocab in self.vocabs]
   
   if nlp_model is not None:
     self._nlp_model = nlp_model.from_configurable(self, name=self.name)
   else:
     self._nlp_model = None
   #print ("---dataset.py---after\n",nlp_model)
   with Bucketer.from_configurable(self, self.n_buckets, name='bucketer-%s'%self.name) as bucketer:
     splits = bucketer.compute_splits(len(sent) for sent in self.iterfiles())
     for i in xrange(len(splits)):
       splits[i] += 1
   for multibucket, vocab in self.iteritems():
     multibucket.open(splits, depth=vocab.depth)
   for sent in self.iterfiles():
     for multibucket, vocab in self.iteritems():
       tokens = [line[vocab.conll_idx] for line in sent]
       idxs = [vocab.ROOT] + [vocab.index(token) for token in tokens]
       multibucket.add(idxs, tokens)
   for multibucket in self:
     multibucket.close()
   self._multibucket = Multibucket.from_dataset(self)
   return
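
A rough sketch of why the constructor adds one to every split: the splits are computed from raw sentence lengths, but each stored index sequence is prepended with a ROOT index, so every bucket must hold one extra position. The `ROOT` value, the toy sentences, and the `hash`-based stand-in for `vocab.index` are assumptions for illustration only.

ROOT = 1
sentences = [['The', 'cat', 'sat'], ['Hi'], ['A', 'dog', 'ran', 'fast']]

# splits derived from raw sentence lengths...
splits = sorted({len(sent) for sent in sentences})
# ...then widened by one so the prepended ROOT index still fits
splits = [split + 1 for split in splits]

indexed = [[ROOT] + [hash(tok) % 100 for tok in sent] for sent in sentences]
assert all(any(len(seq) <= split for split in splits) for seq in indexed)
print(splits)
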
Example #5
        return self._splits != value

    def __enter__(self):
        self._len2cnt = {}
        self._lengths = []
        self._counts = []
        self._splits = []
        self._lidxs = []
        return self

    def __exit__(self, exception_type, exception_value, traceback):
        if exception_type is not None:
            raise exception_type(exception_value)
        return True


#***************************************************************
if __name__ == '__main__':
    """"""

    from parser import Configurable
    from parser.misc.bucketer import Bucketer

    import numpy as np
    from scipy.stats import truncnorm
    with Bucketer(5) as bucketer:
        print(
            bucketer.compute_splits([[0] *
                                     np.int(truncnorm(0, 10, scale=5).rvs())
                                     for _ in xrange(1000)]))
        bucketer.plot()
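
To see what the bucketing in these snippets buys, here is a toy comparison of total padding with a single bucket versus several, using quantile splits as a stand-in heuristic (the real `Bucketer.compute_splits` chooses its splits differently):

import random

def padding_cost(lengths, splits):
    # Total pad positions when every sequence is padded up to the smallest
    # bucket boundary that can hold it.
    return sum(min(s for s in splits if s >= n) - n for n in lengths)

random.seed(0)
lengths = [random.randint(1, 50) for _ in range(1000)]
ordered = sorted(lengths)

one_bucket = [max(lengths)]
quantiles = sorted({ordered[(i * len(ordered)) // 5 - 1] for i in range(1, 6)})

print('1 bucket :', padding_cost(lengths, one_bucket))
print('5 buckets:', padding_cost(lengths, quantiles))
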