Example #1
0
 def open(self, maxlens, depth=None):
   """Reset this container and open one bucket per entry of ``maxlens``.

   Args:
     maxlens: iterable of maximum lengths; one bucket is created for each.
     depth: passed through unchanged to every bucket's ``open`` call.

   Returns:
     ``self``, after its indices, buckets and length lookup were rebuilt.
   """

   self._indices = [(0,0)]
   self._buckets = []
   self._len2idx = {}
   next_len = 0
   for bucket_idx, upper in enumerate(maxlens):
     new_bucket = Bucket.from_configurable(
       self,
       embed_model=self.embed_model,
       name='%s-%d' % (self.name, bucket_idx))
     self._buckets.append(new_bucket.open(upper, depth=depth))
     # Every length above the previous maximum, up to and including this
     # one, maps to the bucket that was just created.
     for length in range(next_len, upper+1):
       self._len2idx[length] = bucket_idx
     next_len = upper + 1
   return self
Example #2
0
 def from_dataset(cls, dataset, *args, **kwargs):
   """Build a multibucket from the multibuckets yielded by ``dataset``.

   A new object is created through ``cls.from_configurable``; its indices are
   copied from the last multibucket produced by iterating ``dataset`` and one
   ``Bucket`` is built (via ``Bucket.from_dataset``) per entry of that
   multibucket.

   Args:
     dataset: iterable of multibuckets. It is also handed to
       ``cls.from_configurable`` and ``Bucket.from_dataset`` and must expose
       a ``verbose`` attribute.
     *args: forwarded to ``cls.from_configurable`` and ``Bucket.from_dataset``.
     **kwargs: forwarded to the same two calls.

   Returns:
     The newly built multibucket.

   Raises:
     ValueError: if iterating ``dataset`` yields nothing.
   """

   multibucket = cls.from_configurable(dataset, *args, **kwargs)
   # Only the last multibucket of the dataset serves as the template.
   # NOTE: a check that every multibucket in the dataset has identical
   # indices used to live here but had been disabled; it is not enforced.
   template = None
   for template in dataset:
     pass
   if template is None:
     # Fail with a clear message instead of an unbound-variable error.
     raise ValueError('Cannot build a multibucket from an empty dataset')
   multibucket._indices = np.array(template.indices)
   # ``range`` (not ``xrange``) so this runs on both Python 2 and Python 3.
   multibucket._buckets = [Bucket.from_dataset(dataset, i, *args, **kwargs) for i in range(len(template))]
   if dataset.verbose:
     for bucket in multibucket:
       print('Bucket {name} is {shape}'.format(name=bucket.name, shape=ctext(' x '.join(str(x) for x in bucket.indices.shape), 'bright_blue')))
   return multibucket
Example #3
0
 def from_dataset(cls, dataset, *args, **kwargs):
   """Build a multibucket from the multibuckets yielded by ``dataset``.

   A new object is created through ``cls.from_configurable``; its indices are
   copied from the last multibucket produced by iterating ``dataset`` and one
   ``Bucket`` is built (via ``Bucket.from_dataset``) per entry of that
   multibucket.

   Args:
     dataset: iterable of multibuckets. It is also handed to
       ``cls.from_configurable`` and ``Bucket.from_dataset`` and must expose
       a ``verbose`` attribute.
     *args: forwarded to ``cls.from_configurable`` and ``Bucket.from_dataset``.
     **kwargs: forwarded to the same two calls.

   Returns:
     The newly built multibucket.

   Raises:
     ValueError: if iterating ``dataset`` yields nothing.
   """

   multibucket = cls.from_configurable(dataset, *args, **kwargs)
   # Original author's note (translated from French; not verifiable from this
   # method alone): the batches are built here by assigning each sentence the
   # id of its batch and its relative id inside that batch, e.g. "sentence 1
   # at 4:5" means corpus sentence 1 placed in batch 4 at position 5.
   #
   # Only the last multibucket of the dataset serves as the template; a
   # former check that all multibuckets share identical indices is disabled.
   last_seen = None
   for last_seen in dataset:
     pass
   if last_seen is None:
     # Fail with a clear message instead of an unbound-variable error.
     raise ValueError('Cannot build a multibucket from an empty dataset')
   multibucket._indices = np.array(last_seen.indices)
   # ``range`` (not ``xrange``) so this runs on both Python 2 and Python 3.
   multibucket._buckets = [Bucket.from_dataset(dataset, i, *args, **kwargs) for i in range(len(last_seen))]
   if dataset.verbose:
     for bucket in multibucket:
       print('Bucket {name} is {shape}'.format(name=bucket.name, shape=ctext(' x '.join(str(x) for x in bucket.indices.shape), 'bright_blue')))
   return multibucket