def open(self, maxlens, depth=None):
    """Open one Bucket per entry in *maxlens* and build the length→bucket map.

    Args:
        maxlens: iterable of maximum sequence lengths, one per bucket
            (assumed strictly increasing — TODO confirm with callers).
        depth: forwarded unchanged to each Bucket's ``open()``.

    Returns:
        self, for fluent chaining.
    """
    self._indices = [(0, 0)]
    self._buckets = []
    self._len2idx = {}
    prevlen = -1
    for idx, maxlen in enumerate(maxlens):
        # One Bucket per maxlen; bucket names are "<multibucket-name>-<idx>".
        bucket = Bucket.from_configurable(self,
                                          embed_model=self.embed_model,
                                          name='%s-%d' % (self.name, idx))
        self._buckets.append(bucket.open(maxlen, depth=depth))
        # Every length in (prevlen, maxlen] resolves to this bucket index.
        self._len2idx.update(zip(range(prevlen+1, maxlen+1),
                                 [idx]*(maxlen-prevlen)))
        prevlen = maxlen
    return self
def from_dataset(cls, dataset, *args, **kwargs):
    """Construct a multibucket whose indices and buckets mirror *dataset*.

    Args:
        dataset: iterable of multibuckets; also passed to ``from_configurable``
            and to ``Bucket.from_dataset``.

    Returns:
        The new multibucket, with ``_indices`` and ``_buckets`` populated.
    """
    multibucket = cls.from_configurable(dataset, *args, **kwargs)
    indices = []
    for multibucket_ in dataset:
        indices.append(multibucket_.indices)
    # NOTE(review): all multibuckets in the dataset are assumed to share the
    # same bucketing. The original sanity check was disabled:
    #   for i in xrange(1, len(indices)):
    #     assert np.equal(indices[0].astype(int), indices[i].astype(int)).all()
    # The no-op loop left behind after commenting it out has been removed.
    # ``multibucket_`` intentionally refers to the last multibucket iterated.
    multibucket._indices = np.array(multibucket_.indices)
    buckets = [Bucket.from_dataset(dataset, i, *args, **kwargs)
               for i in xrange(len(multibucket_))]
    multibucket._buckets = buckets
    if dataset.verbose:
        for bucket in multibucket:
            print('Bucket {name} is {shape}'.format(
                name=bucket.name,
                shape=ctext(' x '.join(str(x) for x in bucket.indices.shape),
                            'bright_blue')))
    return multibucket
def from_dataset(cls, dataset, *args, **kwargs):
    """Construct a multibucket whose indices and buckets mirror *dataset*.

    Args:
        dataset: iterable of multibuckets; also forwarded to
            ``from_configurable`` and ``Bucket.from_dataset``.

    Returns:
        The new multibucket, with ``_indices`` and ``_buckets`` populated.
    """
    multibucket = cls.from_configurable(dataset, *args, **kwargs)
    collected = []
    for multibucket_ in dataset:
        collected.append(multibucket_.indices)
    # Batches are built by assigning each sentence the id of its batch and its
    # relative position inside that batch (e.g. sentence 1 at 4:5 means corpus
    # sentence 1 was placed in batch 4 at position 5).
    # for i in xrange(1, len(collected)):
    #   assert np.equal(collected[0].astype(int), collected[i].astype(int)).all()
    multibucket._indices = np.array(multibucket_.indices)
    multibucket._buckets = [Bucket.from_dataset(dataset, i, *args, **kwargs)
                            for i in xrange(len(multibucket_))]
    if dataset.verbose:
        for bucket in multibucket:
            shape_str = ' x '.join(str(dim) for dim in bucket.indices.shape)
            print('Bucket {name} is {shape}'.format(
                name=bucket.name,
                shape=ctext(shape_str, 'bright_blue')))
    return multibucket