Example #1
 def generator():
     # custom bucketing, load corpus into memory
     corpus = list(x for x in (samples() if callable(samples) else samples))
     lengths = [self.len_of_sent(i) for i in corpus]
     if len(corpus) < 32:
         n_buckets = 1
     else:
         n_buckets = min(self.config.n_buckets, len(corpus))
     buckets = dict(zip(*kmeans(lengths, n_buckets)))  # cluster centroid (average length) -> indices of samples in that bucket
     sizes, buckets = zip(*[
         (size, bucket) for size, bucket in buckets.items()
     ])
     # the number of chunks in each bucket, clipped to the range [1, len(bucket)]
     chunks = [min(len(bucket), max(round(size * len(bucket) / batch_size), 1)) for size, bucket in
               zip(sizes, buckets)]
     range_fn = randperm if shuffle else arange  # random permutation when shuffling, sequential order otherwise
     max_samples_per_batch = self.config.get('max_samples_per_batch', None)
     for i in tolist(range_fn(len(buckets))):
         split_sizes = [(len(buckets[i]) - j - 1) // chunks[i] + 1
                        for j in range(chunks[i])]  # how many sentences in each batch
         for batch_indices in tf.split(range_fn(len(buckets[i])), split_sizes):
             indices = [buckets[i][j] for j in tolist(batch_indices)]
             if max_samples_per_batch:
                 for j in range(0, len(indices), max_samples_per_batch):
                     yield from self.batched_inputs_to_batches(corpus, indices[j:j + max_samples_per_batch],
                                                               shuffle)
             else:
                 yield from self.batched_inputs_to_batches(corpus, indices, shuffle)
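The chunk arithmetic used above (and in the examples that follow) can be tried in isolation. The sketch below is not the library's code: a simple sort-based grouping stands in for the kmeans() helper, batch_size is treated as a rough per-batch token budget, and bucket_batches is a made-up name, but the split-size formula is the same one used above.

 import random

 def bucket_batches(lengths, batch_size, n_buckets=3, shuffle=False):
     # group sample indices into n_buckets by length (stand-in for kmeans)
     order = sorted(range(len(lengths)), key=lambda i: lengths[i])
     per_bucket = (len(order) + n_buckets - 1) // n_buckets
     buckets = [order[i:i + per_bucket] for i in range(0, len(order), per_bucket)]
     sizes = [sum(lengths[i] for i in b) / len(b) for b in buckets]  # average length per bucket
     # the number of chunks per bucket, clipped to the range [1, len(bucket)]
     chunks = [min(len(b), max(round(s * len(b) / batch_size), 1))
               for s, b in zip(sizes, buckets)]
     for b, n_chunks in zip(buckets, chunks):
         if shuffle:
             random.shuffle(b)
         # split the bucket into n_chunks nearly equal batches
         split_sizes = [(len(b) - j - 1) // n_chunks + 1 for j in range(n_chunks)]
         start = 0
         for size in split_sizes:
             yield b[start:start + size]
             start += size

 lengths = [3, 5, 4, 9, 8, 2, 7, 6, 10, 4]  # token counts of ten sentences
 for batch in bucket_batches(lengths, batch_size=12):
     print(batch, [lengths[i] for i in batch])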
Example #2
 def generator():
     # custom bucketing, load corpus into memory
     corpus = list(x for x in (samples() if callable(samples) else samples))
     lengths = [1 + len(i) for i in corpus]
     if len(corpus) < 32:
         n_buckets = 1
     else:
         n_buckets = min(self.config.n_buckets, len(corpus))
     buckets = dict(zip(*kmeans(lengths, n_buckets)))
     sizes, buckets = zip(*[
         (size, bucket) for size, bucket in buckets.items()
     ])
     # the number of chunks in each bucket, clipped to the range [1, len(bucket)]
     chunks = [min(len(bucket), max(round(size * len(bucket) / batch_size), 1)) for size, bucket in
               zip(sizes, buckets)]
     range_fn = randperm if shuffle else arange
     for i in tolist(range_fn(len(buckets))):
         split_sizes = [(len(buckets[i]) - j - 1) // chunks[i] + 1
                        for j in range(chunks[i])]
         for batch_indices in tf.split(range_fn(len(buckets[i])), split_sizes):
             indices = [buckets[i][j] for j in tolist(batch_indices)]
             raw_batch = [[], [], [], []]  # word ids, POS tag ids, arc matrices, relation matrices
             max_len = len(max([corpus[i] for i in indices], key=len))
             for idx in indices:
                 arc = np.zeros((max_len, max_len), dtype=bool)
                 rel = np.zeros((max_len, max_len), dtype=np.int64)
                 for b in raw_batch[:2]:
                     b.append([])
                 for m, cells in enumerate(corpus[idx]):
                     for b, c, v in zip(raw_batch, cells,
                                        [self.form_vocab, self.cpos_vocab]):
                         b[-1].append(v.get_idx_without_add(c))
                     for n, r in zip(cells[2], cells[3]):
                         arc[m, n] = True
                         rid = self.rel_vocab.get_idx_without_add(r)
                         if rid is None:
                             logger.warning(f'Relation OOV: {r} does not exist in the training set')
                             continue
                         rel[m, n] = rid
                 raw_batch[-2].append(arc)
                 raw_batch[-1].append(rel)
             batch = []
             for b, v in zip(raw_batch, [self.form_vocab, self.cpos_vocab]):
                 b = tf.keras.preprocessing.sequence.pad_sequences(b, padding='post',
                                                                   value=v.safe_pad_token_idx,
                                                                   dtype='int64')
                 batch.append(b)
             batch += raw_batch[2:]
             assert len(batch) == 4
             yield (batch[0], batch[1]), (batch[2], batch[3])
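Example #2 packs the gold dependency graph of each sentence into two square matrices: a boolean arc matrix whose row m marks the head position(s) token m attaches to, and an integer rel matrix holding the relation id in the same cell. The toy below only illustrates that layout; the relation ids and the sentence are made up, and root handling is omitted.

 import numpy as np

 rel_ids = {'nsubj': 1, 'obj': 2}          # hypothetical relation ids; 0 means "no arc"
 # "She saw stars": "She" and "stars" both attach to "saw"
 deps = [(0, 1, 'nsubj'), (2, 1, 'obj')]   # (dependent position, head position, relation)

 n = 3                                     # sentence length
 arc = np.zeros((n, n), dtype=bool)        # arc[m, h] is True when token m attaches to head h
 rel = np.zeros((n, n), dtype=np.int64)    # rel[m, h] is the relation id of that arc
 for m, h, r in deps:
     arc[m, h] = True
     rel[m, h] = rel_ids[r]

 print(arc.astype(int))   # [[0 1 0] [0 0 0] [0 1 0]]
 print(rel)               # [[0 1 0] [0 0 0] [0 2 0]]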
Example #3
 def generator():
     # custom bucketing, load corpus into memory
     corpus = list(
         x for x in (samples() if callable(samples) else samples))
     lengths = [1 + len(i) for i in corpus]
     if len(corpus) < 32:
         n_buckets = 1
     else:
         n_buckets = min(self.config.n_buckets, len(corpus))
     buckets = dict(zip(*kmeans(lengths, n_buckets)))
     sizes, buckets = zip(*[(size, bucket)
                            for size, bucket in buckets.items()])
     # the number of chunks in each bucket, clipped to the range [1, len(bucket)]
     chunks = [
         min(len(bucket), max(round(size * len(bucket) / batch_size),
                              1))
         for size, bucket in zip(sizes, buckets)
     ]
     range_fn = randperm if shuffle else arange
     for i in tolist(range_fn(len(buckets))):
         split_sizes = [(len(buckets[i]) - j - 1) // chunks[i] + 1
                        for j in range(chunks[i])]
         for batch_indices in tf.split(range_fn(len(buckets[i])),
                                       split_sizes):
             indices = [buckets[i][j] for j in tolist(batch_indices)]
             raw_batch = [[], [], [], []]  # word ids, POS tag ids, head indices, relation ids
             for idx in indices:
                 for b in raw_batch:
                     b.append([])
                 for cells in corpus[idx]:
                     for b, c, v in zip(raw_batch, cells, [
                             self.form_vocab, self.cpos_vocab, None,
                             self.rel_vocab
                     ]):
                         b[-1].append(
                             v.get_idx_without_add(c) if v else c)
             batch = []
             for b, v in zip(raw_batch, [
                     self.form_vocab, self.cpos_vocab, None,
                     self.rel_vocab
             ]):
                 b = tf.keras.preprocessing.sequence.pad_sequences(
                     b,
                     padding='post',
                     value=v.safe_pad_token_idx if v else 0,
                     dtype='int64')
                 batch.append(b)
             assert len(batch) == 4
             yield (batch[0], batch[1]), (batch[2], batch[3])
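Examples #2 and #3 both rely on tf.keras.preprocessing.sequence.pad_sequences with padding='post' to right-pad the variable-length id lists into a rectangular array. A standalone call looks like this, with 0 standing in for a vocabulary's safe_pad_token_idx:

 import tensorflow as tf

 token_ids = [[4, 7, 2], [5, 3], [8, 1, 6, 9]]   # three sentences of word ids
 padded = tf.keras.preprocessing.sequence.pad_sequences(
     token_ids, padding='post', value=0, dtype='int64')
 print(padded)
 # [[4 7 2 0]
 #  [5 3 0 0]
 #  [8 1 6 9]]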
Example #4
    def __init__(self, lengths, batch_max_tokens, batch_size=None, shuffle=False, n_buckets=1):
        """A bucket sampler which groups samples using KMeans on their lengths.

        Args:
            lengths: Lengths of each sample, usually measured by number of tokens.
            batch_max_tokens: Maximum tokens per batch.
            batch_size: Maximum samples per batch.
            shuffle: ``True`` to shuffle batches. Samples within a batch are not shuffled, since keeping
                    them ordered by length helps speed up RNNs.
            n_buckets: Number of buckets, i.e. the number of KMeans clusters.
        """
        if n_buckets > len(lengths):
            n_buckets = 1
        self.n_buckets = n_buckets
        self.lengths = lengths
        buckets = dict(zip(*kmeans(self.lengths, n_buckets)))
        super().__init__(buckets, batch_max_tokens, batch_size, shuffle)
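All four examples build their buckets with dict(zip(*kmeans(lengths, n_buckets))). Below is a rough, self-contained stand-in for that helper, assuming it runs Lloyd's algorithm on the 1-D lengths and returns (centroids, clusters) so that the zipped dict maps an average length to the indices of the samples in that bucket; the real kmeans() may differ in initialisation and other details.

 def kmeans_1d(lengths, k, n_iter=10):
     # spread the initial centroids evenly over the observed range of lengths
     lo, hi = min(lengths), max(lengths)
     centroids = [lo + (hi - lo) * (i + 0.5) / k for i in range(k)]
     clusters = [[] for _ in range(k)]
     for _ in range(n_iter):
         clusters = [[] for _ in range(k)]
         # assignment step: attach every sample to its nearest centroid
         for idx, length in enumerate(lengths):
             nearest = min(range(k), key=lambda c: abs(length - centroids[c]))
             clusters[nearest].append(idx)
         # update step: move each centroid to the mean of its cluster
         for c, members in enumerate(clusters):
             if members:
                 centroids[c] = sum(lengths[i] for i in members) / len(members)
     # drop empty clusters so every returned bucket is non-empty
     pairs = [(c, m) for c, m in zip(centroids, clusters) if m]
     return [p[0] for p in pairs], [p[1] for p in pairs]

 lengths = [3, 5, 4, 9, 8, 2, 7, 6, 10, 4]
 buckets = dict(zip(*kmeans_1d(lengths, 3)))
 print(buckets)   # {3.25: [0, 2, 5, 9], 6.0: [1, 6, 7], 9.0: [3, 4, 8]}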