def __call__(self, markups):
    """Lazily yield batches built from ``markups``.

    Pipeline: ``self.items`` -> ``chop_drop`` by ``self.seq_len`` ->
    ``self.shuffle`` -> ``chop`` by ``self.batch_size`` -> ``self.batch``.
    """
    records = self.items(markups)
    # NOTE(review): chop_drop presumably discards an incomplete trailing
    # sequence -- confirm against its definition.
    sequences = self.shuffle(chop_drop(records, self.seq_len))
    yield from map(self.batch, chop(sequences, self.batch_size))
def __call__(self, texts):
    """Lazily yield batches built from ``texts``.

    Pipeline: ``self.items`` -> ``self.seqs`` -> ``self.shuffle`` ->
    ``chop`` by ``self.batch_size`` -> ``self.batch``.
    """
    records = self.items(texts)
    sequences = self.shuffle(self.seqs(records))
    yield from map(self.batch, chop(sequences, self.batch_size))
def __call__(self, markups):
    """Lazily yield batches of encoded markups, skipping over-long ones.

    Markups are ordered with ``self.sort``, converted with ``self.item``,
    filtered by ``len(item.word_ids) <= self.seq_len``, grouped by
    ``self.batch_size`` with ``chop`` and passed to ``self.batch``.
    """
    ordered = self.sort(markups)
    encoded = map(self.item, ordered)
    # 0.02% of sentences are longer than 128, just drop them
    kept = (
        item for item in encoded
        if len(item.word_ids) <= self.seq_len
    )
    yield from map(self.batch, chop(kept, self.batch_size))
def __call__(self, items):
    """Lazily yield model inputs built from ``items``.

    Each element is converted with ``self.item``; the results are grouped
    by ``self.batch_size`` with ``chop`` and each group is passed to
    ``self.input``.
    """
    converted = map(self.item, items)
    yield from map(self.input, chop(converted, self.batch_size))
def test_chop():
    """``chop`` splits ten numbers into threes, keeping the short tail."""
    expected = [
        [0, 1, 2],
        [3, 4, 5],
        [6, 7, 8],
        [9],
    ]
    actual = list(chop(range(10), 3))
    assert expected == actual
def __call__(self, markups):
    """Lazily yield batches of encoded markups.

    Markups are ordered with ``self.sort``, converted with ``self.item``,
    grouped by ``self.batch_size`` with ``chop`` and passed to
    ``self.batch``.
    """
    ordered = self.sort(markups)
    encoded = map(self.item, ordered)
    yield from map(self.batch, chop(encoded, self.batch_size))