def refill(batches, fdx, fdy, batch_size, FLAGS, sort_and_shuffle=True):
    """Read clean lines from fdx, pair each with a noised copy, and append
    batches of (noisy, clean) token-id sequences to `batches`.

    fdy is not read in this variant; the noisy input is generated from the
    clean line itself (commented-out lines show the original paired-file code).
    """
    line_pairs = []
    linex = fdx.readline()  # linex, liney = fdx.readline(), fdy.readline()

    while linex:
        y_tokens = tokenize(linex)  # y_tokens are the source of truth
        # Reconstruct the clean string, then corrupt it at the character level.
        orig_str = "".join(reverse_vocab[x] for x in y_tokens)
        noisy_str = add_noise_to_string(orig_str, 0.2 / FLAGS['max_seq_len'],
                                        FLAGS['max_seq_len'])
        x_tokens = nlc_data.sentence_to_token_ids(
            noisy_str, vocab, tokenizer=get_tokenizer(FLAGS))  # x_tokens are the noisy str

        # Keep only pairs that fit within the maximum sequence length.
        if len(x_tokens) < FLAGS['max_seq_len'] and len(y_tokens) < FLAGS['max_seq_len']:
            line_pairs.append((x_tokens, y_tokens))
        if len(line_pairs) == batch_size * 16:
            break
        linex = fdx.readline()  # linex, liney = fdx.readline(), fdy.readline()

    # Sort by source length so each batch contains similarly sized sequences.
    if sort_and_shuffle:
        line_pairs = sorted(line_pairs, key=lambda e: len(e[0]))

    for batch_start in range(0, len(line_pairs), batch_size):
        x_batch, y_batch = zip(*line_pairs[batch_start:batch_start + batch_size])
        # if len(x_batch) < batch_size:
        #     break
        batches.append((x_batch, y_batch))

    if sort_and_shuffle:
        random.shuffle(batches)
    return
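# `add_noise_to_string`, `reverse_vocab`, `vocab`, and `get_tokenizer` are assumed
# to be defined elsewhere in the module (along with `import random` and
# `import string`). A minimal sketch of a character-level noise function with a
# compatible signature (hypothetical, not the actual implementation) could be:
def add_noise_to_string_sketch(s, noise_rate, max_len):
    """Randomly delete, replace, insert, or transpose characters in `s`."""
    chars = list(s)
    out = []
    i = 0
    while i < len(chars):
        if random.random() < noise_rate:
            op = random.choice(['delete', 'replace', 'insert', 'transpose'])
            if op == 'delete':
                i += 1
                continue
            elif op == 'replace':
                out.append(random.choice(string.ascii_lowercase))
                i += 1
                continue
            elif op == 'insert':
                out.append(random.choice(string.ascii_lowercase))
                out.append(chars[i])
                i += 1
                continue
            elif op == 'transpose' and i + 1 < len(chars):
                out.append(chars[i + 1])
                out.append(chars[i])
                i += 2
                continue
        out.append(chars[i])
        i += 1
    return "".join(out[:max_len])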
def tokenize(sent, vocab, depth=FLAGS.num_layers):
    """Convert a sentence to token ids, padded to a multiple of 2**(depth - 1),
    and return column-vector arrays of ids and a 0/1 padding mask."""
    align = pow(2, depth - 1)
    token_ids = nlc_data.sentence_to_token_ids(sent, vocab, get_tokenizer(FLAGS))
    ones = [1] * len(token_ids)
    # Pad so the sequence length is divisible by the alignment factor.
    pad = (align - len(token_ids)) % align
    token_ids += [nlc_data.PAD_ID] * pad
    ones += [0] * pad
    source = np.array(token_ids).reshape([-1, 1])
    mask = np.array(ones).reshape([-1, 1])
    return source, mask
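# Illustrative use of `tokenize` (assumes a toy vocab and a whitespace
# tokenizer; the sentence and shapes below are hypothetical):
#   with FLAGS.num_layers == 3, align == 4, so a 6-token sentence is padded to
#   length 8 and comes back as two (8, 1) arrays:
#     source, mask = tokenize("this sentence has six tokens .", vocab)
#     source.shape  # (8, 1) -- last two entries are nlc_data.PAD_ID
#     mask.T        # [[1, 1, 1, 1, 1, 1, 0, 0]]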