Example #1
0
    def __init__(self, inputs_file, reference_file, vocab_file):
        """Load the inputs and references and pre-encode every input line.

        Args:
            inputs_file: Path of the text file to read input lines from.
            reference_file: Path of the text file to read reference lines
                from.
            vocab_file: Path of the vocabulary file handed to
                ``Subtokenizer``.

        Attributes set:
            ref_lines: Content of ``reference_file`` converted with
                ``tokenizer.native_to_unicode``, stripped and split into
                lines.
            batch: One entry per input line, in file order. Each entry is a
                single-element list wrapping the result of
                ``Subtokenizer.encode(line, add_eos=True)``.
        """
        with tf.io.gfile.GFile(inputs_file) as f:
            inputs = [record.strip() for record in f.read().split("\n")]
        # A trailing newline leaves one empty final record; drop it.
        if not inputs[-1]:
            inputs.pop()

        # Read through a context manager so the reference file handle is
        # closed deterministically instead of being left to the GC.
        with tf.io.gfile.GFile(reference_file) as f:
            reference_text = f.read()
        self.ref_lines = tokenizer.native_to_unicode(
            reference_text).strip().splitlines()

        subtokenizer = Subtokenizer(vocab_file)

        # Each line is encoded exactly once and the batch stays in file
        # order. Sorting by token length and then mapping every entry back
        # to its original position produces this very same order, so no
        # sort is needed here.
        self.batch = [[subtokenizer.encode(line, add_eos=True)]
                      for line in inputs]
def input_generator_ws():
    """Read the lines of ``FLAGS.file`` and encode them by decreasing word count.

    The function takes no arguments: the input path comes from
    ``FLAGS.file`` and the vocabulary path from ``FLAGS.vocab_file``.

    Returns:
        A tuple ``(batch, sorted_keys)``:
          * ``batch`` is a list with the result of
            ``Subtokenizer.encode(line, add_eos=True)`` for every line,
            ordered from the line with the most whitespace-separated words
            to the line with the fewest (ties keep file order).
          * ``sorted_keys`` is a list in which ``sorted_keys[i]`` is the
            position inside ``batch`` of the line that was at index ``i``
            in the file.
    """
    # tf.io.gfile.GFile is the spelling used by the sibling readers; the
    # tf.gfile.Open alias is not part of the TF2 namespace.
    with tf.io.gfile.GFile(FLAGS.file) as f:
        inputs = [record.strip() for record in f.read().split("\n")]
    # A trailing newline leaves one empty final record; drop it.
    if not inputs[-1]:
        inputs.pop()

    subtokenizer = Subtokenizer(FLAGS.vocab_file)

    # Original indices ordered by decreasing word count. sorted() is stable
    # even with reverse=True, so equal-length lines keep their file order.
    order = sorted(range(len(inputs)),
                   key=lambda index: len(inputs[index].split()),
                   reverse=True)

    batch = []
    sorted_keys = [0] * len(inputs)
    for position, index in enumerate(order):
        sorted_keys[index] = position
        batch.append(subtokenizer.encode(inputs[index], add_eos=True))
    return batch, sorted_keys
Example #3
0
def input_generator_ts(file_path, vocab_file):
    """Read the lines of a file and encode them by decreasing token count.

    Args:
        file_path: String path of the file to read.
        vocab_file: String path of the vocab file handed to ``Subtokenizer``.

    Returns:
        A tuple ``(batch, sorted_keys)``:
          * ``batch`` is a list with the result of
            ``Subtokenizer.encode(line, add_eos=True)`` for every line,
            ordered from the longest encoding to the shortest (ties keep
            file order).
          * ``sorted_keys`` is a list in which ``sorted_keys[i]`` is the
            position inside ``batch`` of the line that was at index ``i``
            in the file.
    """
    with tf.io.gfile.GFile(file_path) as f:
        inputs = [record.strip() for record in f.read().split("\n")]
    # A trailing newline leaves one empty final record; drop it.
    if not inputs[-1]:
        inputs.pop()

    subtokenizer = Subtokenizer(vocab_file)

    # Encode every line exactly once; the encodings provide both the sort
    # key (their length) and the returned batch content.
    encoded = [subtokenizer.encode(line, add_eos=True) for line in inputs]

    # Original indices ordered by decreasing token count. sorted() is stable
    # even with reverse=True, so equal-length lines keep their file order.
    order = sorted(range(len(encoded)),
                   key=lambda index: len(encoded[index]),
                   reverse=True)

    sorted_keys = [0] * len(encoded)
    for position, index in enumerate(order):
        sorted_keys[index] = position
    batch = [encoded[index] for index in order]

    return batch, sorted_keys