Python Subtokenizer.encode Examples

Programming Language: Python

Namespace/Package Name: utils.tokenizer

Class/Type: Subtokenizer

Method/Function: encode

Examples at hotexamples.com: 3

Python Subtokenizer.encode – 3 examples found. These are the top-rated real-world Python examples of utils.tokenizer.Subtokenizer.encode, extracted from open source projects. You can rate examples to help us improve their quality.

Frequently Used Methods

Show Hide

Subtokenizer(5)

encode(3)

decode(2)

Example #1

Show file

File: main.py Project: vuiseng9/lpot

    def __init__(self, inputs_file, reference_file, vocab_file):
        """Load the inputs and references, and pre-encode every input line.

        Args:
            inputs_file: Path of the text file holding one input record per
                line; a single trailing empty record is dropped.
            reference_file: Path of the reference text file.
            vocab_file: Path handed to ``Subtokenizer`` to build the
                subtokenizer.

        After construction:
            self.ref_lines: Lines of the reference file after
                ``tokenizer.native_to_unicode`` and ``strip()``.
            self.batch: One single-element list per input line, holding
                ``subtokenizer.encode(line, add_eos=True)``, in the same
                order as the lines of ``inputs_file``.
        """
        with tf.io.gfile.GFile(inputs_file) as f:
            inputs = [record.strip() for record in f.read().split("\n")]
        # str.split always yields at least one element, so [-1] is safe even
        # for an empty file; a final "\n" produces one empty trailing record.
        if not inputs[-1]:
            inputs.pop()

        # Read the references inside a context manager so the handle is
        # closed deterministically instead of being left to the GC.
        with tf.io.gfile.GFile(reference_file) as ref_f:
            reference_text = ref_f.read()
        self.ref_lines = tokenizer.native_to_unicode(
            reference_text).strip().splitlines()

        subtokenizer = Subtokenizer(vocab_file)

        # Entries stay in file order. Sorting by token length and then
        # mapping back through the inverse permutation produces exactly this
        # list, but costs a second encode() call per line.
        self.batch = [[subtokenizer.encode(line, add_eos=True)]
                      for line in inputs]

Example #2

Show file

File: infer_ab.py Project: rodrigocamachoIntelLabs/models

def input_generator_ws():
    """Read the lines of ``FLAGS.file`` and encode them, longest first by word count.

    The file named by ``FLAGS.file`` is split on newlines, each record is
    stripped, and a single trailing empty record is dropped. Lines are ordered
    by decreasing number of whitespace-separated words (ties keep file order)
    and encoded with a ``Subtokenizer`` built from ``FLAGS.vocab_file``.

    Returns:
        A tuple ``(batch, sorted_keys)`` where ``batch`` is the list of
        ``encode(line, add_eos=True)`` results in sorted order, and
        ``sorted_keys`` is a list in which ``sorted_keys[original_index]`` is
        the position of that line inside ``batch``.
    """
    with tf.gfile.Open(FLAGS.file) as f:
        raw_text = f.read()
    inputs = [chunk.strip() for chunk in raw_text.split("\n")]
    if not inputs[-1]:
        inputs.pop()

    subtokenizer = Subtokenizer(FLAGS.vocab_file)

    # Original indices ordered by descending word count; sorted() is stable,
    # so lines with equal counts keep their relative file order.
    order = sorted(range(len(inputs)),
                   key=lambda idx: len(inputs[idx].split()),
                   reverse=True)

    # Inverse permutation: original index -> rank in the sorted batch.
    sorted_keys = [0] * len(order)
    for rank, original_index in enumerate(order):
        sorted_keys[original_index] = rank

    batch = [subtokenizer.encode(inputs[original_index], add_eos=True)
             for original_index in order]
    return batch, sorted_keys

Example #3

Show file

def input_generator_ts(file_path, vocab_file):
    """Read lines from a file and return their encodings, longest first.

    The file is split on newlines, each record is stripped, and a single
    trailing empty record is dropped. Every line is encoded once with
    ``Subtokenizer.encode(line, add_eos=True)``, and the encodings are ordered
    by decreasing length (ties keep file order).

    Args:
        file_path: String path of file to read.
        vocab_file: String path of vocab file passed to ``Subtokenizer``.

    Returns:
        A tuple ``(batch, sorted_keys)`` where ``batch`` is the list of
        encodings in sorted order, and ``sorted_keys`` is a list in which
        ``sorted_keys[original_index]`` is the position of that line inside
        ``batch``.
    """
    with tf.io.gfile.GFile(file_path) as f:
        inputs = [record.strip() for record in f.read().split("\n")]
    # str.split always yields at least one element, so [-1] is safe even for
    # an empty file; a final "\n" produces one empty trailing record.
    if not inputs[-1]:
        inputs.pop()

    subtokenizer = Subtokenizer(vocab_file)

    # Encode each line exactly once and reuse the result both for measuring
    # its length and for the returned batch.
    encoded = [subtokenizer.encode(line, add_eos=True) for line in inputs]

    # Original indices ordered by descending encoded length; sorted() is
    # stable, so equal-length lines keep their relative file order.
    order = sorted(range(len(encoded)),
                   key=lambda idx: len(encoded[idx]),
                   reverse=True)

    # Inverse permutation: original index -> rank in the sorted batch.
    sorted_keys = [0] * len(order)
    for rank, original_index in enumerate(order):
        sorted_keys[original_index] = rank

    batch = [encoded[original_index] for original_index in order]
    return batch, sorted_keys