Example #1
class Dataset(object):
    # Class name taken from the eval_func example below, which constructs
    # Dataset(FLAGS.inputs_file, FLAGS.reference_file, FLAGS.vocab_file).
    def __init__(self, inputs_file, reference_file, vocab_file):
        with tf.io.gfile.GFile(inputs_file) as f:
            records = f.read().split("\n")
            inputs = [record.strip() for record in records]
            if not inputs[-1]:
                inputs.pop()

        self.ref_lines = tokenizer.native_to_unicode(
            tf.io.gfile.GFile(reference_file).read()).strip().splitlines()

        subtokenizer = Subtokenizer(vocab_file)
        self.batch = []
        token_lens = []
        for i, line in enumerate(inputs):
            enc = subtokenizer.encode(line, add_eos=True)
            token_lens.append((i, len(enc)))

        sorted_by_token_input_lens = sorted(token_lens,
                                            key=lambda x: x[1],
                                            reverse=True)

        sorted_inputs = [None] * len(sorted_by_token_input_lens)
        sorted_keys = [0] * len(sorted_by_token_input_lens)

        lines = []
        for i, (index, _) in enumerate(sorted_by_token_input_lens):
            sorted_inputs[i] = inputs[index]
            sorted_keys[index] = i
            enc = subtokenizer.encode(sorted_inputs[i], add_eos=True)
            lines.append([enc])
        # Re-order the encoded lines back to the original input order so that
        # predictions line up with self.ref_lines.
        for i in sorted_keys:
            self.batch.append(lines[i])
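
A minimal usage sketch for the class above; the file paths are placeholders, not values from the original example:

# Hypothetical paths; the real example passes FLAGS.inputs_file, etc.
ds = Dataset('newstest2014.en', 'newstest2014.de', 'vocab.ende.32768')
# ds.batch holds the subtoken-encoded inputs in original line order,
# ds.ref_lines the reference translations read from reference_file.
print(len(ds.batch), len(ds.ref_lines))
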
def input_generator_ws():
    """Read and sort lines from the file sorted by decreasing length based on word counts.
  Args:
    filename: String name of file to read inputs from.
  Returns:
    Sorted list of inputs, and dictionary mapping original index->sorted index
    of each element.
  """
    with tf.io.gfile.GFile(FLAGS.file) as f:
        records = f.read().split("\n")
        inputs = [record.strip() for record in records]
        if not inputs[-1]:
            inputs.pop()

    batch = []

    subtokenizer = Subtokenizer(FLAGS.vocab_file)

    input_lens = [(i, len(line.split())) for i, line in enumerate(inputs)]
    sorted_input_lens = sorted(input_lens, key=lambda x: x[1], reverse=True)

    sorted_inputs = [None] * len(sorted_input_lens)
    sorted_keys = [0] * len(sorted_input_lens)
    for i, (index, _) in enumerate(sorted_input_lens):
        sorted_inputs[i] = inputs[index]
        sorted_keys[index] = i
        enc = subtokenizer.encode(sorted_inputs[i], add_eos=True)
        batch.append(enc)
    return batch, sorted_keys
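
A short usage sketch, assuming FLAGS.file and FLAGS.vocab_file have been set:

# batch is sorted by decreasing word count; sorted_keys[i] is the position of
# original line i inside batch, so results can later be restored to input order.
batch, sorted_keys = input_generator_ws()
first_longest = batch[0]          # encoding of the longest input line
pos_of_line_0 = sorted_keys[0]    # where the first input line ended up
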
def _trim_and_decode(ids):
    """Trim EOS and PAD tokens from ids, and decode to return a string."""
    subtokenizer = Subtokenizer(FLAGS.vocab_file)
    try:
        index = list(ids).index(tokenizer.EOS_ID)
        return subtokenizer.decode(ids[:index])
    except ValueError:  # No EOS found in sequence
        return subtokenizer.decode(ids)
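
A toy illustration of the trimming behaviour; the id values are made up (in the official Transformer tokenizer EOS_ID is 1 and PAD_ID is 0):

ids = [57, 103, 8, 1, 0, 0]      # hypothetical subtoken ids followed by EOS and padding
text = _trim_and_decode(ids)     # decodes only [57, 103, 8]; without an EOS the
                                 # whole sequence would be decoded instead
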
Example #4
def eval_func(infer_graph, iteration=-1):
    if isinstance(infer_graph, tf.compat.v1.GraphDef):
        graph = tf.Graph()
        with graph.as_default():
            tf.import_graph_def(infer_graph, name='')
        infer_graph = graph

    subtokenizer = Subtokenizer(FLAGS.vocab_file)
    input_tensor = infer_graph.get_tensor_by_name('input_tensor:0')
    output_tensor = infer_graph.get_tensor_by_name(
        'model/Transformer/strided_slice_19:0')
    ds = Dataset(FLAGS.inputs_file, FLAGS.reference_file, FLAGS.vocab_file)
    from lpot.data import DATALOADERS
    dataloader = DATALOADERS['tensorflow'](ds,
                                           batch_size=FLAGS.batch_size,
                                           collate_fn=collate_fn)
    config = tf.compat.v1.ConfigProto()
    config.use_per_session_threads = 1
    config.inter_op_parallelism_threads = 1
    sess = tf.compat.v1.Session(graph=infer_graph, config=config)
    time_list = []
    bleu_eval = bleu()
    predictions = []
    labels = []
    warmup = 10
    if iteration != -1:
        assert iteration >= warmup, 'iteration must be at least as large as warmup'
    for idx, (input_data, label) in enumerate(dataloader):
        if idx < iteration or iteration == -1:
            time_start = time.time()
            out = sess.run([output_tensor], {input_tensor: input_data})
            duration = time.time() - time_start
            time_list.append(duration)
            predictions.append(out)
            labels.extend(label)
        else:
            break
    latency = np.array(time_list[warmup:]).mean() / FLAGS.batch_size
    print('Batch size = {}'.format(FLAGS.batch_size))
    print('Latency: {:.3f} ms'.format(latency * 1000))
    print('Throughput: {:.3f} items/sec'.format(1. / latency))

    # only calculate accuracy when running out all predictions
    if iteration == -1:
        decode = []
        # predictions is a list of sess.run outputs; each holds one fetched
        # tensor of shape (batch, sequence length).
        for batch_out in predictions:
            for output in batch_out:
                for otr in output:
                    try:
                        index = list(otr).index(tokenizer.EOS_ID)
                        decode.append(subtokenizer.decode(otr[:index]))
                    except ValueError:  # No EOS found in sequence
                        decode.append(subtokenizer.decode(otr))
        bleu_eval.update(decode, labels)
        print('Accuracy is {:.3f}'.format(bleu_eval.result()))
        return bleu_eval.result()
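
A hypothetical driver sketch; the frozen-graph flag name is an assumption, not part of the original example:

# FLAGS.input_graph is an assumed flag pointing at a frozen GraphDef.
graph_def = tf.compat.v1.GraphDef()
with tf.io.gfile.GFile(FLAGS.input_graph, 'rb') as f:
    graph_def.ParseFromString(f.read())
eval_func(graph_def)                 # full pass: prints latency and BLEU accuracy
eval_func(graph_def, iteration=100)  # timed run only; must cover the 10 warmup steps
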
Example #5
def input_generator_ts(file_path, vocab_file):
    """Read and sort lines based on token count from the file
    sorted by decreasing length based on token sorting.

    Args:
        file_path: String path of file to read
        vocab_file: String path of vocab file
    Returns:
        Sorted list of inputs, and dictionary mapping original index->sorted index
        of each element.
    """
    with tf.io.gfile.GFile(file_path) as f:
        records = f.read().split("\n")
        inputs = [record.strip() for record in records]
        if not inputs[-1]:
            inputs.pop()

    subtokenizer = Subtokenizer(vocab_file)

    batch = []
    token_lens = []
    for i, line in enumerate(inputs):
        enc = subtokenizer.encode(line, add_eos=True)
        token_lens.append((i, len(enc)))

    sorted_by_token_input_lens = sorted(token_lens,
                                        key=lambda x: x[1],
                                        reverse=True)
    sorted_inputs = [None] * len(sorted_by_token_input_lens)
    sorted_keys = [0] * len(sorted_by_token_input_lens)

    for i, (index, _) in enumerate(sorted_by_token_input_lens):
        sorted_inputs[i] = inputs[index]
        sorted_keys[index] = i
        enc = subtokenizer.encode(sorted_inputs[i], add_eos=True)
        batch.append(enc)

    return batch, sorted_keys
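
Usage mirrors input_generator_ws, except sorting is by subtoken count and the paths are passed explicitly; the paths below are placeholders:

batch, sorted_keys = input_generator_ts('newstest2014.en', 'vocab.ende.32768')
# To put model outputs back into the original line order:
# outputs_in_input_order = [outputs[sorted_keys[i]] for i in range(len(outputs))]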