Example #1
class Dataset(object):
    # Class name taken from the eval_func example below, which constructs
    # Dataset(FLAGS.inputs_file, FLAGS.reference_file, FLAGS.vocab_file).
    def __init__(self, inputs_file, reference_file, vocab_file):
        with tf.io.gfile.GFile(inputs_file) as f:
            records = f.read().split("\n")
            inputs = [record.strip() for record in records]
            if not inputs[-1]:
                inputs.pop()

        self.ref_lines = tokenizer.native_to_unicode(
            tf.io.gfile.GFile(reference_file).read()).strip().splitlines()

        subtokenizer = Subtokenizer(vocab_file)
        self.batch = []
        token_lens = []
        for i, line in enumerate(inputs):
            enc = subtokenizer.encode(line, add_eos=True)
            token_lens.append((i, len(enc)))

        sorted_by_token_input_lens = sorted(token_lens,
                                            key=lambda x: x[1],
                                            reverse=True)

        sorted_inputs = [None] * len(sorted_by_token_input_lens)
        sorted_keys = [0] * len(sorted_by_token_input_lens)

        lines = []
        for i, (index, _) in enumerate(sorted_by_token_input_lens):
            sorted_inputs[i] = inputs[index]
            sorted_keys[index] = i
            enc = subtokenizer.encode(sorted_inputs[i], add_eos=True)
            lines.append([enc])
        # Re-order the encoded lines back to the original input order so that
        # predictions line up with self.ref_lines.
        for i in sorted_keys:
            self.batch.append(lines[i])
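
A minimal usage sketch for the class above; the file paths are placeholders, not values from the original example:

# Hypothetical paths; the real example passes FLAGS.inputs_file, etc.
ds = Dataset('newstest2014.en', 'newstest2014.de', 'vocab.ende.32768')
# ds.batch holds the subtoken-encoded inputs in original line order,
# ds.ref_lines the reference translations read from reference_file.
print(len(ds.batch), len(ds.ref_lines))
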
def input_generator_ws():
    """Read and sort lines from the file sorted by decreasing length based on word counts.
  Args:
    filename: String name of file to read inputs from.
  Returns:
    Sorted list of inputs, and dictionary mapping original index->sorted index
    of each element.
  """
    with tf.io.gfile.GFile(FLAGS.file) as f:
        records = f.read().split("\n")
        inputs = [record.strip() for record in records]
        if not inputs[-1]:
            inputs.pop()

    batch = []

    subtokenizer = Subtokenizer(FLAGS.vocab_file)

    input_lens = [(i, len(line.split())) for i, line in enumerate(inputs)]
    sorted_input_lens = sorted(input_lens, key=lambda x: x[1], reverse=True)

    sorted_inputs = [None] * len(sorted_input_lens)
    sorted_keys = [0] * len(sorted_input_lens)
    for i, (index, _) in enumerate(sorted_input_lens):
        sorted_inputs[i] = inputs[index]
        sorted_keys[index] = i
        enc = subtokenizer.encode(sorted_inputs[i], add_eos=True)
        batch.append(enc)
    return batch, sorted_keys
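
A short usage sketch, assuming FLAGS.file and FLAGS.vocab_file have been set:

# batch is sorted by decreasing word count; sorted_keys[i] is the position of
# original line i inside batch, so results can later be restored to input order.
batch, sorted_keys = input_generator_ws()
first_longest = batch[0]          # encoding of the longest input line
pos_of_line_0 = sorted_keys[0]    # where the first input line ended up
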
def _trim_and_decode(ids):
    """Trim EOS and PAD tokens from ids, and decode to return a string."""
    subtokenizer = Subtokenizer(FLAGS.vocab_file)
    try:
        index = list(ids).index(tokenizer.EOS_ID)
        return subtokenizer.decode(ids[:index])
    except ValueError:  # No EOS found in sequence
        return subtokenizer.decode(ids)
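
A toy illustration of the trimming behaviour; the id values are made up (in the official Transformer tokenizer EOS_ID is 1 and PAD_ID is 0):

ids = [57, 103, 8, 1, 0, 0]      # hypothetical subtoken ids followed by EOS and padding
text = _trim_and_decode(ids)     # decodes only [57, 103, 8]; without an EOS the
                                 # whole sequence would be decoded instead
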
Example #4
def eval_func(infer_graph, iteration=-1):
    if isinstance(infer_graph, tf.compat.v1.GraphDef):
        graph = tf.Graph()
        with graph.as_default():
            tf.import_graph_def(infer_graph, name='')
        infer_graph = graph

    subtokenizer = Subtokenizer(FLAGS.vocab_file)
    input_tensor = infer_graph.get_tensor_by_name('input_tensor:0')
    output_tensor = infer_graph.get_tensor_by_name(
        'model/Transformer/strided_slice_19:0')
    ds = Dataset(FLAGS.inputs_file, FLAGS.reference_file, FLAGS.vocab_file)
    from lpot.data import DATALOADERS
    dataloader = DATALOADERS['tensorflow'](ds,
                                           batch_size=FLAGS.batch_size,
                                           collate_fn=collate_fn)
    config = tf.compat.v1.ConfigProto()
    config.use_per_session_threads = 1
    config.inter_op_parallelism_threads = 1
    sess = tf.compat.v1.Session(graph=infer_graph, config=config)
    time_list = []
    bleu_eval = bleu()
    predictions = []
    labels = []
    warmup = 10
    if iteration != -1:
        assert iteration >= warmup, 'iteration must be at least as large as warmup'
    for idx, (input_data, label) in enumerate(dataloader):
        if idx < iteration or iteration == -1:
            time_start = time.time()
            out = sess.run([output_tensor], {input_tensor: input_data})
            duration = time.time() - time_start
            time_list.append(duration)
            predictions.append(out)
            labels.extend(label)
        else:
            break
    latency = np.array(time_list[warmup:]).mean() / FLAGS.batch_size
    print('Batch size = {}'.format(FLAGS.batch_size))
    print('Latency: {:.3f} ms'.format(latency * 1000))
    print('Throughput: {:.3f} items/sec'.format(1. / latency))

    # only calculate accuracy when running out all predictions
    if iteration == -1:
        decode = []
        # predictions is a list of sess.run outputs; each holds one fetched
        # tensor of shape (batch, sequence length).
        for batch_out in predictions:
            for output in batch_out:
                for otr in output:
                    try:
                        index = list(otr).index(tokenizer.EOS_ID)
                        decode.append(subtokenizer.decode(otr[:index]))
                    except ValueError:  # No EOS found in sequence
                        decode.append(subtokenizer.decode(otr))
        bleu_eval.update(decode, labels)
        print('Accuracy is {:.3f}'.format(bleu_eval.result()))
        return bleu_eval.result()
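
A hypothetical driver sketch; the frozen-graph flag name is an assumption, not part of the original example:

# FLAGS.input_graph is an assumed flag pointing at a frozen GraphDef.
graph_def = tf.compat.v1.GraphDef()
with tf.io.gfile.GFile(FLAGS.input_graph, 'rb') as f:
    graph_def.ParseFromString(f.read())
eval_func(graph_def)                 # full pass: prints latency and BLEU accuracy
eval_func(graph_def, iteration=100)  # timed run only; must cover the 10 warmup steps
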
Example #5
def input_generator_ts(file_path, vocab_file):
    """Read and sort lines based on token count from the file
    sorted by decreasing length based on token sorting.

    Args:
        file_path: String path of file to read
        vocab_file: String path of vocab file
    Returns:
        Sorted list of inputs, and dictionary mapping original index->sorted index
        of each element.
    """
    with tf.io.gfile.GFile(file_path) as f:
        records = f.read().split("\n")
        inputs = [record.strip() for record in records]
        if not inputs[-1]:
            inputs.pop()

    subtokenizer = Subtokenizer(vocab_file)

    batch = []
    token_lens = []
    for i, line in enumerate(inputs):
        enc = subtokenizer.encode(line, add_eos=True)
        token_lens.append((i, len(enc)))

    sorted_by_token_input_lens = sorted(token_lens,
                                        key=lambda x: x[1],
                                        reverse=True)
    sorted_inputs = [None] * len(sorted_by_token_input_lens)
    sorted_keys = [0] * len(sorted_by_token_input_lens)

    for i, (index, _) in enumerate(sorted_by_token_input_lens):
        sorted_inputs[i] = inputs[index]
        sorted_keys[index] = i
        enc = subtokenizer.encode(sorted_inputs[i], add_eos=True)
        batch.append(enc)

    return batch, sorted_keys
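
Usage mirrors input_generator_ws, except sorting is by subtoken count and the paths are passed explicitly; the paths below are placeholders:

batch, sorted_keys = input_generator_ts('newstest2014.en', 'vocab.ende.32768')
# To put model outputs back into the original line order:
# outputs_in_input_order = [outputs[sorted_keys[i]] for i in range(len(outputs))]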