Example #1
    np_datatype = np.float32
    if args.data_type == "fp16":
        tf_datatype = tf.float16
        np_datatype = np.float16
    use_XLA = args.use_XLA
    beam_search_diversity_rate = args.beam_search_diversity_rate
    sampling_topk = args.sampling_topk
    sampling_topp = args.sampling_topp

    hidden_dim = head_num * size_per_head
    memory_hidden_dim = args.memory_hidden_dim

    decoder_args = TransformerArgument(
        beam_width=beam_width,
        head_num=head_num,
        size_per_head=size_per_head,
        num_layer=num_layer,
        dtype=tf_datatype,
        kernel_init_range=kernel_initializer_range,
        bias_init_range=bias_initializer_range)

    decoding_args = DecodingBeamsearchArgument(vocab_size,
                                               start_of_sentence_id,
                                               end_of_sentence_id, max_seq_len,
                                               decoder_args,
                                               beam_search_diversity_rate)

    decoder_args_2 = copy.deepcopy(decoder_args)  # for beam search
    decoder_args_2.__dict__ = copy.deepcopy(decoder_args.__dict__)
    decoder_args_2.beam_width = 1  # for sampling

    decoding_sampling_args = DecodingSamplingArgument(
        vocab_size, start_of_sentence_id, end_of_sentence_id, max_seq_len,
        decoder_args_2, sampling_topk, sampling_topp)

    if avg_seq_len != -1 and remove_padding == True:
        # This means we use "remove_padding" and set a smaller average sequence length
        sequence_length = np.ones(batch_size) * avg_seq_len

    from_data = np.random.randn(batch_size, max_seq_len, hidden_dim)
    from_tensor = tf.convert_to_tensor(from_data, dtype=tf_datatype)

    attention_mask = build_sequence_mask(sequence_length,
                                         num_heads=head_num,
                                         maximum_length=max_seq_len,
                                         dtype=tf_datatype)

    encoder_notInt8_args = TransformerArgument(beam_width=1,
                                               head_num=head_num,
                                               size_per_head=size_per_head,
                                               num_layer=num_layer,
                                               dtype=tf_datatype,
                                               remove_padding=remove_padding,
                                               int8_mode=0)

    encoder_Int8_v1_args = TransformerArgument(beam_width=1,
                                               head_num=head_num,
                                               size_per_head=size_per_head,
                                               num_layer=num_layer,
                                               dtype=tf_datatype,
                                               remove_padding=remove_padding,
                                               int8_mode=1)

    encoder_Int8_v2_args = TransformerArgument(beam_width=1,
                                               head_num=head_num,
                                               size_per_head=size_per_head,
                                               num_layer=num_layer,
                                               dtype=tf_datatype,
                                               remove_padding=remove_padding,
                                               int8_mode=2)

def sample_model(vocab_file="models/gpt2-vocab.json",
                 bpe_file="models/gpt2-merges.txt",
                 model_name='124M',
                 nsamples=1,
                 batch_size=1,
                 length=12,
                 temperature=1,
                 top_k=4,
                 top_p=0,
                 models_dir='models',
                 data_type='fp32'):
    """Run the sample_model.

    :model_name=124M : String, which model to use.
    :nsamples=1 : Number of samples to return; if 0, continues to
     generate samples indefinitely.
    :batch_size=1 : Number of batches (only affects speed/memory).
    :length=12 : Number of tokens in the generated text; if None, it is
     determined by the model hyperparameters.
    :temperature=1 : Float value controlling randomness in the Boltzmann
     distribution. Lower temperature results in less random completions. As the
     temperature approaches zero, the model becomes deterministic and
     repetitive. Higher temperature results in more random completions.
    :top_k=4 : Integer value controlling diversity. 1 means only 1 word is
     considered for each step (token), resulting in deterministic completions,
     while 40 means 40 words are considered at each step. 0 is a special
     setting meaning no restriction; 40 is generally a good value.
     (A standalone sketch of this sampling logic follows this example.)
    :models_dir : Path to the parent folder containing the model subfolders
     (i.e. contains the <model_name> folder).
    """
    np.random.seed(1)
    tf.set_random_seed(1)

    if data_type == 'fp32':
        tf_data_type = tf.float32
    elif data_type == 'fp16':
        tf_data_type = tf.float16
    else:
        assert False, "data_type must be 'fp32' or 'fp16'"

    models_dir = os.path.expanduser(os.path.expandvars(models_dir))
    vocab_file = os.path.join(models_dir, model_name, 'encoder.json')
    bpe_file = os.path.join(models_dir, model_name, 'vocab.bpe')
    enc = encoder.get_encoder(vocab_file, bpe_file)
    hparams = HParams(n_vocab=0, n_ctx=1024, n_embd=768, n_head=12, n_layer=12)

    with open(os.path.join(models_dir, model_name, 'hparams.json')) as f:
        hparams.override_from_dict(json.load(f))

    if length is None:
        length = hparams.n_ctx
    elif length > hparams.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" %
                         hparams.n_ctx)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(graph=tf.Graph(), config=config) as sess:
        saver = tf.train.import_meta_graph("{}/{}/model.ckpt.meta".format(
            models_dir, model_name))

        lengths = np.random.randint(low=1, high=8, size=batch_size)
        min_start_length = lengths.min()
        max_start_length = lengths.max()
        attention_mask = np.tile(np.tri(min_start_length), (batch_size, 1, 1))

        start_ids = np.ones([batch_size, max_start_length
                             ]) * enc.encoder['<|endoftext|>']
        for i in range(batch_size):
            start_ids[i][0:lengths[i]] = 198
        # Users can put real start ids here; we use '\n' (198) as the start token.

        sess.run(tf.global_variables_initializer())
        print("[INFO] restore the model {}/{}".format(models_dir, model_name))
        saver.restore(sess,
                      ("{}/{}/model.ckpt".format(models_dir, model_name)))

        decoder_args = TransformerArgument(beam_width=1,
                                           head_num=hparams.n_head,
                                           size_per_head=hparams.n_embd //
                                           hparams.n_head,
                                           num_layer=hparams.n_layer,
                                           dtype=tf_data_type,
                                           kernel_init_range=0.00,
                                           bias_init_range=0.00)

        decoding_args = DecodingGpt2Argument(hparams.n_vocab,
                                             enc.encoder['<|endoftext|>'],
                                             enc.encoder['<|endoftext|>'],
                                             length + 2, decoder_args, top_k,
                                             top_p, temperature)

        ckpt_dict = {}
        for var in tf.trainable_variables():
            ckpt_dict[var.name] = var
        decoding_vars = tf.trainable_variables()

        op_output = ft_gpt_op(decoding_vars, decoding_args, batch_size,
                              start_ids, min_start_length, max_start_length,
                              attention_mask)

        generated = 0

        while nsamples == 0 or generated < nsamples:
            op_out = sess.run(op_output)

            for i in range(batch_size):
                generated += 1

                text = enc.decode(op_out[i])
                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
                print(text)
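
The docstring above describes how temperature and top_k shape the token sampling
distribution. The following minimal NumPy sketch illustrates that logic on made-up
logits; sample_token is a hypothetical helper for illustration only and is not part
of the FasterTransformer sample.

import numpy as np

def sample_token(logits, temperature=1.0, top_k=4, top_p=0.0):
    """Pick one token id from a logits vector using temperature, top-k and top-p."""
    logits = np.asarray(logits, dtype=np.float64) / max(temperature, 1e-6)
    probs = np.exp(logits - logits.max())
    probs /= probs.sum()                       # softmax of the scaled logits
    order = np.argsort(-probs)                 # candidate ids, most likely first
    if top_k > 0:
        order = order[:top_k]                  # keep only the k most likely tokens
    if top_p > 0.0:
        keep = np.cumsum(probs[order]) <= top_p
        keep[0] = True                         # always keep at least the best token
        order = order[keep]
    kept = probs[order] / probs[order].sum()   # renormalize the kept probability mass
    return int(np.random.choice(order, p=kept))

# With top_k=1 the choice is greedy and therefore deterministic.
print(sample_token([2.0, 1.0, 0.5, 0.1], temperature=1.0, top_k=1))
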
Example #4
    from_data = np.random.randn(batch_size, seq_len, encoder_hidden_dim)
    from_tensor = tf.convert_to_tensor(from_data, dtype=tf_datatype)
    memory_sequence_length = np.random.randint(1,
                                               max_seq_len + 1,
                                               size=batch_size).astype(
                                                   np.int32)
    embedding_table = np.random.randn(vocab_size, decoder_hidden_dim).astype(
        np_datatype)  # a [vocab_size, decoder_hidden_dim] table

    mask = np.random.randint(2, size=(batch_size, seq_len, seq_len))
    attention_mask = tf.convert_to_tensor(mask, dtype=tf_datatype)

    encoder_args = TransformerArgument(batch_size=batch_size,
                                       beam_width=1,
                                       head_num=encoder_head_num,
                                       size_per_head=encoder_size_per_head,
                                       num_layer=encoder_num_layer,
                                       max_seq_len=max_seq_len,
                                       dtype=tf_datatype)

    decoding_args = DecodingArgument(batch_size=batch_size,
                                     beam_width=beam_width,
                                     head_num=decoder_head_num,
                                     size_per_head=decoder_size_per_head,
                                     num_layer=decoder_num_layer,
                                     max_seq_len=max_seq_len,
                                     vocab_size=vocab_size,
                                     start_id=start_of_sentence_id,
                                     end_id=end_of_sentence_id,
                                     encoder_hidden_dim=encoder_head_num *
                                     encoder_size_per_head,
def encoder_sample(args_dict):
    print("\n=============== Argument ===============")
    for key in args_dict:
        print("{}: {}".format(key, args_dict[key]))
    print("========================================")

    np.random.seed(1)
    tf.set_random_seed(1)

    batch_size = args_dict['batch_size']
    num_layer = args_dict['num_layer']
    max_seq_len = args_dict['max_seq_len']
    avg_seq_len = args_dict['avg_seq_len']
    head_num = args_dict['head_number']
    size_per_head = args_dict['size_per_head']
    tf_datatype = tf.float32
    np_datatype = np.float32
    atol_threshold = 3e-5
    int8_mode = args_dict['int8_mode']
    allow_gemm_test = args_dict['allow_gemm_test'].lower() == "true"
    if args_dict['data_type'] == "fp16":
        tf_datatype = tf.float16
        np_datatype = np.float16
        atol_threshold = 3e-2

    hidden_dim = head_num * size_per_head

    sequence_length = np.random.randint(1, max_seq_len + 1, size=batch_size)
    if avg_seq_len != -1:
        # This means we use "remove_padding" and set a specific average sequence length
        sequence_length = np.ones(batch_size) * avg_seq_len
    else:
        sequence_length = np.ones(batch_size) * (max_seq_len / 2)
    sequence_length = sequence_length.astype(np.int32)

    from_data = np.random.randn(batch_size, max_seq_len, hidden_dim)
    from_tensor = tf.convert_to_tensor(from_data, dtype=tf_datatype)
    
    attention_mask = build_sequence_mask(sequence_length, num_heads=head_num, maximum_length=max_seq_len, dtype=tf_datatype)
    
    encoder_args = TransformerArgument(beam_width=1,
                                       head_num=head_num,
                                       size_per_head=size_per_head,
                                       num_layer=num_layer,
                                       dtype=tf_datatype,
                                       remove_padding=False,
                                       int8_mode=int8_mode,
                                       allow_gemm_test=allow_gemm_test)

    eff_encoder_args = copy.deepcopy(encoder_args)
    eff_encoder_args.remove_padding = True

    tf_encoder_result = tf_encoder(input_tensor=from_tensor,
                                   encoder_args=encoder_args,
                                   attention_mask=attention_mask)

    encoder_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
    encoder_variables_dict = {}
    for v in encoder_vars:
        encoder_variables_dict[v.name] = v
    
    op_encoder_result = op_encoder(inputs=from_tensor,
                                   encoder_args=encoder_args,
                                   attention_mask=attention_mask,
                                   encoder_vars_dict=encoder_variables_dict,
                                   sequence_length=sequence_length)

    eff_encoder_result = op_encoder(inputs=from_tensor,
                                    encoder_args=eff_encoder_args,
                                    attention_mask=attention_mask,
                                    encoder_vars_dict=encoder_variables_dict,
                                    sequence_length=sequence_length)

    '''
    Because FasterTransformer skips some computation for the padding parts,
    the cross-check result would be wrong if we did not mask these parts.
    '''
    tf_encoder_result = tf_encoder_result * tf.expand_dims(tf.sequence_mask(sequence_length, maxlen=max_seq_len, dtype=tf_datatype), axis=-1)
    op_encoder_result = op_encoder_result * tf.expand_dims(tf.sequence_mask(sequence_length, maxlen=max_seq_len, dtype=tf_datatype), axis=-1)
    eff_encoder_result = eff_encoder_result * tf.expand_dims(tf.sequence_mask(sequence_length, maxlen=max_seq_len, dtype=tf_datatype), axis=-1)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
    
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())

        for idx, name in enumerate(encoder_variables_dict):
            print((str(idx) + " " + str(name) + " " +
                   str(encoder_variables_dict[name].shape)) + " " + str(encoder_variables_dict[name].dtype))
            
        print("#################################")
        tf_encoder_result_val = sess.run(tf_encoder_result)
        op_encoder_result_val = sess.run(op_encoder_result)
        eff_encoder_result_val = sess.run(eff_encoder_result)

        cross_check("Encoder TF v.s. FT with tensor input", tf_encoder_result_val, op_encoder_result_val, atol_threshold)
        cross_check("Encoder TF v.s. EFF-FT with tensor input", tf_encoder_result_val, eff_encoder_result_val, atol_threshold)
        
        op_diff = abs(tf_encoder_result_val.reshape([-1]) - op_encoder_result_val.reshape([-1]))
        eff_diff = abs(tf_encoder_result_val.reshape([-1]) - eff_encoder_result_val.reshape([-1]))
        max_diff = max(op_diff.max(), eff_diff.max())

        ite = 50
        def _cond(from_tensor):
            return tf.constant(True)
            
        def _ft_body(from_tensor):
            op_encoder_result = op_encoder(inputs=from_tensor,
                                            encoder_args=encoder_args,
                                            attention_mask=attention_mask,
                                            encoder_vars_dict=encoder_variables_dict,
                                            sequence_length=sequence_length)
            return op_encoder_result

        def _eff_body(from_tensor):
            eff_encoder_result = op_encoder(inputs=from_tensor,
                                            encoder_args=eff_encoder_args,
                                            attention_mask=attention_mask,
                                            encoder_vars_dict=encoder_variables_dict,
                                            sequence_length=sequence_length)
            return eff_encoder_result

        def _tf_body(from_tensor):
            tf_encoder_result = tf_encoder(input_tensor=from_tensor,
                                            encoder_args=encoder_args,
                                            attention_mask=attention_mask)
            return tf_encoder_result

        tf_while_tensor = tf.while_loop(_cond,
                                        _tf_body,
                                        loop_vars=[from_tensor],
                                        back_prop=False,
                                        maximum_iterations=ite)

        ft_while_tensor = tf.while_loop(_cond,
                                        _ft_body,
                                        loop_vars=[from_tensor],
                                        back_prop=False,
                                        maximum_iterations=ite)

        eff_while_tensor = tf.while_loop(_cond,
                                         _eff_body,
                                         loop_vars=[from_tensor],
                                         back_prop=False,
                                         maximum_iterations=ite)

        if args_dict['test_time'] == 1:

            # tf_time = time_test(sess, tf_encoder_result, ite)
            # ft_time = time_test(sess, op_encoder_result, ite)
            # eff_time = time_test(sess, eff_encoder_result, ite)

            # Use a while loop to run 'ite' iterations so that the overheads of memory copies
            # and model preprocessing are amortized; these times are used as the profiling results.
            tf_while_time = time_test(sess, tf_while_tensor, 1) / ite # while_loop has run ite times
            time.sleep(60)
            ft_while_time = time_test(sess, ft_while_tensor, 1) / ite # while_loop has run ite times
            time.sleep(60)
            eff_while_time = time_test(sess, eff_while_tensor, 1) / ite # while_loop has run ite times
            time.sleep(60)
            
            ft_type = args_dict['data_type'].upper()
            if int8_mode != 0:
                ft_type = "INT8-v{}".format(int8_mode)
            
            # print("[INFO] batch_size {} max_seq_len {} precision {} {} layer TF-time     {:6.2f} ms".format(batch_size, max_seq_len, args_dict['data_type'].upper(), num_layer, tf_time))
            # print("[INFO] batch_size {} max_seq_len {} precision {} {} layer FT-OP-time  {:6.2f} ms".format(batch_size, max_seq_len, ft_type, num_layer, ft_time))
            # print("[INFO] batch_size {} max_seq_len {} precision {} {} layer EFF-OP-time {:6.2f} ms".format(batch_size, max_seq_len, ft_type, num_layer, eff_time))

            print("[INFO] batch_size {} max_seq_len {} precision {} {} layer TF-while-time     {:6.2f} ms ( {} iterations)".format(batch_size, max_seq_len, args_dict['data_type'].upper(), num_layer, tf_while_time, ite))
            print("[INFO] batch_size {} max_seq_len {} precision {} {} layer FT-OP-while-time  {:6.2f} ms ( {} iterations)".format(batch_size, max_seq_len, ft_type, num_layer, ft_while_time, ite))
            print("[INFO] batch_size {} max_seq_len {} precision {} {} layer EFF-OP-while-time {:6.2f} ms ( {} iterations)".format(batch_size, max_seq_len, ft_type, num_layer, eff_while_time, ite))


        if args_dict['thread_num'] > 1:
            # Multi-threading demonstration
            thread_list = []
            thread_num = args_dict['thread_num']
            def run():
                ft_while_time = time_test(sess, ft_while_tensor, 1) / ite # while_loop has run ite times
                print("[INFO] batch_size {} max_seq_len {} {} layer FT-OP-while-time {:6.2f} ms with {} threads".format(batch_size,
                    max_seq_len, num_layer, ft_while_time, thread_num))

            for i in range(thread_num):
                thread_list.append(threading.Thread(target=run, name="RunFT"))
            for t in thread_list:
                t.start()
            for t in thread_list:
                t.join()

        return max_diff
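
encoder_sample above compares the TensorFlow and FasterTransformer outputs through a
cross_check helper that is defined elsewhere in the sample utilities. Assuming it only
reports whether the two outputs agree within the given absolute tolerance, a stand-in
could look like this (the exact message format is an assumption):

import numpy as np

def cross_check(name, tf_result, op_result, atol_threshold):
    """Hypothetical stand-in: element-wise comparison of two numpy arrays."""
    diff = np.abs(tf_result.reshape([-1]) - op_result.reshape([-1]))
    passed = bool((diff <= atol_threshold).all())
    print("[INFO] {}: max diff {:.3e}, atol {:.1e} -> {}".format(
        name, diff.max(), atol_threshold, "PASS" if passed else "FAIL"))
    return passed
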
    size_per_head = args.size_per_head
    num_layer = args.num_layer
    hidden_dim = head_num * size_per_head
    memory_hidden_dim = args.memory_hidden_dim
    vocab_size = args.vocab_size
    tf_datatype = tf.float32
    np_datatype = np.float32
    if args.data_type == "fp16":
        tf_datatype = tf.float16
        np_datatype = np.float16

    decoder_args = TransformerArgument(
        beam_width=beam_width,
        head_num=head_num,
        size_per_head=size_per_head,
        num_layer=num_layer,
        dtype=tf_datatype,
        kernel_init_range=kernel_initializer_range,
        bias_init_range=bias_initializer_range,
        fuse_qkv=True,
        memory_hidden_dim=memory_hidden_dim)

    decoding_args = DecodingBeamsearchArgument(vocab_size,
                                               start_of_sentence_id,
                                               end_of_sentence_id, max_seq_len,
                                               decoder_args, 0.0)

    embedding_table = np.random.randn(vocab_size, hidden_dim).astype(
        np_datatype) * 0.01  # a [vocab_size, hidden_dim] table
    embedding_table = tf.convert_to_tensor(embedding_table)
    memory, memory_sequence_length = generate_encoder_result(
        batch_size, max_seq_len, memory_hidden_dim, tf_datatype)

    sequence_length = np.random.randint(1, max_seq_len + 1,
                                        size=batch_size).astype(np.int32)
    if avg_seq_len != -1 and remove_padding == True:
        # This means we use "remove_padding" and set a smaller average sequence length
        sequence_length = np.ones(batch_size) * avg_seq_len

    from_data = np.random.randn(batch_size, max_seq_len, hidden_dim)
    from_tensor = tf.convert_to_tensor(from_data, dtype=tf_datatype)

    attention_mask = build_sequence_mask(sequence_length,
                                         num_heads=head_num,
                                         maximum_length=max_seq_len,
                                         dtype=tf_datatype)

    encoder_args = TransformerArgument(beam_width=1,
                                       head_num=head_num,
                                       size_per_head=size_per_head,
                                       num_layer=num_layer,
                                       dtype=tf_datatype,
                                       remove_padding=remove_padding)

    tf_encoder_result = tf_encoder(input_tensor=from_tensor,
                                   encoder_args=encoder_args,
                                   attention_mask=attention_mask)

    encoder_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
    encoder_variables_dict = {}
    for v in encoder_vars:
        encoder_variables_dict[v.name] = v

    op_encoder_result = op_encoder(inputs=from_tensor,
                                   encoder_args=encoder_args,
                                   attention_mask=attention_mask,
                                   encoder_vars_dict=encoder_variables_dict,
                                   sequence_length=sequence_length)

    memory_sequence_length = np.random.randint(1,
                                               max_seq_len + 1,
                                               size=batch_size).astype(
                                                   np.int32)
    memory_sequence_length[np.random.randint(0, batch_size)] = max_seq_len
    embedding_table = np.random.randn(vocab_size, decoder_hidden_dim).astype(
        np_datatype
    ) * initializer_range  # a [vocab_size, decoder_hidden_dim] table

    attention_mask = build_sequence_mask(memory_sequence_length,
                                         num_heads=encoder_head_num,
                                         maximum_length=max_seq_len,
                                         dtype=tf_datatype)

    encoder_args = TransformerArgument(beam_width=1,
                                       head_num=encoder_head_num,
                                       size_per_head=encoder_size_per_head,
                                       num_layer=encoder_num_layer,
                                       dtype=tf_datatype,
                                       remove_padding=remove_padding)

    decoder_args = TransformerArgument(
        beam_width=beam_width,
        head_num=decoder_head_num,
        size_per_head=decoder_size_per_head,
        num_layer=decoder_num_layer,
        dtype=tf_datatype,
        kernel_init_range=kernel_initializer_range,
        bias_init_range=bias_initializer_range,
        fuse_qkv=False)

    decoding_args = DecodingBeamsearchArgument(vocab_size,
                                               start_of_sentence_id,
Example #9
def encoder_sample(args_dict):
    print("\n=============== Argument ===============")
    for key in args_dict:
        print("{}: {}".format(key, args_dict[key]))
    print("========================================")

    np.random.seed(1)
    tf.set_random_seed(1)

    batch_size = args_dict['batch_size']
    num_layer = args_dict['num_layer']
    max_seq_len = args_dict['max_seq_len']
    avg_seq_len = args_dict['avg_seq_len']
    head_num = args_dict['head_number']
    size_per_head = args_dict['size_per_head']
    remove_padding = args_dict['remove_padding'].lower() == "true"
    tf_datatype = tf.float32
    np_datatype = np.float32
    atol_threshold = 3e-5
    int8_mode = args_dict['int8_mode']
    allow_gemm_test = args_dict['allow_gemm_test'].lower() == "true"
    if args_dict['data_type'] == "fp16":
        tf_datatype = tf.float16
        np_datatype = np.float16
        atol_threshold = 3e-2

    hidden_dim = head_num * size_per_head

    sequence_length = np.random.randint(1, max_seq_len + 1, size=batch_size)
    if avg_seq_len != -1 and remove_padding == True:
        # This means we use "remove_padding" and set a smaller average sequence length
        sequence_length = np.ones(batch_size) * avg_seq_len
    else:
        sequence_length = np.ones(batch_size) * (max_seq_len / 2)
    sequence_length = sequence_length.astype(np.int32)

    from_data = np.random.randn(batch_size, max_seq_len, hidden_dim)
    from_tensor = tf.convert_to_tensor(from_data, dtype=tf_datatype)

    attention_mask = build_sequence_mask(sequence_length,
                                         num_heads=head_num,
                                         maximum_length=max_seq_len,
                                         dtype=tf_datatype)

    encoder_args = TransformerArgument(beam_width=1,
                                       head_num=head_num,
                                       size_per_head=size_per_head,
                                       num_layer=num_layer,
                                       dtype=tf_datatype,
                                       remove_padding=remove_padding,
                                       int8_mode=int8_mode,
                                       allow_gemm_test=allow_gemm_test)

    tf_encoder_result = tf_encoder(input_tensor=from_tensor,
                                   encoder_args=encoder_args,
                                   attention_mask=attention_mask)

    encoder_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
    encoder_variables_dict = {}
    for v in encoder_vars:
        encoder_variables_dict[v.name] = v

    op_encoder_result = op_encoder(inputs=from_tensor,
                                   encoder_args=encoder_args,
                                   attention_mask=attention_mask,
                                   encoder_vars_dict=encoder_variables_dict,
                                   sequence_length=sequence_length)
    '''
    Because FasterTransformer skips some computation for the padding parts,
    the cross-check result would be wrong if we did not mask these parts.
    '''
    tf_encoder_result = tf_encoder_result * tf.expand_dims(tf.sequence_mask(
        sequence_length, maxlen=max_seq_len, dtype=tf_datatype),
                                                           axis=-1)
    op_encoder_result = op_encoder_result * tf.expand_dims(tf.sequence_mask(
        sequence_length, maxlen=max_seq_len, dtype=tf_datatype),
                                                           axis=-1)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        run_metadata = tf.RunMetadata()

        for idx, name in enumerate(encoder_variables_dict):
            print((str(idx) + " " + str(name) + " " +
                   str(encoder_variables_dict[name].shape)) + " " +
                  str(encoder_variables_dict[name].dtype))

        print("#################################")
        tf_encoder_result_val = sess.run(tf_encoder_result)
        op_encoder_result_val = sess.run(op_encoder_result)

        cross_check("Encoder TF v.s. FT with tensor input",
                    tf_encoder_result_val, op_encoder_result_val,
                    atol_threshold)
        '''
            Use numpy arrays as the inputs of the FasterTransformer OP.

            This method requires more time for the op initialization (especially for FP16),
            but the inference time is slightly faster than using tensors as input.
        '''
        encoder_variables_dict_2 = {}
        for var, val in zip(encoder_vars, sess.run(encoder_vars)):
            encoder_variables_dict_2[var.name] = val

        # op_encoder_result_2 = op_encoder(inputs=from_tensor,
        #                                 encoder_args=encoder_args,
        #                                 attention_mask=attention_mask,
        #                                 encoder_vars_dict=encoder_variables_dict_2,
        #                                 sequence_length=sequence_length)
        # op_encoder_result_val_2 = sess.run(op_encoder_result_2)
        # cross_check("Encoder TF v.s. FT with numpy input", tf_encoder_result_val,
        #             op_encoder_result_val_2, atol_threshold)

        if args_dict['test_time'] == 1:

            ite = 50
            tf_time = time_test(sess, tf_encoder_result, ite)
            op_time = time_test(sess, op_encoder_result, ite)
            # op_time_2 = time_test(sess, op_encoder_result_2, ite)

            print(
                "[INFO] batch_size {} max_seq_len {} {} layer TF-time {:6.2f} ms"
                .format(batch_size, max_seq_len, num_layer, tf_time))
            print(
                "[INFO] batch_size {} max_seq_len {} {} layer FT-OP-tensor-time {:6.2f} ms"
                .format(batch_size, max_seq_len, num_layer, op_time))
            # print("[INFO] batch_size {} max_seq_len {} {} layer FT-OP-numpy-time {:6.2f} ms".format(batch_size, max_seq_len, num_layer, op_time_2))

        return (tf_encoder_result_val.reshape([-1]) -
                op_encoder_result_val.reshape([-1])).max()
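
Both encoder samples build their attention masks with build_sequence_mask, whose
implementation is not shown in these snippets. A plausible sketch, assuming the mask is
the outer product of the padding mask with itself (so attention is only allowed where
both the query and the key position are real tokens), is:

import tensorflow as tf

def build_sequence_mask(sequence_length, num_heads=None, maximum_length=None,
                        dtype=tf.float32):
    """Hypothetical sketch of the mask helper used by the encoder samples."""
    # [batch, max_len]: 1 for real tokens, 0 for padding positions
    pad_mask = tf.sequence_mask(sequence_length, maxlen=maximum_length, dtype=dtype)
    # outer product -> [batch, max_len, max_len]
    mask = tf.expand_dims(pad_mask, 2) * tf.expand_dims(pad_mask, 1)
    # add a head dimension -> [batch, num_heads (or 1), max_len, max_len]
    mask = tf.expand_dims(mask, 1)
    if num_heads is not None:
        mask = tf.tile(mask, [1, num_heads, 1, 1])
    return mask
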
Example #10
def sample_model(model_name='124M',
                 nsamples=1,
                 batch_size=1,
                 length=12,
                 temperature=1,
                 top_k=4,
                 top_p=0,
                 models_dir='models',
                 data_type='fp32'):
    """Run the sample_model.

    :model_name=124M : String, which model to use.
    :nsamples=1 : Number of samples to return; if 0, continues to
     generate samples indefinitely.
    :batch_size=1 : Number of batches (only affects speed/memory).
    :length=12 : Number of tokens in the generated text; if None, it is
     determined by the model hyperparameters.
    :temperature=1 : Float value controlling randomness in the Boltzmann
     distribution. Lower temperature results in less random completions. As the
     temperature approaches zero, the model becomes deterministic and
     repetitive. Higher temperature results in more random completions.
    :top_k=4 : Integer value controlling diversity. 1 means only 1 word is
     considered for each step (token), resulting in deterministic completions,
     while 40 means 40 words are considered at each step. 0 is a special
     setting meaning no restriction; 40 is generally a good value.
    :models_dir : Path to the parent folder containing the model subfolders
     (i.e. contains the <model_name> folder).
    """

    models_dir = os.path.expanduser(os.path.expandvars(models_dir))
    enc = encoder.get_encoder(model_name, models_dir)
    hparams = HParams(n_vocab=0, n_ctx=1024, n_embd=768, n_head=12, n_layer=12)

    with open(os.path.join(models_dir, model_name, 'hparams.json')) as f:
        hparams.override_from_dict(json.load(f))

    if length is None:
        length = hparams.n_ctx
    elif length > hparams.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" %
                         hparams.n_ctx)

    # start_ids has shape [batch_size, start_len].flatten()
    # start_ids = [15496, 11, 616, 3290, 468,
    #             15496, 11, 616, 3290, 469,
    #             15496, 11, 616, 3290, 470,
    #             15496, 11, 616, 3290, 471]
    start_ids = [enc.encoder['<|endoftext|>'] for i in range(batch_size)]

    with tf.Session(graph=tf.Graph()) as sess:
        saver = tf.train.import_meta_graph("{}/{}/model.ckpt.meta".format(
            models_dir, model_name))
        print("[INFO] restore the model {}/{}".format(models_dir, model_name))
        saver.restore(sess,
                      ("{}/{}/model.ckpt".format(models_dir, model_name)))

        if data_type == 'fp32':
            tf_data_type = tf.float32
        elif data_type == 'fp16':
            tf_data_type = tf.float16
        else:
            assert False, "data_type must be 'fp32' or 'fp16'"

        decoder_args = TransformerArgument(beam_width=1,
                                           head_num=hparams.n_head,
                                           size_per_head=hparams.n_embd //
                                           hparams.n_head,
                                           num_layer=hparams.n_layer,
                                           dtype=tf_data_type,
                                           kernel_init_range=0.00,
                                           bias_init_range=0.00)

        decoding_args = DecodingGpt2Argument(hparams.n_vocab,
                                             enc.encoder['<|endoftext|>'],
                                             enc.encoder['<|endoftext|>'],
                                             length + 2, decoder_args, top_k,
                                             top_p, temperature)

        ckpt_dict = {}
        for var in tf.trainable_variables():
            ckpt_dict[var.name] = var
        decoding_vars = tf.trainable_variables()

        op_output = ft_gpt2_op(decoding_vars, decoding_args, batch_size,
                               start_ids)

        generated = 0

        while nsamples == 0 or generated < nsamples:
            print("[INFO] FT op time: {}".format(
                time_test(sess, op_output, iterations=5, warmup=True)))
            op_out = sess.run(op_output)

            for i in range(batch_size):
                generated += 1

                text = enc.decode(op_out[i][1:])
                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
                print(text)
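
The benchmarks in these samples rely on a time_test helper, e.g.
time_test(sess, op_output, iterations=5, warmup=True), which is defined elsewhere in the
sample utilities. A minimal stand-in consistent with how it is called here, returning the
average wall-clock time per run in milliseconds, might be:

import time

def time_test(sess, target, iterations=50, warmup=True):
    """Hypothetical stand-in: average time of sess.run(target) in milliseconds."""
    if warmup:
        for _ in range(5):        # warm-up runs to exclude one-time initialization cost
            sess.run(target)
    start = time.time()
    for _ in range(iterations):
        sess.run(target)
    return (time.time() - start) * 1000.0 / iterations
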
def translate_sample(args_dict):
    print("\n=============== Argument ===============")
    for key in args_dict:
        print("{}: {}".format(key, args_dict[key]))
    print("========================================")

    np.random.seed(1)
    tf.set_random_seed(1)
    random.seed(1)

    start_of_sentence_id = 1
    end_of_sentence_id = 2

    kernel_initializer_range = 0.02
    bias_initializer_range = 0.02

    batch_size = args_dict['batch_size']
    beam_width = args_dict['beam_width']
    max_seq_len = args_dict['max_seq_len']
    encoder_head_num = args_dict['encoder_head_number']
    encoder_size_per_head = args_dict['encoder_size_per_head']
    decoder_head_num = args_dict['decoder_head_number']
    decoder_size_per_head = args_dict['decoder_size_per_head']
    encoder_num_layer = args_dict['encoder_num_layer']
    decoder_num_layer = args_dict['decoder_num_layer']
    encoder_hidden_dim = encoder_head_num * encoder_size_per_head
    decoder_hidden_dim = decoder_head_num * decoder_size_per_head
    tf_datatype = tf.float32
    if args_dict['data_type'] == "fp16":
        tf_datatype = tf.float16
    beam_search_diversity_rate = args_dict['beam_search_diversity_rate']
    sampling_topk = args_dict['sampling_topk']
    sampling_topp = args_dict['sampling_topp']

    source_inputter = WordEmbedder("source_vocabulary",
                                   embedding_size=encoder_hidden_dim,
                                   dtype=tf_datatype)
    target_inputter = WordEmbedder("target_vocabulary",
                                   embedding_size=decoder_hidden_dim,
                                   dtype=tf_datatype)
    inputter = ExampleInputter(source_inputter, target_inputter)
    inputter.initialize({
        "source_vocabulary": args_dict['source_vocabulary'],
        "target_vocabulary": args_dict['target_vocabulary']
    })
    vocab_size = target_inputter.vocabulary_size
    source_file = args_dict['source']
    is_remove_padding = args_dict['remove_padding'].lower() == "true"

    encoder_args = TransformerArgument(
        beam_width=1,
        head_num=encoder_head_num,
        size_per_head=encoder_size_per_head,
        num_layer=encoder_num_layer,
        dtype=tf_datatype,
        kernel_init_range=kernel_initializer_range,
        bias_init_range=bias_initializer_range,
        remove_padding=is_remove_padding)

    decoder_args = TransformerArgument(
        beam_width=beam_width,
        head_num=decoder_head_num,
        size_per_head=decoder_size_per_head,
        num_layer=decoder_num_layer,
        dtype=tf_datatype,
        kernel_init_range=kernel_initializer_range,
        bias_init_range=bias_initializer_range,
        memory_hidden_dim=encoder_head_num * encoder_size_per_head)

    decoder_args_2 = copy.deepcopy(decoder_args)  # for beam search
    decoder_args_2.__dict__ = copy.deepcopy(decoder_args.__dict__)
    decoder_args_2.beam_width = 1  # for sampling

    decoding_beamsearch_args = DecodingBeamsearchArgument(
        vocab_size, start_of_sentence_id, end_of_sentence_id, max_seq_len,
        decoder_args, beam_search_diversity_rate)

    decoding_sampling_args = DecodingSamplingArgument(
        vocab_size, start_of_sentence_id, end_of_sentence_id, max_seq_len,
        decoder_args_2, sampling_topk, sampling_topp)

    with tf.variable_scope("transformer/encoder", reuse=tf.AUTO_REUSE):
        dataset = inputter.make_inference_dataset(source_file, batch_size)
        iterator = dataset.make_initializable_iterator()
        source = iterator.get_next()
        source_embedding = source_inputter.make_inputs(source)
        source_embedding = tf.cast(source_embedding, tf_datatype)
        memory_sequence_length = source["length"]

        tf_encoder_result = tf_encoder_opennmt(
            source_embedding,
            encoder_args,
            sequence_length=memory_sequence_length)

        encoder_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
        encoder_variables_dict = {}
        for v in encoder_vars:
            encoder_variables_dict[v.name] = v

        ft_encoder_result = ft_encoder_opennmt(
            inputs=source_embedding,
            encoder_args=encoder_args,
            encoder_vars_dict=encoder_variables_dict,
            sequence_length=memory_sequence_length)

    tf_encoder_result = tf.reshape(tf_encoder_result,
                                   tf.shape(source_embedding))
    ft_encoder_result = tf.reshape(ft_encoder_result,
                                   tf.shape(source_embedding))

    with tf.variable_scope("transformer/decoder", reuse=tf.AUTO_REUSE):
        target_inputter.build()
    target_vocab_rev = target_inputter.vocabulary_lookup_reverse()

    ### TF BeamSearch Decoding ###
    tf_beamsearch_target_ids, tf_beamsearch_target_length, _, _, _ = tf_beamsearch_decoding(
        tf_encoder_result,
        memory_sequence_length,
        target_inputter.embedding,
        decoding_beamsearch_args,
        decoder_type=0)

    # tf_beamsearch_target_tokens: [batch_size, beam_width, seq_len]
    tf_beamsearch_target_tokens = target_vocab_rev.lookup(
        tf.cast(tf_beamsearch_target_ids, tf.int64))
    tf_beamsearch_target_length = tf.minimum(
        tf_beamsearch_target_length + 1,
        tf.shape(tf_beamsearch_target_ids)[-1])
    ### end of TF BeamSearch Decoding ###

    ### TF Sampling Decoding ###
    tf_sampling_target_ids, tf_sampling_target_length = tf_sampling_decoding(
        tf_encoder_result,
        memory_sequence_length,
        target_inputter.embedding,
        decoding_sampling_args,
        decoder_type=0)

    # tf_sampling_target_tokens: [batch_size, seq_len]
    tf_sampling_target_tokens = target_vocab_rev.lookup(
        tf.cast(tf_sampling_target_ids, tf.int64))
    tf_sampling_target_length = tf.minimum(
        tf_sampling_target_length + 1,
        tf.shape(tf_sampling_target_ids)[-1])
    ### end of TF Sampling Decoding ###

    ### OP BeamSearch Decoder ###
    op_decoder_beamsearch_target_ids, op_decoder_beamsearch_target_length, _, _, _ = tf_beamsearch_decoding(
        tf_encoder_result,
        memory_sequence_length,
        target_inputter.embedding,
        decoding_beamsearch_args,
        decoder_type=1)

    # op_decoder_beamsearch_target_tokens: [batch_size, beam_width, seq_len]
    op_decoder_beamsearch_target_tokens = target_vocab_rev.lookup(
        tf.cast(op_decoder_beamsearch_target_ids, tf.int64))
    op_decoder_beamsearch_target_length = tf.minimum(
        op_decoder_beamsearch_target_length + 1,
        tf.shape(op_decoder_beamsearch_target_ids)[-1])
    ### end of OP BeamSearch Decoder ###

    ### OP Sampling Decoder ###
    op_decoder_sampling_target_ids, op_decoder_sampling_target_length = tf_sampling_decoding(
        tf_encoder_result,
        memory_sequence_length,
        target_inputter.embedding,
        decoding_sampling_args,
        decoder_type=1)

    op_decoder_sampling_target_tokens = target_vocab_rev.lookup(
        tf.cast(op_decoder_sampling_target_ids, tf.int64))
    op_decoder_sampling_target_length = tf.minimum(
        op_decoder_sampling_target_length + 1,
        tf.shape(op_decoder_sampling_target_ids)[-1])
    ### end of OP Sampling Decoder ###

    ### Prepare Decoding variables for FasterTransformer  ###
    all_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
    decoder_var_start_id = 0

    while all_vars[decoder_var_start_id].name.find(
            "transformer/decoder") == -1:
        decoder_var_start_id += 1
    decoder_variables = all_vars[
        decoder_var_start_id +
        1:]  # decoder_var_start_id + 1 means skip the embedding table

    ### OP BeamSearch Decoding ###
    op_beamsearch_target_ids, op_beamsearch_target_length, _, _, _ = op_beamsearch_decoding(
        ft_encoder_result, memory_sequence_length, target_inputter.embedding,
        decoder_variables, decoding_beamsearch_args)

    op_beamsearch_target_tokens = target_vocab_rev.lookup(
        tf.cast(op_beamsearch_target_ids, tf.int64))
    op_beamsearch_target_length = tf.minimum(
        op_beamsearch_target_length + 1,
        tf.shape(op_beamsearch_target_ids)[-1])
    ### end of OP BeamSearch Decoding ###

    ### OP Sampling Decoding ###
    op_sampling_target_ids, op_sampling_target_length = op_sampling_decoding(
        ft_encoder_result, memory_sequence_length, target_inputter.embedding,
        decoder_variables, decoding_sampling_args)

    op_sampling_target_tokens = target_vocab_rev.lookup(
        tf.cast(op_sampling_target_ids, tf.int64))
    op_sampling_target_length = tf.minimum(
        op_sampling_target_length + 1,
        tf.shape(op_sampling_target_ids)[-1])
    ### end of OP Sampling Decoding ###

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    time_args = args_dict['test_time']

    class TranslationResult(object):
        def __init__(self, token_op, length_op, name):
            self.token_op = token_op
            self.length_op = length_op
            self.name = name
            self.file_name = name + ".txt"

            self.token_list = []
            self.length_list = []
            self.batch_num = 0
            self.execution_time = 0.0  # seconds
            self.sentence_num = 0
            self.bleu_score = None

    translation_result_list = []

    if time_args.find("0") != -1:
        translation_result_list.append(
            TranslationResult(tf_beamsearch_target_tokens,
                              tf_beamsearch_target_length,
                              "tf-decoding-beamsearch"))
    if time_args.find("1") != -1:
        translation_result_list.append(
            TranslationResult(op_decoder_beamsearch_target_tokens,
                              op_decoder_beamsearch_target_length,
                              "op-decoder-beamsearch"))
    if time_args.find("2") != -1:
        translation_result_list.append(
            TranslationResult(op_beamsearch_target_tokens,
                              op_beamsearch_target_length,
                              "op-decoding-beamsearch"))
    if time_args.find("3") != -1:
        translation_result_list.append(
            TranslationResult(tf_sampling_target_tokens,
                              tf_sampling_target_length,
                              "tf-decoding-sampling"))
    if time_args.find("4") != -1:
        translation_result_list.append(
            TranslationResult(op_decoder_sampling_target_tokens,
                              op_decoder_sampling_target_length,
                              "op-decoder-sampling"))
    if time_args.find("5") != -1:
        translation_result_list.append(
            TranslationResult(op_sampling_target_tokens,
                              op_sampling_target_length,
                              "op-decoding-sampling"))

    float_var_list = []
    half_var_list = []
    for var in tf.global_variables()[:-1]:
        if var.dtype.base_dtype == tf.float32:
            float_var_list.append(var)
        elif var.dtype.base_dtype == tf.float16:
            half_var_list.append(var)

    if (len(translation_result_list) == 0):
        print("[WARNING] No put any test cases.")

    cuda_profiler = cudaProfiler()
    cuda_profiler.start()
    for i in range(len(translation_result_list)):
        with tf.Session(config=config) as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.tables_initializer())
            sess.run(iterator.initializer)

            if (len(float_var_list) > 0):
                float_saver = tf.train.Saver(float_var_list)
                float_saver.restore(sess, "translation/ckpt/model.ckpt-500000")
            if (len(half_var_list) > 0):
                half_saver = tf.train.Saver(half_var_list)
                half_saver.restore(sess,
                                   "translation/ckpt/fp16_model.ckpt-500000")

            t1 = datetime.now()
            while True:
                try:
                    batch_tokens, batch_length = sess.run([
                        translation_result_list[i].token_op,
                        translation_result_list[i].length_op
                    ])
                    for tokens, length in zip(batch_tokens, batch_length):
                        if translation_result_list[i].name.find(
                                "beamsearch") != -1:
                            translation_result_list[i].token_list.append(
                                b" ".join(tokens[0][:length[0] -
                                                    2]).decode("UTF-8"))
                        else:
                            translation_result_list[i].token_list.append(
                                b" ".join(tokens[:length - 2]).decode("UTF-8"))
                    translation_result_list[i].batch_num += 1
                except tf.errors.OutOfRangeError:
                    break
            t2 = datetime.now()
            time_sum = (t2 - t1).total_seconds()
            translation_result_list[i].execution_time = time_sum

            with open(translation_result_list[i].file_name, "w") as file_b:
                for s in translation_result_list[i].token_list:
                    file_b.write(s)
                    file_b.write("\n")

            ref_file_path = "./.ref_file.txt"
            os.system("head -n %d %s > %s" %
                      (len(translation_result_list[i].token_list),
                       args_dict['target'], ref_file_path))
            translation_result_list[i].bleu_score = bleu_score(
                translation_result_list[i].file_name, ref_file_path)
            os.system("rm {}".format(ref_file_path))

            time.sleep(60)
    cuda_profiler.stop()

    for t in translation_result_list:
        print(
            "[INFO] {} translates {} batches taking {:.2f} sec to translate {} tokens, BLEU score: {:.2f}, {:.0f} tokens/sec."
            .format(t.name, t.batch_num, t.execution_time,
                    t.bleu_score.sys_len, t.bleu_score.score,
                    t.bleu_score.sys_len / t.execution_time))

    return translation_result_list
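
translate_sample reads .score and .sys_len from whatever the bleu_score helper returns;
that helper is not shown in this snippet. sacrebleu's corpus_bleu result exposes exactly
those attributes, so a plausible stand-in is the following sketch; the one-sentence-per-line
file convention is an assumption:

import sacrebleu

def bleu_score(hyp_file, ref_file):
    """Hypothetical stand-in: corpus BLEU of hyp_file against ref_file."""
    with open(hyp_file) as f:
        hypotheses = [line.strip() for line in f]
    with open(ref_file) as f:
        references = [line.strip() for line in f]
    # corpus_bleu takes the hypothesis list and a list of reference streams
    return sacrebleu.corpus_bleu(hypotheses, [references])
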