Example #1
    def __init__(self):
        super(UnifiedTransformer, self).__init__()

        self.model = UnifiedTransformerLMHeadModel.from_pretrained(
            'unified_transformer-12L-cn')
        self.tokenizer = UnifiedTransformerTokenizer.from_pretrained(
            'unified_transformer-12L-cn')
        self._interactive_mode = False
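The excerpt above begins inside the class body, so it is not valid on its own. A minimal self-contained sketch of the surrounding declaration, assuming the wrapper subclasses paddle.nn.Layer (the excerpt does not show the real base class):

import paddle.nn as nn
from paddlenlp.transformers import (UnifiedTransformerLMHeadModel,
                                    UnifiedTransformerTokenizer)


class UnifiedTransformer(nn.Layer):
    def __init__(self):
        super(UnifiedTransformer, self).__init__()
        # Pre-trained Chinese dialogue model and its matching tokenizer.
        self.model = UnifiedTransformerLMHeadModel.from_pretrained(
            'unified_transformer-12L-cn')
        self.tokenizer = UnifiedTransformerTokenizer.from_pretrained(
            'unified_transformer-12L-cn')
        self._interactive_mode = False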
Example #2
def infer(args):
    paddle.set_device(args.device)
    set_seed(args.seed)

    model = UnifiedTransformerLMHeadModel.from_pretrained(
        args.model_name_or_path)
    tokenizer = UnifiedTransformerTokenizer.from_pretrained(
        args.model_name_or_path)

    test_ds = load_dataset('duconv', splits='test_1')
    test_ds, test_data_loader = create_data_loader(test_ds, tokenizer, args,
                                                   'test')

    model.eval()
    total_time = 0.0
    start_time = time.time()
    pred_responses = []
    for step, inputs in enumerate(test_data_loader, 1):
        input_ids, token_type_ids, position_ids, attention_mask, seq_len = inputs
        output = model.generate(input_ids=input_ids,
                                token_type_ids=token_type_ids,
                                position_ids=position_ids,
                                attention_mask=attention_mask,
                                seq_len=seq_len,
                                max_length=args.max_dec_len,
                                min_length=args.min_dec_len,
                                decode_strategy=args.decode_strategy,
                                temperature=args.temperature,
                                top_k=args.top_k,
                                top_p=args.top_p,
                                num_beams=args.num_beams,
                                length_penalty=args.length_penalty,
                                early_stopping=args.early_stopping,
                                num_return_sequences=args.num_return_sequences,
                                use_fp16_decoding=args.use_fp16_decoding,
                                use_faster=args.faster)

        total_time += (time.time() - start_time)
        if step % args.logging_steps == 0:
            print('step %d - %.3fs/step' %
                  (step, total_time / args.logging_steps))
            total_time = 0.0

        ids, scores = output
        results = select_response(ids, scores, tokenizer, args.max_dec_len,
                                  args.num_return_sequences)
        pred_responses.extend(results)

        start_time = time.time()

    with open(args.output_path, 'w', encoding='utf-8') as fout:
        for response in pred_responses:
            fout.write(response + '\n')
    print('\nSaved inference results to: %s' % args.output_path)

    target_responses = [example['response'] for example in test_ds]
    calc_bleu_and_distinct(pred_responses, target_responses)
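The infer() function above reads everything from an argument namespace. A minimal sketch of such a namespace, with illustrative values only (create_data_loader may require additional fields, such as a batch size, that are not shown here):

from argparse import Namespace

args = Namespace(
    device='gpu',
    seed=2021,
    model_name_or_path='unified_transformer-12L-cn',
    max_dec_len=64,
    min_dec_len=1,
    decode_strategy='sampling',
    temperature=1.0,
    top_k=5,
    top_p=1.0,
    num_beams=1,
    length_penalty=1.0,
    early_stopping=False,
    num_return_sequences=1,
    use_fp16_decoding=False,
    faster=False,
    logging_steps=100,
    output_path='./predict.txt',
)
# infer(args)  # also requires the create_data_loader/select_response helpers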
Example #3
def infer(args):
    model_name = 'plato-xl'
    model = UnifiedTransformerLMHeadModel.from_pretrained(model_name)
    tokenizer = UnifiedTransformerTokenizer.from_pretrained(model_name)

    context = [
        "Hi , Becky , what's up ?",
        "Not much , except that my mother-in-law is driving me up the wall .",
        "What's the problem ?"
    ]

    data = tokenizer.dialogue_encode(history=context,
                                     add_start_token_as_response=True,
                                     return_length=True,
                                     return_role_ids=args.use_role,
                                     position_style=args.position_style)

    for name in data:
        if name == "attention_mask":
            data[name] = paddle.to_tensor(data[name], dtype="float32").reshape(
                [1, 1, 41, 41])
        else:
            data[name] = paddle.to_tensor(data[name],
                                          dtype="int64").reshape([1, -1])

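    # Run 200 generation passes: the first 100 warm up the GPU and are not
    # timed; only the last 100 contribute to the average reported below.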
    for i in range(200):
        if i == 100:
            paddle.device.cuda.synchronize()
            start = time.time()

        outputs, _ = model.generate(input_ids=data['input_ids'],
                                    token_type_ids=data['token_type_ids'],
                                    position_ids=data['position_ids'],
                                    attention_mask=data['attention_mask'],
                                    role_ids=data.get('role_ids', None),
                                    seq_len=data['seq_len'],
                                    max_length=args.max_out_len,
                                    min_length=args.min_out_len,
                                    decode_strategy=args.decoding_strategy,
                                    top_k=args.topk,
                                    top_p=args.topp,
                                    num_beams=args.num_beams,
                                    use_fp16_decoding=args.use_fp16_decoding,
                                    use_faster=args.faster)

    paddle.device.cuda.synchronize()
    print(
        "Average time of FasterGeneration for the PLATO-XL model: {} ms.".format(
            (time.time() - start) / 100 * 1000))

    result = postprocess_response(outputs[0].numpy(), tokenizer)
    result = " ".join(result)

    print("Model input:", context)
    print("Result:", result)
Example #4
def do_predict(args):
    place = "gpu"
    place = paddle.set_device(place)

    if args.use_fp16_decoding and os.getenv("PPFG_QKV_MEM_OPT", "0") == "1":
        paddle.set_default_dtype("float16")

    model_name = 'plato-xl'
    model = UnifiedTransformerLMHeadModel.from_pretrained(
        model_name, load_state_as_np=True)
    tokenizer = UnifiedTransformerTokenizer.from_pretrained(model_name)

    plato = FasterUnifiedTransformer(model=model,
                                     use_fp16_decoding=args.use_fp16_decoding)
    # Set evaluation mode
    plato.eval()

    # Convert dygraph model to static graph model
    plato = paddle.jit.to_static(
        plato,
        input_spec=[
            # input_ids
            paddle.static.InputSpec(shape=[None, None], dtype="int32"),
            # token_type_ids
            paddle.static.InputSpec(shape=[None, None], dtype="int32"),
            # attention_mask
            paddle.static.InputSpec(shape=[None, 1, None, None],
                                    dtype="float32"),
            # seq_len
            paddle.static.InputSpec(shape=[None], dtype="int32"),
            # role_ids
            paddle.static.InputSpec(shape=[None, None], dtype="int32"),
            # position_ids
            paddle.static.InputSpec(shape=[None, None], dtype="int32"),
            args.max_out_len,
            args.min_out_len,
            args.topk,
            args.topp,
            args.decoding_strategy,
            tokenizer.cls_token_id,  # cls/bos
            tokenizer.sep_token_id,  # sep/eos
            tokenizer.pad_token_id,  # pad
            args.num_beams,  # num_beams. Used for beam_search. 
            args.diversity_rate,  # diversity rate. Used for beam search. 
            args.temperature,
            args.num_return_sequences,
        ])

    # Save converted static graph model
    paddle.jit.save(plato, os.path.join(args.inference_model_dir, "plato"))
    logger.info("PLATO has been saved to {}.".format(args.inference_model_dir))
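After export, the static graph model can be served with the Paddle Inference API. A minimal loading sketch, assuming the default file suffixes written by paddle.jit.save; running the exported FasterGeneration ops may additionally require the corresponding custom op library to be built and available:

import os
import paddle.inference as paddle_infer

# Directory passed as args.inference_model_dir in the export script above.
model_dir = "./inference_model"
config = paddle_infer.Config(os.path.join(model_dir, "plato.pdmodel"),
                             os.path.join(model_dir, "plato.pdiparams"))
config.enable_use_gpu(100, 0)  # initial GPU memory pool (MB), device id
predictor = paddle_infer.create_predictor(config)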
Example #5
def main(args):
    paddle.set_device(args.device)
    if args.seed is not None:
        set_seed(args.seed)

    # Initialize the model and tokenizer from the given name or checkpoint
    # path (e.g. 'plato-mini').
    model = UnifiedTransformerLMHeadModel.from_pretrained(
        args.model_name_or_path)
    tokenizer = UnifiedTransformerTokenizer.from_pretrained(
        args.model_name_or_path)

    model.eval()
    interaction(args, model, tokenizer)
Example #6
def main(args):
    paddle.set_device('gpu' if args.n_gpus else 'cpu')
    paddle.seed(args.seed)

    model = UnifiedTransformerLMHeadModel.from_pretrained(
        args.model_name_or_path)
    tokenizer = UnifiedTransformerTokenizer.from_pretrained(
        args.model_name_or_path)

    test_dataset = DialogueDataset(args.test_data_path,
                                   args.infer_batch_size,
                                   tokenizer.pad_token_id,
                                   tokenizer.cls_token_id,
                                   mode='test')
    test_dataloader = DataLoader(test_dataset,
                                 return_list=True,
                                 batch_size=None)

    infer(model, test_dataloader, tokenizer)
Example #7
def train(args):
    paddle.set_device(args.device)
    world_size = dist.get_world_size()
    if world_size > 1:
        dist.init_parallel_env()

    set_seed(args.seed)

    model = UnifiedTransformerLMHeadModel.from_pretrained(
        args.model_name_or_path)
    tokenizer = UnifiedTransformerTokenizer.from_pretrained(
        args.model_name_or_path)

    if world_size > 1:
        model = paddle.DataParallel(model)

    train_ds, dev_ds = load_dataset('duconv', splits=('train', 'dev'))
    train_ds, train_data_loader = create_data_loader(train_ds, tokenizer, args,
                                                     'train')
    dev_ds, dev_data_loader = create_data_loader(dev_ds, tokenizer, args,
                                                 'dev')

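    # NoamDecay's first argument (d_model) is set to 1 / (warmup_steps * lr**2)
    # so that the schedule peaks at exactly args.lr after warmup_steps steps.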
    lr_scheduler = NoamDecay(1 / (args.warmup_steps * (args.lr**2)),
                             args.warmup_steps)
    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = AdamW(learning_rate=lr_scheduler,
                      parameters=model.parameters(),
                      weight_decay=args.weight_decay,
                      apply_decay_param_fun=lambda x: x in decay_params,
                      grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm))

    step = 0
    total_time = 0.0
    best_ppl = 1e9
    for epoch in range(args.epochs):
        print('\nEpoch %d/%d' % (epoch + 1, args.epochs))
        batch_start_time = time.time()
        for inputs in train_data_loader:
            step += 1
            labels = inputs[-1]

            logits = model(*inputs[:-1])
            loss = F.cross_entropy(logits, labels)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            total_time += (time.time() - batch_start_time)
            if step % args.logging_steps == 0:
                ppl = paddle.exp(loss)
                print(
                    'step %d - loss: %.4f - ppl: %.4f - lr: %.7f - %.3fs/step'
                    % (step, loss, ppl, optimizer.get_lr(),
                       total_time / args.logging_steps))
                total_time = 0.0
            if step % args.save_steps == 0:
                ppl = evaluation(model, dev_data_loader)
                if dist.get_rank() == 0:
                    save_ckpt(model, tokenizer, args.save_dir, step)
                    if ppl < best_ppl:
                        best_ppl = ppl
                        save_ckpt(model, tokenizer, args.save_dir, 'best')
                        print('Saved step {} as best model.\n'.format(step))
            batch_start_time = time.time()
    print('\nTraining completed.')
Example #8
def main(args):
    paddle.set_device('gpu' if args.n_gpus else 'cpu')
    paddle.seed(args.seed)
    world_size = dist.get_world_size()
    rank = dist.get_rank()
    if world_size > 1:
        dist.init_parallel_env()

    model = UnifiedTransformerLMHeadModel.from_pretrained(
        args.model_name_or_path)
    tokenizer = UnifiedTransformerTokenizer.from_pretrained(
        args.model_name_or_path)
    if world_size > 1:
        model = paddle.DataParallel(model)

    train_dataset = DialogueDataset(args.train_data_path,
                                    args.batch_size,
                                    tokenizer.pad_token_id,
                                    tokenizer.cls_token_id,
                                    args.sort_pool_size,
                                    args.seed,
                                    mode='train')
    train_dataloader = DataLoader(train_dataset,
                                  return_list=True,
                                  batch_size=None)
    valid_dataset = DialogueDataset(args.valid_data_path,
                                    args.batch_size,
                                    tokenizer.pad_token_id,
                                    tokenizer.cls_token_id,
                                    args.sort_pool_size,
                                    mode='valid')
    valid_dataloader = DataLoader(valid_dataset,
                                  return_list=True,
                                  batch_size=None)

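    # NoamDecay's first argument (d_model) is set to 1 / (warmup_steps * lr**2)
    # so that the schedule peaks at exactly args.lr after warmup_steps steps.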
    lr_scheduler = NoamDecay(1 / (args.warmup_steps * (args.lr**2)),
                             args.warmup_steps)
    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = AdamW(learning_rate=lr_scheduler,
                      parameters=model.parameters(),
                      weight_decay=args.weight_decay,
                      apply_decay_param_fun=lambda x: x in decay_params,
                      grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm))

    step = 0
    total_time = 0.0
    for epoch in range(args.epochs):
        if rank == 0:
            print('\nEpoch %d/%d' % (epoch + 1, args.epochs))
        batch_start_time = time.time()
        for inputs in train_dataloader:
            step += 1
            token_ids, type_ids, pos_ids, generation_mask, tgt_label, tgt_pos = inputs

            logits = model(token_ids, type_ids, pos_ids, generation_mask,
                           tgt_pos)
            loss = F.cross_entropy(logits, tgt_label)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            total_time += (time.time() - batch_start_time)
            if rank == 0:
                if step % args.logging_steps == 0:
                    ppl = paddle.exp(loss)
                    print(
                        'step %d - loss: %.4f - ppl: %.4f - lr: %.7f - %.3fs/step'
                        % (step, loss, ppl, optimizer.get_lr(),
                           total_time / args.logging_steps))
                    total_time = 0.0
                if step % args.save_steps == 0:
                    evaluation(model, valid_dataloader)
                    save_ckpt(model, tokenizer, args.save_dir, step)
            batch_start_time = time.time()
Example #9

from paddlenlp.transformers import UnifiedTransformerLMHeadModel, UnifiedTransformerTokenizer

model_name = 'plato-mini'

tokenizer = UnifiedTransformerTokenizer.from_pretrained(model_name)
model = UnifiedTransformerLMHeadModel.from_pretrained(model_name)
model.eval()


def postprocess_response(token_ids, tokenizer):
    """Post-process the decoded sequence. Truncate from the first <eos>."""
    eos_pos = len(token_ids)
    for i, tok_id in enumerate(token_ids):
        if tok_id == tokenizer.sep_token_id:
            eos_pos = i
            break
    token_ids = token_ids[:eos_pos]
    tokens = tokenizer.convert_ids_to_tokens(token_ids)
    tokens = tokenizer.merge_subword(tokens)
    return tokens
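A brief usage sketch for the snippet above; the Chinese prompt and the sampling settings are illustrative and not part of the original example:

import paddle

history = ['你好', '最近在忙什么呢']
data = tokenizer.dialogue_encode(history,
                                 add_start_token_as_response=True,
                                 is_split_into_words=False)
seq_len = len(data['input_ids'])
input_ids = paddle.to_tensor(data['input_ids'], dtype='int64').unsqueeze(0)
token_type_ids = paddle.to_tensor(data['token_type_ids'],
                                  dtype='int64').unsqueeze(0)
position_ids = paddle.to_tensor(data['position_ids'],
                                dtype='int64').unsqueeze(0)
# dialogue_encode returns a [seq_len, seq_len] mask; the model expects the
# [batch, 1, seq_len, seq_len] layout.
attention_mask = paddle.to_tensor(
    data['attention_mask'], dtype='float32').reshape([1, 1, seq_len, seq_len])

ids, scores = model.generate(input_ids=input_ids,
                             token_type_ids=token_type_ids,
                             position_ids=position_ids,
                             attention_mask=attention_mask,
                             max_length=64,
                             decode_strategy='sampling',
                             top_k=5)
print(''.join(postprocess_response(ids[0].numpy(), tokenizer)))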
Example #10
def main(args):
    # Memory saving when using FasterGeneration: if the environment variable
    # `PPFG_QKV_MEM_OPT` is set and the q/k/v weights are fused, the original
    # unfused weights are deleted. Note that rolling back to the original
    # model is no longer guaranteed if the faster model fails after the
    # original weights have been deleted.
    os.environ["PPFG_QKV_MEM_OPT"] = "1"
    if args.use_fp16:
        paddle.set_default_dtype("float16")
    enable_ft_para()
    # TODO(guosheng): Maybe device can be set in `enable_ft_para`
    paddle.set_device("gpu:" + str(get_ft_para_conf().rank))

    if args.profile:
        UnifiedTransformerLMHeadModel.generate = profile(args.batch_size)(
            UnifiedTransformerLMHeadModel.generate)
    tokenizer = UnifiedTransformerTokenizer.from_pretrained("plato-xl")
    model = UnifiedTransformerLMHeadModel.from_pretrained(
        "plato-xl", load_state_as_np=True)
    model.eval()

    history = [
        "hi , Mary ! What do you usually like to do in your spare time ?",
        "well , I spend a lot of time watching movies .",
        "what a confidence ! I always watch a lot of movies , too ."
        "oh really , Frank ? What kind of movies do you like ?"
    ]
    inputs = [history] * args.batch_size
    inputs = list(
        map(
            lambda history: tokenizer.dialogue_encode(
                history=history,
                add_start_token_as_response=True,
                return_length=True,
                return_role_ids=args.use_role,
                position_style=args.position_style), inputs))
    collator = DataCollatorWithPadding(tokenizer)
    data = collator(inputs)

    outputs, _ = model.generate(
        input_ids=data['input_ids'],
        token_type_ids=data['token_type_ids'],
        position_ids=data['position_ids'],
        attention_mask=data['attention_mask'].cast(
            "float32"),  # TODO(guosheng): remove this cast
        role_ids=data.get('role_ids', None),
        seq_len=data['seq_len'],
        max_length=args.max_out_len,
        min_length=args.min_out_len,
        decode_strategy='sampling',
        top_k=args.topk,
        top_p=args.topp,
        temperature=args.temperature,
        num_return_sequences=args.num_return_sequences,
        use_faster=True,
        use_fp16_decoding=args.use_fp16)

    # Only make the first process to output.
    if get_ft_para_conf().rank == 0:
        for i in range(len(outputs)):
            result = postprocess_response(outputs[i].numpy(), tokenizer)
            print("Result:", result)