Example 1
def create_model_transformer_b2a(arg, devices_list, eval=False):
    from models import Transformer
    # Choose which checkpoint to resume from: evaluation settings when eval=True, training settings otherwise.
    resume_dataset = arg.eval_dataset_transformer if eval else arg.dataset
    resume_b = arg.eval_split_source_trasformer if eval else arg.split_source
    resume_a = arg.eval_split_trasformer if eval else arg.split
    resume_epoch = arg.eval_epoch_transformer if eval else arg.resume_epoch

    # Boundary-to-boundary transformer; boundary_num is defined at module level in the original source.
    transformer = Transformer(in_channels=boundary_num,
                              out_channels=boundary_num)

    if resume_epoch > 0:
        load_path = (arg.resume_folder + 'transformer_' + resume_dataset + '_'
                     + resume_b + '2' + resume_a + '_' + str(resume_epoch) + '.pth')
        print('Loading Transformer from ' + load_path)
        transformer = load_weights(transformer, load_path, devices_list[0])
    else:
        init_weights(transformer, init_type='transformer')
        # init_weights(transformer)

    if arg.cuda:
        transformer = transformer.cuda(device=devices_list[0])

    return transformer
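
A minimal usage sketch of the factory above (every value below is an illustrative assumption, not taken from the original repository; boundary_num, load_weights and init_weights must be importable from the surrounding module):

# Hypothetical call site; the field names mirror the attributes read inside create_model_transformer_b2a.
import argparse

arg = argparse.Namespace(
    dataset='datasetA', split_source='B', split='A',
    resume_epoch=0, resume_folder='./weights/', cuda=False,
    eval_dataset_transformer='datasetA',
    eval_split_source_trasformer='B', eval_split_trasformer='A',
    eval_epoch_transformer=100,
)
transformer_b2a = create_model_transformer_b2a(arg, devices_list=[0], eval=False)
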
Example 2
                      trg_vocab=trg_vocab,
                      d_model=args.d_model,
                      num=args.num,
                      n_heads=args.n_heads)
    for p in net.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)  # Xavier-initialise weight matrices; 1-D params (biases, norms) keep defaults
    criterion = nn.CrossEntropyLoss(reduction="mean", ignore_index=1)  # targets equal to 1 (padding) are ignored
    optimizer = optim.Adam(net.parameters(),
                           lr=args.lr,
                           betas=(0.9, 0.98),
                           eps=1e-9)
    #scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[10,20,30,40,50,100,200], gamma=0.7)
    scheduler = CosineWithRestarts(optimizer, T_max=500)
    if cuda:
        net.cuda()
    start_epoch, acc = load_state(net,
                                  optimizer,
                                  scheduler,
                                  args.model_no,
                                  load_best=False)
    losses_per_epoch, accuracy_per_epoch = load_results(model_no=args.model_no)

    logger.info("Starting training process...")
    for e in range(start_epoch, args.num_epochs):
        net.train()
        losses_per_batch = []
        total_loss = 0.0
        for i, data in enumerate(train_iter):
            trg_input = data.FR[:, :-1]                    # decoder input: target sequence without its last token
            labels = data.FR[:, 1:].contiguous().view(-1)  # labels: target shifted left by one, flattened
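
The last two lines implement standard teacher forcing: the decoder is fed the target sequence without its final token, and the labels are the same sequence shifted left by one and flattened for CrossEntropyLoss. A self-contained toy illustration (tensor values are made up; 1 plays the padding role matched by ignore_index=1 above):

import torch

trg = torch.tensor([[2, 14, 27, 3, 1],
                    [2, 41, 3, 1, 1]])       # two toy target sequences of token ids; 1 = padding

trg_input = trg[:, :-1]                      # decoder input: tokens 0..n-2, shape (2, 4)
labels = trg[:, 1:].contiguous().view(-1)    # next-token labels, flattened to shape (8,)
print(trg_input.shape, labels.shape)         # torch.Size([2, 4]) torch.Size([8])
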
Example 3
        norm_pos = mcfg['norm_pos']
        seq2seq = Transformer(in_dim, out_dim, max_len, d_model, d_ff,
                              n_layers, n_heads, dropout, norm_pos)
    elif model_type == 'dynamic_conv':
        conv_type = mcfg['conv_type']
        kernel_sizes = mcfg['kernel_sizes']
        d_model = mcfg['d_model']
        d_ff = mcfg['d_ff']
        n_heads = mcfg['n_heads']
        dropout = mcfg['dropout']
        norm_pos = mcfg['norm_pos']
        seq2seq = DynamicConvS2S(in_dim, out_dim, max_len, conv_type,
                                 kernel_sizes, d_model, d_ff, n_heads, dropout,
                                 norm_pos)

    seq2seq.cuda()
    # Report model size in (binary) millions of parameters.
    K = 1024
    n_params = utils.num_params(seq2seq) / K / K
    logger.nofmt(seq2seq)
    logger.info("# of params = {:.1f} M".format(n_params))

    # parameter size tracing
    if args.param_tracing:
        # sequential tracing
        #  for name, p in seq2seq.named_parameters():
        #      numel = p.numel()
        #      unit = 'M'
        #      numel /= 1024*1024
        #      fmt = "10.3f" if numel < 1.0 else "10.1f"

        #      print("{:50s}\t{:{fmt}}{}".format(name, numel, unit, fmt=fmt))
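
utils.num_params is not part of the excerpt; a common equivalent (an assumption, not the repository's actual helper) simply sums p.numel() over the model's parameters:

import torch.nn as nn

def num_params(model: nn.Module, trainable_only: bool = False) -> int:
    # Assumed stand-in for utils.num_params: total (optionally trainable-only) parameter count.
    params = [p for p in model.parameters() if p.requires_grad] if trainable_only else list(model.parameters())
    return sum(p.numel() for p in params)

# The snippet above divides this count by 1024*1024 before logging it with an "M" suffix.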