Example 1
class Translation(object):
    def __init__(self, args):
        super(Translation, self).__init__()
        self.datasets = {}
        self.args = args
        self.data_dir = args.data_dir

        self.src_lang, self.trg_lang = dataset_utils.infer_language_pair(
            args.data_dir)

        src_dict_path = os.path.join(args.data_dir,
                                     dict_path.format(self.src_lang))
        trg_dict_path = os.path.join(args.data_dir,
                                     dict_path.format(self.trg_lang))
        self.src_dict = Dictionary.build_from_dict_file(src_dict_path)
        self.trg_dict = Dictionary.build_from_dict_file(trg_dict_path)

        self.model = None
        self.criterion = None
        self.optimizer = None

    def load_dataset(self, split):
        # find the data paths for this split
        src_split_path = os.path.join(
            self.data_dir,
            subset_path.format(split, self.src_lang, self.trg_lang,
                               self.src_lang))
        trg_split_path = os.path.join(
            self.data_dir,
            subset_path.format(split, self.src_lang, self.trg_lang,
                               self.trg_lang))

        src_dataset = SingleDataset(src_split_path)
        trg_dataset = SingleDataset(trg_split_path)
        pair_dataset = PairDataset(src_dataset, trg_dataset)
        self.datasets[split] = pair_dataset

    def build_model(self, args):
        encoder_embed_tokens = nn.Embedding(
            self.src_dict.token_num,
            args.encoder_embed_dim,
            padding_idx=self.src_dict.padding_idx)
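        # sharing one embedding table assumes source and target use a joint vocabulary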
        if args.share_all_embeddings:
            decoder_embed_tokens = encoder_embed_tokens
        else:
            decoder_embed_tokens = nn.Embedding(
                self.trg_dict.token_num,
                args.decoder_embed_dim,
                padding_idx=self.trg_dict.padding_idx)
        self.model = Transformer(args, self.src_dict, self.trg_dict)

    def build_criterion(self, label_smooth):
        self.criterion = LabelSmoothedCrossEntropyCriterion(label_smooth)

    def build_optimizer(self):
        if self.model is None:
            print("should build model first!")
        else:
            self.optimizer = CustomAdam(self.model.parameters(),
                                        lr=self.args.lr,
                                        betas=self.args.betas)
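A minimal driver sketch for the task class above, assuming an args namespace that provides the fields the methods read (data_dir, encoder_embed_dim, decoder_embed_dim, share_all_embeddings, lr, betas); the call sequence and the 0.1 smoothing value are illustrative, not part of the original code.

# hypothetical usage of the Translation task defined above
task = Translation(args)
task.load_dataset("train")
task.load_dataset("valid")
task.build_model(args)
task.build_criterion(label_smooth=0.1)  # example smoothing value (assumption)
task.build_optimizer()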
Example 2
def train():
    inputs, src_vocab_size, tgt_vocab_size, idx2word = create_data()

    enc_inputs, dec_inputs, dec_outputs = make_data(*inputs)
    data_loader = Data.DataLoader(dataset=MyDataSet(enc_inputs, dec_inputs, dec_outputs),
                                  batch_size=2,
                                  shuffle=True)

    model = Transformer(src_vocab_size, tgt_vocab_size).cuda()
    criterion = nn.CrossEntropyLoss(ignore_index=0)  # the PAD token (index 0) carries no meaning, so ignore_index=0 keeps it out of the loss
    optimizer = optim.SGD(model.parameters(), lr=1e-3, momentum=0.09)

    for epoch in range(30):
        for enc_inputs, dec_inputs, dec_outputs in data_loader:
            """
            enc_inputs: [batch_size, src_len]
            dec_inputs: [batch_size, tgt_len]
            dec_outputs: [batch_size, tgt_len]
            """

            enc_inputs, dec_inputs, dec_outputs = enc_inputs.cuda(), dec_inputs.cuda(), dec_outputs.cuda()

            outputs, enc_self_attns, dec_self_attns, dec_enc_attns = model(enc_inputs, dec_inputs)
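            # outputs is presumably already flattened to
            # [batch_size * tgt_len, tgt_vocab_size], matching dec_outputs.view(-1) below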
            loss = criterion(outputs, dec_outputs.view(-1))

            print('Epoch:', '%04d' % (epoch + 1), 'loss =', '{:.6f}'.format(loss))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
Example 3
    def instantiate_model(self,
                          english_vocab_size,
                          norwegian_vocab_size,
                          embedding_dim=256,
                          num_heads=8,
                          num_encoders=6,
                          ff_dim=256):
        model = Transformer(english_vocab_size, norwegian_vocab_size,
                            embedding_dim, num_heads, num_encoders, ff_dim,
                            self.cuda).to(self.cuda)

        for p in model.parameters():
            if p.dim() > 1:
                torch.nn.init.xavier_uniform_(p)

        return model
Example 4
def main(args):
    torch.manual_seed(args.seed)

    train_loader, test_loader = data_generator(args.data_dir, args.batch_size)

    for m in range(len(models)):
        if models[m] == "Transformer":
            model = Transformer(args.NumFeatures, args.NumTimeSteps,
                                args.n_layers, args.heads, args.dropout,
                                args.n_classes, time=args.NumTimeSteps)
        elif models[m] == "TCN":
            channel_sizes = [args.nhid] * args.levels
            model = TCN(args.NumFeatures, args.n_classes, channel_sizes,
                        kernel_size=args.ksize, dropout=args.dropout)
        elif models[m] == "LSTMWithInputCellAttention":
            model = LSTMWithInputCellAttention(args.NumFeatures, args.nhid,
                                               args.n_classes, args.dropout,
                                               args.attention_hops, args.d_a)
        elif models[m] == "LSTM":
            model = LSTM(args.NumFeatures, args.nhid, args.n_classes,
                         args.dropout)

        model.to(device)
        model_name = "model_{}_NumFeatures_{}".format(models[m], args.NumFeatures)
        model_filename = args.model_dir + 'm_' + model_name + '.pt'

        lr = args.lr
        optimizer = getattr(optim, args.optim)(model.parameters(), lr=lr)

        best_test_loss = float('inf')
        for epoch in range(1, args.epochs + 1):
            model, optimizer = train(args, epoch, model, train_loader, optimizer)
            test_loss, test_acc = test(args, model, test_loader)
            if test_loss < best_test_loss:
                best_test_loss = test_loss
                save(model, model_filename)
            if test_acc >= 99:
                break
            if epoch % 10 == 0:
                # decay the learning rate by a factor of 10 every 10 epochs
                lr /= 10
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr
Example 5
def main():
    parser = argparse.ArgumentParser(description="Train the model")
    parser.add_argument('-data', required=True)
    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-batch_size', type=int, default=64)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-no_cuda', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    # Load data
    data = torch.load(opt.data)

    opt.max_token_seq_len = data['settings'].max_word_seq_len + 2

    training_data, validation_data = prepare_dataloaders(data, opt)

    opt.src_vocab_size = training_data.dataset.src_vocab_size
    opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size

    print(opt)
    # opt.cuda = True
    device = torch.device('cuda' if opt.cuda else 'cpu')

    transformer = Transformer(d_word_embedding=opt.d_word_vec,
                              d_h=opt.d_model,
                              d_s=opt.d_model,
                              src_vocab_size=opt.src_vocab_size,
                              tgt_vocab_size=opt.tgt_vocab_size,
                              max_sent_len=opt.max_token_seq_len).to(device)

    optimizer = optim.Adam(filter(lambda x: x.requires_grad,
                                  transformer.parameters()),
                           betas=(0.9, 0.98),
                           eps=1e-09)

    train(transformer, training_data, validation_data, optimizer, device, opt)
Example 6
def main(gpu_id=None):
    dataset = Dataset(transform=transform, n_datas=10000)
    pad_vec = np.zeros(len(dataset.human_vocab))
    pad_vec[dataset.human_vocab['<pad>']] = 1
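    # pad_vec is a one-hot vector for the '<pad>' token; partial() binds it as the first
    # argument of collate_fn, which presumably uses it to pad variable-length batches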
    dataloader = torch.utils.data.DataLoader(dataset=dataset,
                                             batch_size=6,
                                             shuffle=True,
                                             num_workers=6,
                                             collate_fn=partial(
                                                 collate_fn, pad_vec))

    model = Transformer(n_head=2)
    if gpu_id is not None:
        print('use gpu')
        os.environ["CUDA_VISIBLE_DEVICES"] = gpu_id
        n_gpus = torch.cuda.device_count()
        # print('use %d gpu [%s]' % (n_gpus, gpu_id))
        model = model.cuda()
        # model = torch.nn.DataParallel(model, device_ids=[i for i in range(n_gpus)])
    # loss_fn = torch.nn.CrossEntropyLoss()
    loss_fn = torch.nn.MSELoss()

    optimizer = torch.optim.Adam(model.parameters())

    model = sl.load_model('./checkpoint', -1, model)
    optimizer = sl.load_optimizer('./checkpoint', -1, optimizer)

    try:
        trained_epoch = sl.find_last_checkpoint('./checkpoint')
        print('train from epoch %d' % (trained_epoch + 1))
    except Exception as e:
        print('train from the very beginning, {}'.format(e))
        trained_epoch = -1
    for epoch in range(trained_epoch + 1, 20):
        train(model,
              loss_fn,
              optimizer,
              dataloader,
              epoch,
              use_gpu=True if gpu_id is not None else False)
Example 7
def main(args):

    # 0. initial setting

    # set environment
    cudnn.benchmark = True

    os.makedirs(os.path.join('./ckpt', args.name), exist_ok=True)
    os.makedirs(os.path.join('./results', args.name, "log"), exist_ok=True)

    # set logger
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(message)s')
    handler = logging.FileHandler("results/{}/log/{}.log".format(
        args.name, time.strftime('%c', time.localtime(time.time()))))
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    logger.addHandler(logging.StreamHandler())
    args.logger = logger

    # set cuda
    if torch.cuda.is_available():
        args.logger.info("running on cuda")
        args.device = torch.device("cuda")
        args.use_cuda = True
    else:
        args.logger.info("running on cpu")
        args.device = torch.device("cpu")
        args.use_cuda = False

    args.logger.info("[{}] starts".format(args.name))

    # 1. load data

    args.logger.info("loading data...")
    src, tgt = load_data(args.path)

    src_vocab = Vocab(init_token='<sos>',
                      eos_token='<eos>',
                      pad_token='<pad>',
                      unk_token='<unk>')
    src_vocab.load(os.path.join(args.path, 'vocab.en'))
    tgt_vocab = Vocab(init_token='<sos>',
                      eos_token='<eos>',
                      pad_token='<pad>',
                      unk_token='<unk>')
    tgt_vocab.load(os.path.join(args.path, 'vocab.de'))

    # 2. setup

    args.logger.info("setting up...")

    sos_idx = 0
    eos_idx = 1
    pad_idx = 2
    max_length = 50

    src_vocab_size = len(src_vocab)
    tgt_vocab_size = len(tgt_vocab)

    # transformer config
    d_e = 512  # embedding size
    d_q = 64  # query size (= key, value size)
    d_h = 2048  # hidden layer size in feed forward network
    num_heads = 8
    num_layers = 6  # number of encoder/decoder layers in encoder/decoder

    args.sos_idx = sos_idx
    args.eos_idx = eos_idx
    args.pad_idx = pad_idx
    args.max_length = max_length
    args.src_vocab_size = src_vocab_size
    args.tgt_vocab_size = tgt_vocab_size
    args.d_e = d_e
    args.d_q = d_q
    args.d_h = d_h
    args.num_heads = num_heads
    args.num_layers = num_layers

    model = Transformer(args)
    model.to(args.device)
    loss_fn = nn.CrossEntropyLoss(ignore_index=pad_idx)
    optimizer = optim.Adam(model.parameters(), lr=1e-5)

    if args.load:
        model.load_state_dict(load(args, args.ckpt))

    # 3. train / test

    if not args.test:
        # train
        args.logger.info("starting training")
        acc_val_meter = AverageMeter(name="Acc-Val (%)",
                                     save_all=True,
                                     save_dir=os.path.join(
                                         'results', args.name))
        train_loss_meter = AverageMeter(name="Loss",
                                        save_all=True,
                                        save_dir=os.path.join(
                                            'results', args.name))
        train_loader = get_loader(src['train'],
                                  tgt['train'],
                                  src_vocab,
                                  tgt_vocab,
                                  batch_size=args.batch_size,
                                  shuffle=True)
        valid_loader = get_loader(src['valid'],
                                  tgt['valid'],
                                  src_vocab,
                                  tgt_vocab,
                                  batch_size=args.batch_size)

        for epoch in range(1, 1 + args.epochs):
            spent_time = time.time()
            model.train()
            train_loss_tmp_meter = AverageMeter()
            for src_batch, tgt_batch in tqdm(train_loader):
                # src_batch: (batch x source_length), tgt_batch: (batch x target_length)
                optimizer.zero_grad()
                src_batch, tgt_batch = torch.LongTensor(src_batch).to(
                    args.device), torch.LongTensor(tgt_batch).to(args.device)
                batch = src_batch.shape[0]
                # split target batch into input and output
                tgt_batch_i = tgt_batch[:, :-1]
                tgt_batch_o = tgt_batch[:, 1:]

                pred = model(src_batch.to(args.device),
                             tgt_batch_i.to(args.device))
                loss = loss_fn(pred.contiguous().view(-1, tgt_vocab_size),
                               tgt_batch_o.contiguous().view(-1))
                loss.backward()
                optimizer.step()

                train_loss_tmp_meter.update(loss / batch, weight=batch)

            train_loss_meter.update(train_loss_tmp_meter.avg)
            spent_time = time.time() - spent_time
            args.logger.info(
                "[{}] train loss: {:.3f} took {:.1f} seconds".format(
                    epoch, train_loss_tmp_meter.avg, spent_time))

            # validation
            model.eval()
            acc_val_tmp_meter = AverageMeter()
            spent_time = time.time()

            for src_batch, tgt_batch in tqdm(valid_loader):
                src_batch, tgt_batch = torch.LongTensor(
                    src_batch), torch.LongTensor(tgt_batch)
                tgt_batch_i = tgt_batch[:, :-1]
                tgt_batch_o = tgt_batch[:, 1:]

                with torch.no_grad():
                    pred = model(src_batch.to(args.device),
                                 tgt_batch_i.to(args.device))

                corrects, total = val_check(
                    pred.max(dim=-1)[1].cpu(), tgt_batch_o)
                acc_val_tmp_meter.update(100 * corrects / total, total)

            spent_time = time.time() - spent_time
            args.logger.info(
                "[{}] validation accuracy: {:.1f} %, took {} seconds".format(
                    epoch, acc_val_tmp_meter.avg, spent_time))
            acc_val_meter.update(acc_val_tmp_meter.avg)

            if epoch % args.save_period == 0:
                save(args, "epoch_{}".format(epoch), model.state_dict())
                acc_val_meter.save()
                train_loss_meter.save()
    else:
        # test
        args.logger.info("starting test")
        test_loader = get_loader(src['test'],
                                 tgt['test'],
                                 src_vocab,
                                 tgt_vocab,
                                 batch_size=args.batch_size)
        pred_list = []
        model.eval()

        for src_batch, tgt_batch in test_loader:
            #src_batch: (batch x source_length)
            src_batch = torch.Tensor(src_batch).long().to(args.device)
            batch = src_batch.shape[0]
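            # pred_batch starts as a column of <sos> tokens (sos_idx == 0)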
            pred_batch = torch.zeros(batch, 1).long().to(args.device)
            pred_mask = torch.zeros(batch, 1).bool().to(
                args.device)  # mask marking whether each sentence has already ended

            with torch.no_grad():
                for _ in range(args.max_length):
                    pred = model(
                        src_batch,
                        pred_batch)  # (batch x length x tgt_vocab_size)
                    pred[:, :, pad_idx] = -1  # ignore <pad>
                    pred = pred.max(dim=-1)[1][:, -1].unsqueeze(
                        -1)  # next word prediction: (batch x 1)
                    pred = pred.masked_fill(
                        pred_mask,
                        2).long()  # fill out <pad> for ended sentences
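                    # a sentence is marked finished once it emits <eos> (1) or <pad> (2)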
                    pred_mask = torch.gt(pred.eq(1) + pred.eq(2), 0)
                    pred_batch = torch.cat([pred_batch, pred], dim=1)
                    if torch.prod(pred_mask) == 1:
                        break

            pred_batch = torch.cat([
                pred_batch,
                torch.ones(batch, 1).long().to(args.device) + pred_mask.long()
            ],
                                   dim=1)  # close all sentences
            pred_list += seq2sen(pred_batch.cpu().numpy().tolist(), tgt_vocab)

        with open('results/pred.txt', 'w', encoding='utf-8') as f:
            for line in pred_list:
                f.write('{}\n'.format(line))

        os.system(
            'bash scripts/bleu.sh results/pred.txt multi30k/test.de.atok')
Example 8
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-data',
        type=str,
        default='./data/data.pt',
        help=
        'Path to the source data. The default is ./data/data.pt, which is the output of preprocessing.'
    )
    parser.add_argument('-epoch', default=10000)
    parser.add_argument('-log_step', default=5)
    parser.add_argument('-save_model_epoch', default=1)
    parser.add_argument('-save_model_path', default='./saved_model/')
    args = parser.parse_args()

    dataset = torch.load(args.data)

    batch_size = 4
    src_vocab = dataset['dict']['src']
    tgt_vocab = dataset['dict']['tgt']
    print("\n\nBatch Size = %d" % batch_size)
    print("Source Vocab Size = %d" % len(src_vocab))
    print("Target Vocab Size = %d" % len(tgt_vocab))

    print("\nLoading Training Data ... ")
    training_batches = get_loader(src=dataset['train']['src'],
                                  tgt=dataset['train']['tgt'],
                                  src_vocabs=dataset['dict']['src'],
                                  tgt_vocabs=dataset['dict']['tgt'],
                                  batch_size=batch_size,
                                  use_cuda=True,
                                  shuffle=True)

    # print("\nLoading Validation Data ... ")
    # validation_data = get_loader(
    #     src=dataset['valid']['src'],
    #     tgt=dataset['valid']['tgt'],
    #     src_vocabs=dataset['dict']['src'],
    #     tgt_vocabs=dataset['dict']['tgt'],
    #     batch_size=batch_size,
    #     use_cuda=False,
    #     shuffle=False
    # )

    # For python 2
    transformer_config = [
        6, 512, 512, 8, batch_size,
        len(src_vocab),
        len(tgt_vocab), 100, 0.1, True
    ]

    # For python 3
    # transformer_config = {
    #     'N': 6,
    #     'd_model': int(512),
    #     'd_ff': 512,
    #     'H': 8,
    #     'batch_size': batch_size,
    #     'src_vocab_size': int(len(src_vocab)),
    #     'tgt_vocab_size': int(len(tgt_vocab)),
    #     'max_seq': 100,
    #     'dropout': 0.1,
    #     'use_cuda': True
    # }

    transformer = Transformer(transformer_config)
    if torch.cuda.is_available():
        print("CUDA enabled.")
        transformer.cuda()

    optimizer = optim.Adam(
        transformer.parameters(),
        lr=0.001,
        # betas=(0.9, 0.98),
        # eps=1e-09
    )

    criterion = nn.CrossEntropyLoss()

    # Prepare a txt file to print training log
    if not os.path.exists(args.save_model_path):
        print(
            "\nCreated a directory (%s) for saving model since it does not exist.\n"
            % args.save_model_path)
        os.makedirs(args.save_model_path)

    f = open('%s/train_log.txt' % args.save_model_path, 'w')

    # Train the model
    for e in range(args.epoch):
        for i, batch in enumerate(
                tqdm(training_batches,
                     mininterval=2,
                     desc='  Training  ',
                     leave=False)):
            # print ("BATCH")
            # print(batch[0][0])
            # exit()
            sources = to_var(batch[0])
            targets = to_var(batch[1])
            src_seq_len = sources.size()[1]
            tgt_seq_len = targets.size()[1]

            if torch.cuda.is_available():
                sources = sources.cuda()
                targets = targets.cuda()

            optimizer.zero_grad()
            outputs = transformer(sources, targets)

            # print("\n\n\n########### OUTPUT ###########")
            # print(len(outputs))
            # print(outputs.max(1)[1].data.tolist() )
            # exit()
            #
            # print("\n\n\n########### TARGET ###########")
            # print(len(targets))
            # print(targets)

            # print(" \n\n TARGETS %d " %i)
            # print(targets)
            # print(targets.contiguous().view(-1).long())
            # exit()

            targets = targets.contiguous().view(-1).long()
            loss = criterion(outputs, targets)

            # backprop
            loss.backward()

            # optimize params
            optimizer.step()

            # Print log info to both console and file
            if i % args.log_step == 0:
                print(
                    "\n\n\n\n#################################################################################"
                )
                log = (
                    'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f\n'
                    % (e, args.epoch, i, len(training_batches), loss.item(),
                       np.exp(loss.item())))
                print(log)
                f.write("{}".format(log))

                # Print the first sentence of the batch
                src_indices = sources.data.tolist(
                )[0][:src_seq_len]  # Variable -> Tensor -> List
                src_sentence = convert2text(src_indices,
                                            src_vocab)  # Get sentence

                pred_indices = outputs.max(
                    1)[1].data.tolist()  # Variable -> Tensor -> List
                pred_indices = [
                    i[0] for i in pred_indices[:tgt_seq_len]
                ]  # Get data of index until the max_seq_length of target (i.e. first sentence of the batch).
                pred_sentence = convert2text(pred_indices,
                                             tgt_vocab)  # Get sentence

                tgt_indices = targets.data.tolist(
                )[:tgt_seq_len]  # Variable -> Tensor -> List
                tgt_sentence = convert2text(tgt_indices,
                                            tgt_vocab)  # Get sentence

                original = ("ORIGINAL:  {}\n".format(src_sentence))
                predicted = ("PREDICTED: {}\n".format(pred_sentence))
                truth = ("TRUTH:     {}\n\n".format(tgt_sentence))
                print(original)
                print(predicted)
                print(truth)
                f.write("{}".format(original))
                f.write("{}".format(predicted))
                f.write("{}".format(truth))

        # Save the models
        if (e) % args.save_model_epoch == 0:
            torch.save(
                transformer.state_dict(),
                os.path.join(args.save_model_path,
                             'transformer-%d-%d.pkl' % (e + 1, i + 1)))
Example 9
def transformer(dataloader, EPOCH, k, frequency, path_to_save_model, path_to_save_loss, path_to_save_predictions, device):

    device = torch.device(device)

    model = Transformer().double().to(device)
    optimizer = torch.optim.Adam(model.parameters())
    # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=200)
    criterion = torch.nn.MSELoss()
    best_model = ""
    min_train_loss = float('inf')

    for epoch in range(EPOCH + 1):
        train_loss = 0
        val_loss = 0

        ## TRAIN -- TEACHER FORCING
        model.train()
        for index_in, index_tar, _input, target, sensor_number in dataloader:
        
            # Shape of _input : [batch, input_length, feature]
            # Desired input for model: [input_length, batch, feature]

            optimizer.zero_grad()
            src = _input.permute(1,0,2).double().to(device)[:-1,:,:] # torch.Size([24, 1, 7])
            target = _input.permute(1,0,2).double().to(device)[1:,:,:] # src shifted by 1.
            sampled_src = src[:1, :, :] #t0 torch.Size([1, 1, 7])

            for i in range(len(target)-1):

                prediction = model(sampled_src, device) # torch.Size([1xw, 1, 1])
                # for p1, p2 in zip(params, model.parameters()):
                #     if p1.data.ne(p2.data).sum() > 0:
                #         ic(False)
                # ic(True)
                # ic(i, sampled_src[:,:,0], prediction)
                # time.sleep(1)
                """
                # to update model at every step
                # loss = criterion(prediction, target[:i+1,:,:1])
                # loss.backward()
                # optimizer.step()
                """

                if i < 24: # One day, enough data to make inferences about cycles
                    prob_true_val = True
                else:
                    ## coin flip
                    v = k/(k+math.exp(epoch/k)) # probability of heads/tails depends on the epoch, evolves with time.
                    prob_true_val = flip_from_probability(v) # starts with over 95 % probability of true val for each flip in epoch 0.
                    ## if using true value as new value

                if prob_true_val: # Using true value as next value
                    sampled_src = torch.cat((sampled_src.detach(), src[i+1, :, :].unsqueeze(0).detach()))
                else: ## using prediction as new value
                    positional_encodings_new_val = src[i+1,:,1:].unsqueeze(0)
                    predicted_humidity = torch.cat((prediction[-1,:,:].unsqueeze(0), positional_encodings_new_val), dim=2)
                    sampled_src = torch.cat((sampled_src.detach(), predicted_humidity.detach()))
            
            """To update model after each sequence"""
            loss = criterion(target[:-1,:,0].unsqueeze(-1), prediction)
            loss.backward()
            optimizer.step()
            train_loss += loss.detach().item()

        if train_loss < min_train_loss:
            torch.save(model.state_dict(), path_to_save_model + f"best_train_{epoch}.pth")
            torch.save(optimizer.state_dict(), path_to_save_model + f"optimizer_{epoch}.pth")
            min_train_loss = train_loss
            best_model = f"best_train_{epoch}.pth"


        if epoch % 10 == 0: # Plot 1-Step Predictions

            logger.info(f"Epoch: {epoch}, Training loss: {train_loss}")
            scaler = load('scalar_item.joblib')
            sampled_src_humidity = scaler.inverse_transform(sampled_src[:,:,0].cpu()) #torch.Size([35, 1, 7])
            src_humidity = scaler.inverse_transform(src[:,:,0].cpu()) #torch.Size([35, 1, 7])
            target_humidity = scaler.inverse_transform(target[:,:,0].cpu()) #torch.Size([35, 1, 7])
            prediction_humidity = scaler.inverse_transform(prediction[:,:,0].detach().cpu().numpy()) #torch.Size([35, 1, 7])
            plot_training_3(epoch, path_to_save_predictions, src_humidity, sampled_src_humidity, prediction_humidity, sensor_number, index_in, index_tar)

        train_loss /= len(dataloader)
        log_loss(train_loss, path_to_save_loss, train=True)
        
    plot_loss(path_to_save_loss, train=True)
    return best_model
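The scheduled-sampling coin flip above relies on a flip_from_probability helper that is not included in this excerpt; a minimal sketch of what it presumably does (a single Bernoulli draw that returns True with probability v):

import random

def flip_from_probability(p):
    # return True with probability p (assumed behaviour of the helper used above)
    return random.random() < p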
Example 10
def main(args):
    src, tgt = load_data(args.path)

    src_vocab = Vocab(init_token='<sos>',
                      eos_token='<eos>',
                      pad_token='<pad>',
                      unk_token='<unk>')
    src_vocab.load(os.path.join(args.path, 'vocab.en'))
    tgt_vocab = Vocab(init_token='<sos>',
                      eos_token='<eos>',
                      pad_token='<pad>',
                      unk_token='<unk>')
    tgt_vocab.load(os.path.join(args.path, 'vocab.de'))

    vsize_src = len(src_vocab)
    vsize_tar = len(tgt_vocab)
    net = Transformer(vsize_src, vsize_tar)

    if not args.test:

        train_loader = get_loader(src['train'],
                                  tgt['train'],
                                  src_vocab,
                                  tgt_vocab,
                                  batch_size=args.batch_size,
                                  shuffle=True)
        valid_loader = get_loader(src['valid'],
                                  tgt['valid'],
                                  src_vocab,
                                  tgt_vocab,
                                  batch_size=args.batch_size)

        net.to(device)
        optimizer = optim.Adam(net.parameters(), lr=args.lr)

        best_valid_loss = float('inf')
        for epoch in range(args.epochs):
            print("Epoch {0}".format(epoch))
            net.train()
            train_loss = run_epoch(net, train_loader, optimizer)
            print("train loss: {0}".format(train_loss))
            net.eval()
            valid_loss = run_epoch(net, valid_loader, None)
            print("valid loss: {0}".format(valid_loss))
            torch.save(net, 'data/ckpt/last_model')
            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                torch.save(net, 'data/ckpt/best_model')
    else:
        # test
        net = torch.load('data/ckpt/best_model')
        net.to(device)
        net.eval()

        test_loader = get_loader(src['test'],
                                 tgt['test'],
                                 src_vocab,
                                 tgt_vocab,
                                 batch_size=args.batch_size)

        pred = []
        iter_cnt = 0
        for src_batch, tgt_batch in test_loader:
            source, src_mask = make_tensor(src_batch)
            source = source.to(device)
            src_mask = src_mask.to(device)
            res = net.decode(source, src_mask)
            pred_batch = res.tolist()
            # every sentences in pred_batch should start with <sos> token (index: 0) and end with <eos> token (index: 1).
            # every <pad> token (index: 2) should be located after <eos> token (index: 1).
            # example of pred_batch:
            # [[0, 5, 6, 7, 1],
            #  [0, 4, 9, 1, 2],
            #  [0, 6, 1, 2, 2]]
            pred += seq2sen(pred_batch, tgt_vocab)
            iter_cnt += 1
            #print(pred_batch)

        with open('data/results/pred.txt', 'w') as f:
            for line in pred:
                f.write('{}\n'.format(line))

        os.system(
            'bash scripts/bleu.sh data/results/pred.txt data/multi30k/test.de.atok'
        )
Example 11
        model = Transformer(device=device,
                            d_feature=train_data.sig_len,
                            d_model=d_model,
                            d_inner=d_inner,
                            n_layers=num_layers,
                            n_head=num_heads,
                            d_k=64,
                            d_v=64,
                            dropout=dropout,
                            class_num=class_num)

        model = model.to(device)

        optimizer = ScheduledOptim(
            Adam(filter(lambda x: x.requires_grad, model.parameters()),
                 betas=(0.9, 0.98),
                 eps=1e-09), d_model, warm_steps)
        train_accs = []
        valid_accs = []
        eva_indis = []
        train_losses = []
        valid_losses = []
        for epoch_i in range(epoch):
            print('[ Epoch', epoch_i, ']')
            start = time.time()
            train_loss, train_acc, cnt = train_epoch(train_loader, device,
                                                     model, optimizer,
                                                     train_data.__len__())
            print(
                '  - (Training)  loss: {loss: 8.5f}, accuracy: {accu:3.3f} %, '
Example 12
train_x, train_y, test_x, test_y = build_data(series, min_len = 3, max_len = max_len)
max_index = int(max(train_x.max(), test_x.max()))

args = {
    'emb_dim':        32,            # Embedding vector dimension
    'n_att_heads':    16,            # Number of attention heads for each transformer block
    'n_transformers': 4,             # Depth of the network (nr. of self-attention layers)
    'seq_length':     max_len,       # Sequence length
    'num_tokens':     max_index + 1, # Vocabulary size (highest index found in dataset)
    'device':         device,        # Device: cuda/cpu
    'wide':           False          # Narrow or wide self-attention
}

stats = { 'loss': [], 'perplexity': [] } # we accumulate and save training statistics here
model = Transformer(**args).to(device)
opt   = torch.optim.Adam(lr=learning_rate, params=model.parameters())

for i in range(epochs):
    model.train()
    opt.zero_grad()
    
    # Sample a random batch of size `batch_size` from the train dataset
    idxs = torch.randint(size=(batch_size,), low=0, high=len(train_x))
    
    output, (emb_mean, emb_max) = model(train_x[idxs])
    loss = F.nll_loss(output, train_y[idxs], reduction='mean')
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), 1)
    opt.step()
    
    # Calculate perplexity on the test-set
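    # hedged sketch of the truncated step: perplexity = exp(mean NLL) on the test set,
    # assuming the forward pass returns (log-probs, (emb_mean, emb_max)) as above
    model.eval()
    with torch.no_grad():
        test_out, _ = model(test_x)
        test_nll = F.nll_loss(test_out, test_y, reduction='mean')
    stats['loss'].append(loss.item())
    stats['perplexity'].append(torch.exp(test_nll).item())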
Example 13
def main():
    """Entry point.
    """
    if torch.cuda.is_available():
        device = torch.device(torch.cuda.current_device())
        print(f"Using CUDA device {device}")
    else:
        device = None

    # Load data
    vocab = Vocab(config_data.vocab_file)
    data_hparams = {
        # "batch_size" is ignored for train since we use dynamic batching
        "batch_size": config_data.test_batch_size,
        "bos_id": vocab.bos_token_id,
        "eos_id": vocab.eos_token_id,
    }
    datasets = {
        split: data_utils.Seq2SeqData(os.path.join(
            config_data.input_dir,
            f"{config_data.filename_prefix}{split}.npy"),
                                      hparams=data_hparams,
                                      device=device)
        for split in ["train", "valid", "test"]
    }
    print(f"Training data size: {len(datasets['train'])}")
    beam_width = config_model.beam_width

    # Create logging
    tx.utils.maybe_create_dir(args.output_dir)
    logging_file = os.path.join(args.output_dir, "logging.txt")
    logger = utils.get_logger(logging_file)
    print(f"logging file is saved in: {logging_file}")

    # Create model and optimizer
    model = Transformer(config_model, config_data, vocab).to(device)

    best_results = {"score": 0, "epoch": -1}
    lr_config = config_model.lr_config
    if lr_config["learning_rate_schedule"] == "static":
        init_lr = lr_config["static_lr"]
        scheduler_lambda = lambda x: 1.0
    else:
        init_lr = lr_config["lr_constant"]
        scheduler_lambda = functools.partial(
            utils.get_lr_multiplier, warmup_steps=lr_config["warmup_steps"])
    optim = torch.optim.Adam(model.parameters(),
                             lr=init_lr,
                             betas=(0.9, 0.997),
                             eps=1e-9)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optim, scheduler_lambda)
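    # Note: utils.get_lr_multiplier is not shown in this excerpt; with warmup_steps it
    # presumably follows the inverse-square-root schedule from "Attention Is All You Need",
    # roughly multiplier = min(step ** -0.5, step * warmup_steps ** -1.5) (up to scaling).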

    @torch.no_grad()
    def _eval_epoch(epoch, mode, print_fn=None):
        if print_fn is None:
            print_fn = print
            tqdm_leave = True
        else:
            tqdm_leave = False
        model.eval()
        eval_data = datasets[mode]
        eval_iter = tx.data.DataIterator(eval_data)
        references, hypotheses = [], []
        for batch in tqdm.tqdm(eval_iter,
                               ncols=80,
                               leave=tqdm_leave,
                               desc=f"Eval on {mode} set"):
            predictions = model(
                encoder_input=batch.source,
                beam_width=beam_width,
            )
            if beam_width == 1:
                decoded_ids = predictions[0].sample_id
            else:
                decoded_ids = predictions["sample_id"][:, :, 0]

            hypotheses.extend(h.tolist() for h in decoded_ids)
            references.extend(r.tolist() for r in batch.target_output)
        hypotheses = utils.list_strip_eos(hypotheses, vocab.eos_token_id)
        references = utils.list_strip_eos(references, vocab.eos_token_id)

        if mode == "valid":
            # Writes results to files to evaluate BLEU
            # For 'eval' mode, the BLEU is based on token ids (rather than
            # text tokens) and serves only as a surrogate metric to monitor
            # the training process
            fname = os.path.join(args.output_dir, "tmp.eval")
            hwords, rwords = [], []
            for hyp, ref in zip(hypotheses, references):
                hwords.append([str(y) for y in hyp])
                rwords.append([str(y) for y in ref])
            hwords = tx.utils.str_join(hwords)
            rwords = tx.utils.str_join(rwords)
            hyp_file, ref_file = tx.utils.write_paired_text(
                hwords,
                rwords,
                fname,
                mode="s",
                src_fname_suffix="hyp",
                tgt_fname_suffix="ref",
            )
            eval_bleu = tx.evals.file_bleu(ref_file,
                                           hyp_file,
                                           case_sensitive=True)
            logger.info("epoch: %d, eval_bleu %.4f", epoch, eval_bleu)
            print_fn(f"epoch: {epoch:d}, eval_bleu {eval_bleu:.4f}")

            if eval_bleu > best_results["score"]:
                logger.info("epoch: %d, best bleu: %.4f", epoch, eval_bleu)
                best_results["score"] = eval_bleu
                best_results["epoch"] = epoch
                model_path = os.path.join(args.output_dir,
                                          args.output_filename)
                logger.info("Saving model to %s", model_path)
                print_fn(f"Saving model to {model_path}")

                states = {
                    "model": model.state_dict(),
                    "optimizer": optim.state_dict(),
                    "scheduler": scheduler.state_dict(),
                }
                torch.save(states, model_path)

        elif mode == "test":
            # For 'test' mode, together with the commands in README.md, BLEU
            # is evaluated based on text tokens, which is the standard metric.
            fname = os.path.join(args.output_dir, "test.output")
            hwords, rwords = [], []
            for hyp, ref in zip(hypotheses, references):
                hwords.append(vocab.map_ids_to_tokens_py(hyp))
                rwords.append(vocab.map_ids_to_tokens_py(ref))
            hwords = tx.utils.str_join(hwords)
            rwords = tx.utils.str_join(rwords)
            hyp_file, ref_file = tx.utils.write_paired_text(
                hwords,
                rwords,
                fname,
                mode="s",
                src_fname_suffix="hyp",
                tgt_fname_suffix="ref",
            )
            logger.info("Test output written to file: %s", hyp_file)
            print_fn(f"Test output written to file: {hyp_file}")

    def _train_epoch(epoch: int):
        model.train()
        train_iter = tx.data.DataIterator(
            datasets["train"],
            data_utils.CustomBatchingStrategy(config_data.max_batch_tokens))

        progress = tqdm.tqdm(
            train_iter,
            ncols=80,
            desc=f"Training epoch {epoch}",
        )
        for train_batch in progress:
            optim.zero_grad()
            loss = model(
                encoder_input=train_batch.source,
                decoder_input=train_batch.target_input,
                labels=train_batch.target_output,
            )
            loss.backward()

            optim.step()
            scheduler.step()

            step = scheduler.last_epoch
            if step % config_data.display_steps == 0:
                logger.info("step: %d, loss: %.4f", step, loss)
                lr = optim.param_groups[0]["lr"]
                progress.write(f"lr: {lr:.4e} step: {step}, loss: {loss:.4}")
            if step and step % config_data.eval_steps == 0:
                _eval_epoch(epoch, mode="valid", print_fn=progress.write)
        progress.close()

    model_path = os.path.join(args.output_dir, args.output_filename)

    if args.run_mode == "train_and_evaluate":
        logger.info("Begin running with train_and_evaluate mode")
        if os.path.exists(model_path):
            logger.info("Restore latest checkpoint in %s", model_path)
            ckpt = torch.load(model_path)
            model.load_state_dict(ckpt["model"])
            optim.load_state_dict(ckpt["optimizer"])
            scheduler.load_state_dict(ckpt["scheduler"])
            _eval_epoch(0, mode="valid")

        for epoch in range(config_data.max_train_epoch):
            _train_epoch(epoch)
            _eval_epoch(epoch, mode="valid")

    elif args.run_mode in ["evaluate", "test"]:
        logger.info("Begin running with %s mode", args.run_mode)
        logger.info("Restore latest checkpoint in %s", model_path)
        ckpt = torch.load(model_path)
        model.load_state_dict(ckpt["model"])
        _eval_epoch(0, mode=("test" if args.run_mode == "test" else "valid"))

    else:
        raise ValueError(f"Unknown mode: {args.run_mode}")
Example 14
# Getting the vocabulary size for the embedding matrix
vocab_size = len(vocab_dict)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Setting up the transformer
transformer = Transformer(d_model=config.d_model,
                          heads=config.heads,
                          num_layers=config.num_layers,
                          vocab_size=vocab_size)

## Sending the transformer to device
transformer = transformer.to(device)

## Hack no. 1: initialize the layer parameters with xavier_uniform
for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

## Want to train the loaded model
# checkpoint = torch.load('checkpoint.pth.tar')
# transformer = checkpoint['transformer']

## Hack no. 2: taken from the PyTorch transformer implementation
lr = 5.0  # learning rate
optimizer = torch.optim.SGD(transformer.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

for epoch in range(config.epochs):

    tot_loss = 0
Example 15
def main(TEXT, LABEL, train_loader, test_loader):

    # for sentiment analysis. load .pt file
    from KoBERT.Bert_model import BERTClassifier
    from kobert.pytorch_kobert import get_pytorch_kobert_model
    bertmodel, vocab = get_pytorch_kobert_model()
    sa_model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)
    sa_model.load_state_dict(torch.load('bert_SA-model.pt'))

    # print argparse
    for idx, (key, value) in enumerate(args.__dict__.items()):
        if idx == 0:
            print("\nargparse{\n", "\t", key, ":", value)
        elif idx == len(args.__dict__) - 1:
            print("\t", key, ":", value, "\n}")
        else:
            print("\t", key, ":", value)

    from model import Transformer, GradualWarmupScheduler

    # Transformer model init
    model = Transformer(args, TEXT, LABEL)
    if args.per_soft:
        sorted_path = 'sorted_model-soft.pth'
    else:
        sorted_path = 'sorted_model-rough.pth'

    # exclude <pad> when computing the loss
    criterion = nn.CrossEntropyLoss(ignore_index=LABEL.vocab.stoi['<pad>'])

    optimizer = torch.optim.Adam(params=model.parameters(), lr=args.lr)
    scheduler = GradualWarmupScheduler(optimizer,
                                       multiplier=8,
                                       total_epoch=args.num_epochs)

    # load pre-trained vectors
    model.src_embedding.weight.data.copy_(TEXT.vocab.vectors)
    model.trg_embedding.weight.data.copy_(LABEL.vocab.vectors)
    model.to(device)
    criterion.to(device)

    # guard against overfitting
    best_valid_loss = float('inf')

    # train
    if args.train:
        for epoch in range(args.num_epochs):
            torch.manual_seed(SEED)
            scheduler.step(epoch)
            start_time = time.time()

            # train, validation
            train_loss, train_acc = train(model, train_loader, optimizer,
                                          criterion)
            valid_loss, valid_acc = test(model, test_loader, criterion)

            # time cal
            end_time = time.time()
            epoch_mins, epoch_secs = epoch_time(start_time, end_time)

            #torch.save(model.state_dict(), sorted_path) # for some overfitting
            # save the model when the current validation loss is lower than the previous best
            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                torch.save(
                    {
                        'epoch': epoch,
                        'model_state_dict': model.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'loss': valid_loss
                    }, sorted_path)
                print(
                    f'\t## SAVE valid_loss: {valid_loss:.3f} | valid_acc: {valid_acc:.3f} ##'
                )

            # print loss and acc
            print(
                f'\n\t==Epoch: {epoch + 1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s=='
            )
            print(
                f'\t==Train Loss: {train_loss:.3f} | Train_acc: {train_acc:.3f}=='
            )
            print(
                f'\t==Valid Loss: {valid_loss:.3f} | Valid_acc: {valid_acc:.3f}==\n'
            )

    # inference
    print("\t----------성능평가----------")
    checkpoint = torch.load(sorted_path)
    model.load_state_dict(checkpoint['model_state_dict'])
    test_loss, test_acc = test(model, test_loader, criterion)
    print(f'==test_loss : {test_loss:.3f} | test_acc: {test_acc:.3f}==')
    print("\t-----------------------------")
    while (True):
        inference(device, args, TEXT, LABEL, model, sa_model)
        print("\n")
Example 16
def ed_train(train_iter, val_iter, TEXT, LABEL):
    global D_MODEL, N_LAYERS, N_HEADS, DROPOUT, N_EPOCHS, LR
    SRC_V_SIZE = len(TEXT.vocab)
    TGT_V_SIZE = len(LABEL.vocab)

    model = Transformer(SRC_V_SIZE,
                        TGT_V_SIZE,
                        D_MODEL,
                        N_LAYERS,
                        N_HEADS,
                        dropout=DROPOUT).to(device)
    optim = torch.optim.SGD(model.parameters(), lr=LR)
    scheduler = torch.optim.lr_scheduler.StepLR(optim, step_size=1, gamma=0.9)

    criterion = nn.CrossEntropyLoss()

    print(
        f'Encoder/Decoder Model Hyperparameters\n---------------------\nModel Hidden Dimension: {D_MODEL}'
        f'\nNum Layers: {N_LAYERS}\nNum Attention Heads: {N_HEADS}\nDropout: {DROPOUT}'
        f'\nLearning Rate: {LR}\nNum Epochs: {N_EPOCHS}\nBatch Size: {B_SIZE}'
        f'\nSource Vocab Size: {SRC_V_SIZE}\nTarget Vocab Size: {TGT_V_SIZE}\n'
    )

    loss_interval = 128
    loss_values = []
    val_loss_values = []
    val_acc_values = []

    model.train()
    for epoch in range(1, N_EPOCHS + 1):
        running_loss = 0.
        loss_values_sum = 0.
        print(f'Epoch {epoch}/{N_EPOCHS}')

        for b_num, batch in enumerate(train_iter):
            torch.cuda.empty_cache()
            start_time = time.time()
            true_batch_num = len(train_iter) * (epoch - 1) + b_num

            src_input, tgt_input, row_src, row_tgt = parse_batch(batch)
            if epoch == 1 and b_num == 0:
                print('src_input shape:', src_input.shape)
                print('tgt_input shape:', tgt_input.shape)
                print('row_src:', row_src.shape)
                print('row_tgt:', row_tgt.shape)

            SRC_SEQ_LEN = row_src.size(-1)
            TGT_SEQ_LEN = row_tgt.size(-1)

            src_mask, src_key_padding_mask, memory_key_padding_mask = create_src_masks(
                row_src, SRC_SEQ_LEN, TEXT, use_srcmask=args.srcmask)
            tgt_mask, tgt_key_padding_mask = create_tgt_masks(
                row_tgt, TGT_SEQ_LEN, LABEL)

            output = model(src_input,
                           tgt_input,
                           src_mask=src_mask,
                           tgt_mask=tgt_mask,
                           src_key_padding_mask=src_key_padding_mask,
                           tgt_key_padding_mask=tgt_key_padding_mask,
                           memory_key_padding_mask=memory_key_padding_mask)

            loss = criterion(output.view(-1, TGT_V_SIZE),
                             row_tgt.contiguous().view(-1))
            optim.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(),
                                     0.5)  # prevent exploding gradient
            optim.step()

            loss_values_sum += loss.item()
            running_loss += loss.item()
            el_time = time.time() - start_time

            if b_num % loss_interval == 0 and b_num > 0:
                loss_values.append(
                    (true_batch_num, loss_values_sum / loss_interval))
                loss_values_sum = 0.

            if b_num % 128 == 0:
                print(f'\tBatch {b_num}/{len(train_iter)} | secs/batch: '
                      f'{round(el_time, 4)} | loss: {loss} | '
                      f'lr: {scheduler.get_last_lr()}')

            if b_num % (len(train_iter) // 5) == 0 and b_num > 0:
                val_loss, val_acc = ed_evaluate(model, val_iter, TEXT, LABEL)
                model.train()

                val_loss_values.append((true_batch_num, val_loss))
                val_acc_values.append((true_batch_num, val_acc))

                if len(val_loss_values) > 1:
                    plt.plot(*zip(*loss_values), label='Train Loss')
                    plt.plot(*zip(*val_loss_values), label='Validation Loss')
                    plt.xlabel('Batch')
                    plt.ylabel('Loss')
                    plt.legend()
                    plt.show()

                if len(val_acc_values) > 1:
                    plt.plot(*zip(*val_acc_values),
                             label='Validation Accuracy')
                    plt.xlabel('Batch')
                    plt.ylabel('Accuracy')
                    plt.ylim(0, 1)
                    plt.legend()
                    plt.show()

        scheduler.step()
        print(f'Epoch {epoch}/{N_EPOCHS} | loss: {running_loss}')
        if epoch != N_EPOCHS:
            save_path = f'{args.savepath}train{epoch}.pth'
            torch.save(model.state_dict(), save_path)
            if epoch > 1:  # save the previous model
                save_path = f'{args.savepath}train{epoch-1}.pth'
                try:
                    files.download(save_path)
                except Exception:
                    print(f'Unable to download {save_path}')

    print(f'Expected output shape {row_tgt.shape}\nTargets:{row_tgt}')
    print(
        f'output raw shape: {output.shape}\nargmax:\n{format_preds(output, TGT_SEQ_LEN)}'
    )

    save_path = f'{args.savepath}goldtrain.pth'
    torch.save(model.state_dict(), save_path)
Example 17
                                       batch_size=BATCH_SIZE,
                                       shuffle=True,
                                       sort=False)
test_iter = Iterator(test_data,
                     batch_size=BATCH_SIZE,
                     shuffle=False,
                     sort=False)

SRC_PAD_IDX = SRC.vocab.stoi['<pad>']
TRG_PAD_IDX = TRG.vocab.stoi['<pad>']

model = Transformer(len(SRC.vocab), len(TRG.vocab), MAX_LEN, MODEL_SIZE,
                    FF_SIZE, KEY_SIZE, VALUE_SIZE, NUM_HEADS, NUM_LAYERS,
                    DROPOUT, SRC_PAD_IDX, TRG_PAD_IDX).to(DEVICE)
criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)
opt = AdamWrapper(model.parameters(), MODEL_SIZE, WARMUP)

if args.train or args.continue_training:
    if args.train:
        best_val_loss = float('inf')
        with open(LOG_PATH, 'w') as f:
            f.write('')
    else:
        model.load_state_dict(torch.load(MODEL_PATH))
        with open(LOG_PATH, 'r') as f:
            val_losses = [float(line.split()[-1]) for line in f]
            best_val_loss = min(val_losses)

    print(f'best_val_loss: {best_val_loss}')

    for epoch in range(NUM_EPOCHS):
Example 18
                        shuffle=True)

    model = Transformer()

    # run on GPU, and on multiple GPUs when available
    if torch.cuda.is_available():
        model.cuda()

    if torch.cuda.device_count() > 1:
        args.n_gpu = torch.cuda.device_count()
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        # this single line enables data parallelism
        model = nn.DataParallel(model)

    criterion = nn.CrossEntropyLoss(ignore_index=0)
    optimizer = optim.SGD(model.parameters(), lr=1e-3, momentum=0.99)

    for epoch in range(30):
        # train for 30 epochs
        for enc_inputs, dec_inputs, dec_outputs in loader:
            '''
            enc_inputs: [batch_size, src_len]
            dec_inputs: [batch_size, tgt_len]
            dec_outputs: [batch_size, tgt_len]
            '''
            if torch.cuda.is_available():
                enc_inputs, dec_inputs, dec_outputs = enc_inputs.cuda(
                ), dec_inputs.cuda(), dec_outputs.cuda()
            # outputs: [batch_size * tgt_len, tgt_vocab_size]
            outputs, enc_self_attns, dec_self_attns, dec_enc_attns = model(
                enc_inputs, dec_inputs)
Example 19
model = Transformer(
    args.embedding_size,
    args.src_vocab_size,
    args.trg_vocab_size,
    src_pad_idx,
    args.num_heads,
    args.num_encoder_layers,
    args.num_decoder_layers,
    args.forward_expansion,
    args.dropout,
    args.max_len,
    device,
).to(device)

optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                       factor=0.1,
                                                       patience=10,
                                                       verbose=True)

pad_idx = english.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

# Trainer module
trainer = Trainer(model=model,
                  device=device,
                  loss_fn=criterion,
                  optimizer=optimizer,
                  scheduler=None)
Example 20
def do_train(args):
    if args.use_cuda:
        trainer_count = fluid.dygraph.parallel.Env().nranks
        place = fluid.CUDAPlace(fluid.dygraph.parallel.Env(
        ).dev_id) if trainer_count > 1 else fluid.CUDAPlace(0)
    else:
        trainer_count = 1
        place = fluid.CPUPlace()

    # define the data generator
    processor = reader.DataProcessor(
        fpattern=args.training_file,
        src_vocab_fpath=args.src_vocab_fpath,
        trg_vocab_fpath=args.trg_vocab_fpath,
        token_delimiter=args.token_delimiter,
        use_token_batch=args.use_token_batch,
        batch_size=args.batch_size,
        device_count=trainer_count,
        pool_size=args.pool_size,
        sort_type=args.sort_type,
        shuffle=args.shuffle,
        shuffle_batch=args.shuffle_batch,
        start_mark=args.special_token[0],
        end_mark=args.special_token[1],
        unk_mark=args.special_token[2],
        max_length=args.max_length,
        n_head=args.n_head)
    batch_generator = processor.data_generator(phase="train")
    if args.validation_file:
        val_processor = reader.DataProcessor(
            fpattern=args.validation_file,
            src_vocab_fpath=args.src_vocab_fpath,
            trg_vocab_fpath=args.trg_vocab_fpath,
            token_delimiter=args.token_delimiter,
            use_token_batch=args.use_token_batch,
            batch_size=args.batch_size,
            device_count=trainer_count,
            pool_size=args.pool_size,
            sort_type=args.sort_type,
            shuffle=False,
            shuffle_batch=False,
            start_mark=args.special_token[0],
            end_mark=args.special_token[1],
            unk_mark=args.special_token[2],
            max_length=args.max_length,
            n_head=args.n_head)
        val_batch_generator = val_processor.data_generator(phase="train")
    if trainer_count > 1:  # for multi-process gpu training
        batch_generator = fluid.contrib.reader.distributed_batch_reader(
            batch_generator)
    args.src_vocab_size, args.trg_vocab_size, args.bos_idx, args.eos_idx, \
        args.unk_idx = processor.get_vocab_summary()

    with fluid.dygraph.guard(place):
        # set seed for CE
        random_seed = eval(str(args.random_seed))
        if random_seed is not None:
            fluid.default_main_program().random_seed = random_seed
            fluid.default_startup_program().random_seed = random_seed

        # define data loader
        train_loader = fluid.io.DataLoader.from_generator(capacity=10)
        train_loader.set_batch_generator(batch_generator, places=place)
        if args.validation_file:
            val_loader = fluid.io.DataLoader.from_generator(capacity=10)
            val_loader.set_batch_generator(val_batch_generator, places=place)

        # define model
        transformer = Transformer(
            args.src_vocab_size, args.trg_vocab_size, args.max_length + 1,
            args.n_layer, args.n_head, args.d_key, args.d_value, args.d_model,
            args.d_inner_hid, args.prepostprocess_dropout,
            args.attention_dropout, args.relu_dropout, args.preprocess_cmd,
            args.postprocess_cmd, args.weight_sharing, args.bos_idx,
            args.eos_idx)

        # define loss
        criterion = CrossEntropyCriterion(args.label_smooth_eps)

        # define optimizer
        optimizer = fluid.optimizer.Adam(
            learning_rate=NoamDecay(args.d_model, args.warmup_steps,
                                    args.learning_rate),
            beta1=args.beta1,
            beta2=args.beta2,
            epsilon=float(args.eps),
            parameter_list=transformer.parameters())

        ## init from some checkpoint, to resume the previous training
        if args.init_from_checkpoint:
            model_dict, opt_dict = fluid.load_dygraph(
                os.path.join(args.init_from_checkpoint, "transformer"))
            transformer.load_dict(model_dict)
            optimizer.set_dict(opt_dict)
        ## init from some pretrain models, to better solve the current task
        if args.init_from_pretrain_model:
            model_dict, _ = fluid.load_dygraph(
                os.path.join(args.init_from_pretrain_model, "transformer"))
            transformer.load_dict(model_dict)

        if trainer_count > 1:
            strategy = fluid.dygraph.parallel.prepare_context()
            transformer = fluid.dygraph.parallel.DataParallel(transformer,
                                                              strategy)

        # the best cross-entropy value with label smoothing
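        # (equivalently, the entropy of the smoothed target distribution, which
        # puts 1 - label_smooth_eps on the gold token and spreads
        # label_smooth_eps over the remaining trg_vocab_size - 1 tokens; the
        # "normalized loss" logged below is the gap to this floor)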
        loss_normalizer = -(
            (1. - args.label_smooth_eps) * np.log(
                (1. - args.label_smooth_eps)) + args.label_smooth_eps *
            np.log(args.label_smooth_eps / (args.trg_vocab_size - 1) + 1e-20))

        ce_time = []
        ce_ppl = []
        step_idx = 0

        # train loop
        for pass_id in range(args.epoch):
            epoch_start = time.time()

            batch_id = 0
            batch_start = time.time()
            interval_word_num = 0.0
            for input_data in train_loader():
                if args.max_iter and step_idx == args.max_iter:  #NOTE: used for benchmark
                    return
                batch_reader_end = time.time()

                (src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos,
                 trg_slf_attn_bias, trg_src_attn_bias, lbl_word,
                 lbl_weight) = input_data

                logits = transformer(src_word, src_pos, src_slf_attn_bias,
                                     trg_word, trg_pos, trg_slf_attn_bias,
                                     trg_src_attn_bias)

                sum_cost, avg_cost, token_num = criterion(logits, lbl_word,
                                                          lbl_weight)

                if trainer_count > 1:
                    avg_cost = transformer.scale_loss(avg_cost)
                    avg_cost.backward()
                    transformer.apply_collective_grads()
                else:
                    avg_cost.backward()

                optimizer.minimize(avg_cost)
                transformer.clear_gradients()

                interval_word_num += np.prod(src_word.shape)
                if step_idx % args.print_step == 0:
                    total_avg_cost = avg_cost.numpy() * trainer_count

                    if step_idx == 0:
                        logger.info(
                            "step_idx: %d, epoch: %d, batch: %d, avg loss: %f, "
                            "normalized loss: %f, ppl: %f" %
                            (step_idx, pass_id, batch_id, total_avg_cost,
                             total_avg_cost - loss_normalizer,
                             np.exp([min(total_avg_cost, 100)])))
                    else:
                        train_avg_batch_cost = args.print_step / (
                            time.time() - batch_start)
                        word_speed = interval_word_num / (
                            time.time() - batch_start)
                        logger.info(
                            "step_idx: %d, epoch: %d, batch: %d, avg loss: %f, "
                            "normalized loss: %f, ppl: %f, avg_speed: %.2f step/s, "
                            "words speed: %0.2f words/s" %
                            (step_idx, pass_id, batch_id, total_avg_cost,
                             total_avg_cost - loss_normalizer,
                             np.exp([min(total_avg_cost, 100)]),
                             train_avg_batch_cost, word_speed))
                    batch_start = time.time()
                    interval_word_num = 0.0

                if step_idx % args.save_step == 0 and step_idx != 0:
                    # validation
                    if args.validation_file:
                        transformer.eval()
                        total_sum_cost = 0
                        total_token_num = 0
                        for input_data in val_loader():
                            (src_word, src_pos, src_slf_attn_bias, trg_word,
                             trg_pos, trg_slf_attn_bias, trg_src_attn_bias,
                             lbl_word, lbl_weight) = input_data
                            logits = transformer(
                                src_word, src_pos, src_slf_attn_bias, trg_word,
                                trg_pos, trg_slf_attn_bias, trg_src_attn_bias)
                            sum_cost, avg_cost, token_num = criterion(
                                logits, lbl_word, lbl_weight)
                            total_sum_cost += sum_cost.numpy()
                            total_token_num += token_num.numpy()
                            total_avg_cost = total_sum_cost / total_token_num
                        logger.info("validation, step_idx: %d, avg loss: %f, "
                                    "normalized loss: %f, ppl: %f" %
                                    (step_idx, total_avg_cost,
                                     total_avg_cost - loss_normalizer,
                                     np.exp([min(total_avg_cost, 100)])))
                        transformer.train()

                    if args.save_model and (
                            trainer_count == 1 or
                            fluid.dygraph.parallel.Env().dev_id == 0):
                        model_dir = os.path.join(args.save_model,
                                                 "step_" + str(step_idx))
                        if not os.path.exists(model_dir):
                            os.makedirs(model_dir)
                        fluid.save_dygraph(
                            transformer.state_dict(),
                            os.path.join(model_dir, "transformer"))
                        fluid.save_dygraph(
                            optimizer.state_dict(),
                            os.path.join(model_dir, "transformer"))

                batch_id += 1
                step_idx += 1

            train_epoch_cost = time.time() - epoch_start
            ce_time.append(train_epoch_cost)
            logger.info("train epoch: %d, epoch_cost: %.5f s" %
                        (pass_id, train_epoch_cost))

        if args.save_model:
            model_dir = os.path.join(args.save_model, "step_final")
            if not os.path.exists(model_dir):
                os.makedirs(model_dir)
            fluid.save_dygraph(transformer.state_dict(),
                               os.path.join(model_dir, "transformer"))
            fluid.save_dygraph(optimizer.state_dict(),
                               os.path.join(model_dir, "transformer"))

        if args.enable_ce:
            _ppl = 0
            _time = 0
            try:
                _time = ce_time[-1]
                _ppl = ce_ppl[-1]
            except:
                print("ce info error")
            print("kpis\ttrain_duration_card%s\t%s" % (trainer_count, _time))
            print("kpis\ttrain_ppl_card%s\t%f" % (trainer_count, _ppl))
Esempio n. 21
0
class Trainer:
    def __init__(self, args, train_loader, test_loader, tokenizer_src, tokenizer_tgt):
        self.args = args
        self.train_loader = train_loader
        self.test_loader = test_loader
        self.src_vocab_size = tokenizer_src.vocab_size
        self.tgt_vocab_size = tokenizer_tgt.vocab_size
        self.pad_id = tokenizer_src.pad_token_id  # pad_token_id in tokenizer_tgt.vocab should be the same as this one.
        self.device = 'cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu'

        self.model = Transformer(src_vocab_size = self.src_vocab_size,
                                 tgt_vocab_size = self.tgt_vocab_size,
                                 seq_len        = args.max_seq_len,
                                 d_model        = args.hidden,
                                 n_layers       = args.n_layers,
                                 n_heads        = args.n_attn_heads,
                                 p_drop         = args.dropout,
                                 d_ff           = args.ffn_hidden,
                                 pad_id         = self.pad_id)
        if args.multi_gpu:
            self.model = nn.DataParallel(self.model)
        self.model.to(self.device)

        self.optimizer = ScheduledOptim(optim.Adam(self.model.parameters(), betas=(0.9, 0.98), eps=1e-9),
                                        init_lr=2.0, d_model=args.hidden)
        self.criterion = nn.CrossEntropyLoss(ignore_index=self.pad_id)

    def train(self, epoch):
        losses = 0
        n_batches, n_samples = len(self.train_loader), len(self.train_loader.dataset)
        
        self.model.train()
        for i, batch in enumerate(self.train_loader):
            encoder_inputs, decoder_inputs, decoder_outputs = map(lambda x: x.to(self.device), batch)
            # |encoder_inputs| : (batch_size, seq_len), |decoder_inputs| : (batch_size, seq_len-1), |decoder_outputs| : (batch_size, seq_len-1)

            outputs, encoder_attns, decoder_attns, enc_dec_attns = self.model(encoder_inputs, decoder_inputs)
            # |outputs| : (batch_size, seq_len-1, tgt_vocab_size)
            # |encoder_attns| : [(batch_size, n_heads, seq_len, seq_len)] * n_layers
            # |decoder_attns| : [(batch_size, n_heads, seq_len-1, seq_len-1)] * n_layers
            # |enc_dec_attns| : [(batch_size, n_heads, seq_len-1, seq_len)] * n_layers
            
            loss = self.criterion(outputs.view(-1, self.tgt_vocab_size), decoder_outputs.view(-1))
            losses += loss.item()
            
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.update_learning_rate()
            self.optimizer.step()

            if i % (n_batches//5) == 0 and i != 0:
                print('Iteration {} ({}/{})\tLoss: {:.4f}\tlr: {:.4f}'.format(i, i, n_batches, losses/i, self.optimizer.get_current_lr))
        
        print('Train Epoch: {}\t>\tLoss: {:.4f}'.format(epoch, losses/n_batches))
            
    def validate(self, epoch):
        losses = 0
        n_batches, n_samples = len(self.test_loader), len(self.test_loader.dataset)
        
        self.model.eval()
        with torch.no_grad():
            for i, batch in enumerate(self.test_loader):
                encoder_inputs, decoder_inputs, decoder_outputs = map(lambda x: x.to(self.device), batch)
                # |encoder_inputs| : (batch_size, seq_len), |decoder_inputs| : (batch_size, seq_len-1), |decoder_outputs| : (batch_size, seq_len-1)

                outputs, encoder_attns, decoder_attns, enc_dec_attns = self.model(encoder_inputs, decoder_inputs)
                # |outputs| : (batch_size, seq_len-1, tgt_vocab_size)
                # |encoder_attns| : [(batch_size, n_heads, seq_len, seq_len)] * n_layers
                # |decoder_attns| : [(batch_size, n_heads, seq_len-1, seq_len-1)] * n_layers
                # |enc_dec_attns| : [(batch_size, n_heads, seq_len-1, seq_len)] * n_layers
                
                loss = self.criterion(outputs.view(-1, self.tgt_vocab_size), decoder_outputs.view(-1))
                losses += loss.item()

        print('Valid Epoch: {}\t>\tLoss: {:.4f}'.format(epoch, losses/n_batches))

    def save(self, epoch, model_prefix='model', root='.model'):
        path = Path(root) / (model_prefix + '.ep%d' % epoch)
        if not path.parent.exists():
            path.parent.mkdir()
        
        torch.save(self.model, path)
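
# A minimal sketch of the ScheduledOptim wrapper the Trainer above relies on:
# Adam plus the warmup/inverse-sqrt (Noam) learning-rate schedule. The
# warmup_steps default and the property-style get_current_lr are assumptions,
# not the original implementation.
class ScheduledOptim:
    def __init__(self, optimizer, init_lr, d_model, warmup_steps=4000):
        self.optimizer = optimizer
        self.init_lr = init_lr
        self.d_model = d_model
        self.warmup_steps = warmup_steps
        self.n_steps = 0

    def zero_grad(self):
        self.optimizer.zero_grad()

    def step(self):
        self.optimizer.step()

    def update_learning_rate(self):
        # called once per batch, before step()
        self.n_steps += 1
        lr = (self.init_lr * self.d_model ** -0.5 *
              min(self.n_steps ** -0.5, self.n_steps * self.warmup_steps ** -1.5))
        for group in self.optimizer.param_groups:
            group['lr'] = lr

    @property
    def get_current_lr(self):
        return self.optimizer.param_groups[0]['lr']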
Esempio n. 22
0
def main(tokenizer, src_tok_file, tgt_tok_file, train_file, val_file,
         test_file, num_epochs, batch_size, d_model, nhead, num_encoder_layers,
         num_decoder_layers, dim_feedforward, dropout, learning_rate,
         data_path, checkpoint_file, do_train):
    logging.info('Using tokenizer: {}'.format(tokenizer))

    src_tokenizer = TokenizerWrapper(tokenizer, BLANK_WORD, SEP_TOKEN,
                                     CLS_TOKEN, PAD_TOKEN, MASK_TOKEN)
    src_tokenizer.train(src_tok_file, 20000, SPECIAL_TOKENS)

    tgt_tokenizer = TokenizerWrapper(tokenizer, BLANK_WORD, SEP_TOKEN,
                                     CLS_TOKEN, PAD_TOKEN, MASK_TOKEN)
    tgt_tokenizer.train(tgt_tok_file, 20000, SPECIAL_TOKENS)

    SRC = ttdata.Field(tokenize=src_tokenizer.tokenize, pad_token=BLANK_WORD)
    TGT = ttdata.Field(tokenize=tgt_tokenizer.tokenize,
                       init_token=BOS_WORD,
                       eos_token=EOS_WORD,
                       pad_token=BLANK_WORD)

    logging.info('Loading training data...')
    train_ds, val_ds, test_ds = ttdata.TabularDataset.splits(
        path=data_path,
        format='tsv',
        train=train_file,
        validation=val_file,
        test=test_file,
        fields=[('src', SRC), ('tgt', TGT)])

    test_src_sentence = val_ds[0].src
    test_tgt_sentence = val_ds[0].tgt

    MIN_FREQ = 2
    SRC.build_vocab(train_ds.src, min_freq=MIN_FREQ)
    TGT.build_vocab(train_ds.tgt, min_freq=MIN_FREQ)

    logging.info(f'''SRC vocab size: {len(SRC.vocab)}''')
    logging.info(f'''TGT vocab size: {len(TGT.vocab)}''')

    train_iter = ttdata.BucketIterator(train_ds,
                                       batch_size=batch_size,
                                       repeat=False,
                                       sort_key=lambda x: len(x.src))
    val_iter = ttdata.BucketIterator(val_ds,
                                     batch_size=1,
                                     repeat=False,
                                     sort_key=lambda x: len(x.src))
    test_iter = ttdata.BucketIterator(test_ds,
                                      batch_size=1,
                                      repeat=False,
                                      sort_key=lambda x: len(x.src))

    source_vocab_length = len(SRC.vocab)
    target_vocab_length = len(TGT.vocab)

    model = Transformer(d_model=d_model,
                        nhead=nhead,
                        num_encoder_layers=num_encoder_layers,
                        num_decoder_layers=num_decoder_layers,
                        dim_feedforward=dim_feedforward,
                        dropout=dropout,
                        source_vocab_length=source_vocab_length,
                        target_vocab_length=target_vocab_length)
    optim = torch.optim.Adam(model.parameters(),
                             lr=learning_rate,
                             betas=(0.9, 0.98),
                             eps=1e-9)
    model = model.cuda()

    if do_train:
        train_losses, valid_losses = train(train_iter, val_iter, model, optim,
                                           num_epochs, batch_size,
                                           test_src_sentence,
                                           test_tgt_sentence, SRC, TGT,
                                           src_tokenizer, tgt_tokenizer,
                                           checkpoint_file)
    else:
        logging.info('Skipped training.')

    # Load best model and score test set
    logging.info('Loading best model.')
    model.load_state_dict(torch.load(checkpoint_file))
    model.eval()
    logging.info('Scoring the test set...')
    score_start = time.time()
    test_bleu, test_chrf = score(test_iter, model, tgt_tokenizer, SRC, TGT)
    score_time = time.time() - score_start
    logging.info(f'''Scoring complete in {score_time/60:.3f} minutes.''')
    logging.info(f'''BLEU : {test_bleu}''')
    logging.info(f'''CHRF : {test_chrf}''')
Esempio n. 23
0
                                bidir=True,
                                char_vocab_size=len(c2idx),
                                char_embed_dim=50,
                                dropout1=0.5,
                                dropout2=0,
                                dropout3=0.1)

Transformer_model = Transformer(emb=300 + 1024 + 250 + 30,
                                k=300,
                                heads=1,
                                depth=1,
                                num_classes=2,
                                char_vocab_size=len(c2idx),
                                char_embed_dim=50)

transformer_parameters = sum(p.numel() for p in Transformer_model.parameters()
                             if p.requires_grad)
rnn_parameters = sum(p.numel() for p in RNNseq_model.parameters()
                     if p.requires_grad)
total_parameters = transformer_parameters + rnn_parameters
print(f'Number of parameters: {total_parameters}')

# Move the model to the GPU if available
if using_GPU:
    RNNseq_model = RNNseq_model.cuda()
    Transformer_model = Transformer_model.cuda()

# Set up criterion for calculating loss
weight_tensor = torch.Tensor([1.0, 2.0]).cuda()
loss_criterion = nn.NLLLoss(weight=weight_tensor)
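
# nn.NLLLoss expects log-probabilities, so the models scored with loss_criterion
# above are assumed to end in log_softmax. A tiny CPU-only sketch of the weighted
# loss (the tensors and cpu_criterion here are made up for illustration):
import torch.nn.functional as F

dummy_logits = torch.randn(4, 2)                    # [batch, num_classes]
dummy_labels = torch.randint(0, 2, (4,))
cpu_criterion = nn.NLLLoss(weight=torch.Tensor([1.0, 2.0]))  # class 1 weighted 2x
dummy_loss = cpu_criterion(F.log_softmax(dummy_logits, dim=-1), dummy_labels)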
Esempio n. 24
0
model = Transformer(
    embedding_size,
    src_vocab_size,
    trg_vocab_size,
    src_pad_idx,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    forward_expansion,
    dropout,
    max_len,
    device,
).to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                       factor=0.1,
                                                       patience=10,
                                                       verbose=True)

pad_idx = input_text.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

if load_model:
    load_checkpoint(torch.load("my_checkpoint.pth.tar"), model, optimizer)

sentence = "Emma Woodhouse, handsome, clever, and rich, with a comfortable home"  # Output should be: and happy disposition, seemed to unite some of the best blessings

if graph:
Esempio n. 25
0
def main() -> None:
    """Entry point.
    """
    # Load data
    vocab = tx.data.Vocab(config_data.vocab_file)
    data_hparams = {
        # "batch_size" is ignored for train since we use dynamic batching.
        "batch_size": config_data.test_batch_size,
        "pad_id": vocab.pad_token_id,
        "bos_id": vocab.bos_token_id,
        "eos_id": vocab.eos_token_id,
    }
    datasets = {
        split: data_utils.Seq2SeqData(
            os.path.join(config_data.input_dir,
                         f"{config_data.filename_prefix}{split}.npy"),
            # Only shuffle during training.
            hparams={
                **data_hparams, "shuffle": split == "train"
            },
        )
        for split in ["train", "valid", "test"]
    }
    print(f"Training data size: {len(datasets['train'])}")
    batching_strategy = data_utils.CustomBatchingStrategy(
        config_data.max_batch_tokens)

    # Create model and optimizer
    model = Transformer(config_model, config_data, vocab)
    model = ModelWrapper(model, config_model.beam_width)

    lr_config = config_model.lr_config
    if lr_config["learning_rate_schedule"] == "static":
        init_lr = lr_config["static_lr"]
        scheduler_lambda = lambda x: 1.0
    else:
        init_lr = lr_config["lr_constant"]
        scheduler_lambda = functools.partial(
            utils.get_lr_multiplier, warmup_steps=lr_config["warmup_steps"])
    optim = torch.optim.Adam(model.parameters(),
                             lr=init_lr,
                             betas=(0.9, 0.997),
                             eps=1e-9)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optim, scheduler_lambda)

    output_dir = Path(args.output_dir)
    encoding = getattr(config_data, 'encoding', None)
    executor = Executor(
        model=model,
        train_data=datasets["train"],
        valid_data=datasets["valid"],
        test_data=datasets["test"],
        batching_strategy=batching_strategy,
        optimizer=optim,
        lr_scheduler=scheduler,
        log_destination=[sys.stdout, output_dir / "log.txt"],
        log_every=cond.iteration(config_data.display_steps),
        validate_every=[cond.iteration(config_data.eval_steps),
                        cond.epoch(1)],
        stop_training_on=cond.epoch(config_data.max_train_epoch),
        train_metrics=[
            ("loss", metric.RunningAverage(1)),  # only show current loss
            ("lr", metric.LR(optim))
        ],
        log_format="{time} : Epoch {epoch:2d} @ {iteration:6d}it "
        "({progress}%, {speed}), lr = {lr:.3e}, loss = {loss:.3f}",
        valid_metrics=BLEUWrapper(vocab, encoding=encoding),
        test_metrics=[
            FileBLEU(vocab, output_dir / "test.output", encoding=encoding),
            ("unofficial_bleu",
             BLEUWrapper(vocab, decode=True, encoding=encoding))
        ],
        valid_log_format="{time} : Epoch {epoch}, "
        "{split} BLEU = {BLEU:.3f}",
        test_progress_log_format=(
            "{time} : Evaluating on test ({progress}%, {speed}), "
            "unofficial BLEU = {unofficial_bleu:.2f}"),
        validate_mode='predict',
        checkpoint_dir=args.output_dir,
        save_every=cond.validation(better=True),
        max_to_keep=1,
        show_live_progress=True,
    )
    if args.run_mode == "train_and_evaluate":
        executor.write_log("Begin running with train_and_evaluate mode")
        if args.load_checkpoint:
            load_path = executor.load(allow_failure=True)
            if load_path is not None:
                executor.test({"valid": datasets["valid"]})

        executor.train()

    elif args.run_mode in ["evaluate", "test"]:
        executor.write_log(f"Begin running with {args.run_mode} mode")
        executor.load(load_training_state=False)
        split = "test" if args.run_mode == "test" else "valid"
        executor.test({split: datasets[split]})
    elif args.run_mode == 'infer':
        print("it's being developed.")

    else:
        raise ValueError(f"Unknown mode: {args.run_mode}")
Esempio n. 26
0
def transformer(dataloader, EPOCH, frequency, path_to_save_model,
                path_to_save_loss, path_to_save_predictions, device):

    device = torch.device(device)

    model = Transformer().double().to(device)
    optimizer = torch.optim.Adam(model.parameters())
    # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=200)
    criterion = torch.nn.MSELoss()
    best_model = ""
    min_train_loss = float('inf')

    for epoch in range(EPOCH + 1):

        train_loss = 0
        val_loss = 0

        ## TRAIN -- TEACHER FORCING
        model.train()
        for index_in, index_tar, _input, target, sensor_number in dataloader:  # for each data set

            optimizer.zero_grad()

            # Shape of _input : [batch, input_length, feature]
            # Desired input for model: [input_length, batch, feature]

            src = _input.permute(
                1, 0,
                2).double().to(device)[:-1, :, :]  # torch.Size([24, 1, 7])
            target = _input.permute(
                1, 0, 2).double().to(device)[1:, :, :]  # src shifted by 1.
            prediction = model(src, device)  # torch.Size([24, 1, 7])
            loss = criterion(prediction, target[:, :, 0].unsqueeze(-1))
            loss.backward()
            optimizer.step()
            # scheduler.step(loss.detach().item())
            train_loss += loss.detach().item()

        if train_loss < min_train_loss:
            torch.save(model.state_dict(),
                       path_to_save_model + f"best_train_{epoch}.pth")
            torch.save(optimizer.state_dict(),
                       path_to_save_model + f"optimizer_{epoch}.pth")
            min_train_loss = train_loss
            best_model = f"best_train_{epoch}.pth"

        if epoch % 100 == 0:  # Plot 1-Step Predictions

            logger.info(f"Epoch: {epoch}, Training loss: {train_loss}")
            scaler = load('scalar_item.joblib')
            src_humidity = scaler.inverse_transform(
                src[:, :, 0].cpu())  #torch.Size([35, 1, 7])
            target_humidity = scaler.inverse_transform(
                target[:, :, 0].cpu())  #torch.Size([35, 1, 7])
            prediction_humidity = scaler.inverse_transform(
                prediction[:, :,
                           0].detach().cpu().numpy())  #torch.Size([35, 1, 7])
            plot_training(epoch, path_to_save_predictions, src_humidity,
                          prediction_humidity, sensor_number, index_in,
                          index_tar)

        train_loss /= len(dataloader)
        log_loss(train_loss, path_to_save_loss, train=True)

    plot_loss(path_to_save_loss, train=True)
    return best_model
Esempio n. 27
0
if args.load_from is not None:
    with torch.cuda.device(args.gpu):
        model.load_state_dict(
            torch.load('./models/' + args.load_from + '.pt',
                       map_location=lambda storage, loc: storage.cuda())
        )  # load the pretrained models.

# if using a teacher
teacher_model = None
if args.teacher is not None:
    teacher_model = Transformer(SRC, TRG, teacher_args)
    with torch.cuda.device(args.gpu):
        teacher_model.load_state_dict(
            torch.load('./models/' + args.teacher + '.pt',
                       map_location=lambda storage, loc: storage.cuda()))
    for params in teacher_model.parameters():
        params.requires_grad = False

    if (args.share_encoder) and (args.load_from is None):
        model.encoder = copy.deepcopy(teacher_model.encoder)
        for params in model.encoder.parameters():
            params.requires_grad = True

# use cuda
if args.gpu > -1:
    model.cuda(args.gpu)
    if align_table is not None:
        align_table = torch.LongTensor(align_table).cuda(args.gpu)
        align_table = Variable(align_table)
        model.alignment = align_table
Esempio n. 28
0
def main(conf):
    conf.distributed = dist.get_world_size() > 1

    device = "cuda"

    if dist.is_primary():
        from pprint import pprint

        pprint(conf.dict())

    if dist.is_primary() and conf.evaluate.wandb:
        wandb = load_wandb()
        wandb.init(project="asr")

    else:
        wandb = None

    with open("trainval_indices.pkl", "rb") as f:
        split_indices = pickle.load(f)

    train_set = ASRDataset(
        conf.dataset.path,
        indices=split_indices["train"],
        alignment=conf.dataset.alignment,
    )
    valid_set = ASRDataset(conf.dataset.path, indices=split_indices["val"])

    train_sampler = dist.data_sampler(train_set,
                                      shuffle=True,
                                      distributed=conf.distributed)
    valid_sampler = dist.data_sampler(valid_set,
                                      shuffle=False,
                                      distributed=conf.distributed)

    if conf.training.batch_sampler is not None:
        train_lens = []

        for i in split_indices["train"]:
            train_lens.append(train_set.mel_lengths[i])

        opts = conf.training.batch_sampler

        bins = ((opts.base**np.linspace(opts.start, 1, 2 * opts.k + 1)) *
                1000).tolist()
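        # Assumed intent: 2*k + 1 geometrically spaced length-bin edges (from
        # opts.base**opts.start to opts.base, times 1000) so that the grouped
        # sampler below batches utterances of similar mel length together.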
        groups, bins, n_samples = create_groups(train_lens, bins)
        batch_sampler = GroupedBatchSampler(
            train_sampler, groups, conf.training.dataloader.batch_size)

        conf.training.dataloader.batch_size = 1
        train_loader = conf.training.dataloader.make(
            train_set,
            batch_sampler=batch_sampler,
            collate_fn=collate_data_imputer)

    else:
        train_loader = conf.training.dataloader.make(
            train_set, collate_fn=collate_data_imputer)

    valid_loader = conf.training.dataloader.make(valid_set,
                                                 sampler=valid_sampler,
                                                 collate_fn=collate_data)

    model = Transformer(
        conf.dataset.n_vocab,
        conf.model.delta,
        conf.dataset.n_mels,
        conf.model.feature_channel,
        conf.model.dim,
        conf.model.dim_ff,
        conf.model.n_layer,
        conf.model.n_head,
        conf.model.dropout,
    ).to(device)

    if conf.distributed:
        model = nn.parallel.DistributedDataParallel(
            model,
            device_ids=[dist.get_local_rank()],
            output_device=dist.get_local_rank(),
        )

    optimizer = conf.training.optimizer.make(model.parameters())
    scheduler = conf.training.scheduler.make(optimizer)

    if conf.ckpt is not None:
        ckpt = torch.load(conf.ckpt, map_location=lambda storage, loc: storage)

        model_p = model

        if conf.distributed:
            model_p = model.module

        model_p.load_state_dict(ckpt["model"])
        # scheduler.load_state_dict(ckpt["scheduler"])

        model_p.copy_embed(1)

    model_training = ModelTraining(
        model,
        optimizer,
        scheduler,
        train_set,
        train_loader,
        valid_loader,
        device,
        wandb,
    )

    train(conf, model_training)
Esempio n. 29
0
        train_dataset.cn_vocab_size,
        config.max_output_len,
        num_layers=config.n_layers,
        model_dim=config.model_dim,
        num_heads=config.num_heads,
        ffn_dim=config.ffn_dim,
        dropout=config.dropout,
    ).to(config.device)
    print("使用模型:")
    print(transformer_model)
    total_steps = 0
    if config.load_model:
        transformer_model.load_state_dict(torch.load(config.load_model_path))
        total_steps = int(re.split('[_/.]', config.model_file)[1])

    optimizer = torch.optim.Adam(transformer_model.parameters(),
                                 lr=config.learning_rate)
    loss_function = CrossEntropyLoss(ignore_index=0)

    train_losses, val_losses, bleu_scores = [], [], []

    while total_steps < config.num_steps:
        # train the model
        transformer_model.train()
        transformer_model.zero_grad()
        losses = []
        loss_sum = 0.0
        for step in range(config.summary_steps):
            source, target = next(train_iter)  # source, target: [batch_size, max_output_len]
            source, target = source.to(config.device), target.to(config.device)
Esempio n. 30
0
    lang_vocab_file = join(args.data_dir, 'lang.vocab')
    lang_vocab, _ = ut.init_vocab(lang_vocab_file)
    args.lang_vocab_size = len(lang_vocab)

    # since args is passed to many modules, keep the logger on it instead of re-initializing it every time
    log_file = join(dump_dir, 'DEBUG.log')
    logger = args.logger = ut.get_logger(log_file)

    # log args for future reference
    logger.info(args)

    model = Transformer(args)
    # TODO: nicer formatting?
    logger.info(model)
    param_count = sum([np.prod(p.size()) for p in model.parameters()])
    logger.info('Model has {:,} parameters'.format(param_count))

    # controller
    data_manager = DataManager(args)
    controller = Controller(args, model, data_manager)
    if args.mode == 'train':
        controller.train()
    elif args.mode == 'translate':
        controller.model.load_state_dict(torch.load(args.model_file))
        files_langs = args.files_langs
        for fl in files_langs:
            input_file, src_lang, tgt_lang = fl.split(',')
            controller.translate(input_file, src_lang, tgt_lang)
    else:
        raise ValueError('Unknown mode. Only train/translate.')