Example #1

# Imports assumed by both examples; the snippets were extracted without their
# module header, so this preamble is reconstructed from usage.
import os
from typing import Union

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils import clip_grad_norm_
from torch.utils.data import DataLoader
from tqdm import tqdm

# Project-specific helpers referenced below but not defined in these snippets:
# ROOT_DIR, Vocabulary, SequenceDataset, collate_fn (sketched after
# Example #1), S2S, load_checkpoint (sketched after Example #2)

def test(args, config, resume=False):
    device = torch.device(
        "cuda:{}".format(args.cuda) if args.cuda is not None else 'cpu')
    # Language codes are taken from the file extensions (e.g. "corpus.en")
    source_language = args.source_file.split(".")[-1]
    target_language = args.target_file.split(".")[-1]

    src_path = os.path.join(ROOT_DIR, args.source_file)
    trg_path = os.path.join(ROOT_DIR, args.target_file)

    with open(src_path, "rt", encoding="utf8") as src, \
            open(trg_path, "rt", encoding="utf8") as trg:
        src_lines = [line.strip().split(" ") for line in src]
        trg_lines = [line.strip().split(" ") for line in trg]

    tqdm.write("Total sentences: {:,}".format(len(src_lines)))

    # Load pre-built vocabularies when both paths are given
    if args.source_vocab and args.target_vocab:
        src_vocab = Vocabulary.load(args.source_vocab)
        trg_vocab = Vocabulary.load(args.target_vocab)
    # Otherwise, build the vocabularies from the input files
    else:
        src_vocab = Vocabulary.build_vocabulary(corpus=src_lines,
                                                max_vocab_size=args.vocab_size,
                                                lang=source_language)
        trg_vocab = Vocabulary.build_vocabulary(corpus=trg_lines,
                                                max_vocab_size=args.vocab_size,
                                                lang=target_language)

        # If a log directory is given, save the newly built vocabularies
        if args.log_dir:
            src_vocab.save(
                os.path.join(ROOT_DIR, args.log_dir,
                             "{}.{}".format(len(src_vocab), source_language)))
            trg_vocab.save(
                os.path.join(ROOT_DIR, args.log_dir,
                             "{}.{}".format(len(trg_vocab), target_language)))

    dataset = SequenceDataset(src_lines, src_vocab, trg_lines, trg_vocab)
    dataloader = DataLoader(dataset,
                            batch_size=1,
                            shuffle=False,
                            num_workers=args.num_workers,
                            collate_fn=collate_fn,
                            drop_last=True)
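    # batch_size=1 with shuffle=False: sentences are decoded one at a time,
    # in corpus order, so the printed examples line up with the input files.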

    model = S2S(src_vocab_size=len(src_vocab),
                trg_vocab_size=len(trg_vocab),
                input_size=config["MODEL"]['embedding_size'],
                num_layers=config["MODEL"]['num_layers'],
                hidden_size=config["MODEL"]['hidden_size']).to(device=device)

    # NOTE: `resume` is accepted but never used here, so the model runs with
    # randomly initialized weights unless a checkpoint is restored elsewhere.
    model.eval()

    with torch.no_grad(), \
            tqdm(total=len(src_lines), desc="Sentences") as pbar:
        for step, (src_vec, src_len, trg_vec,
                   trg_len) in enumerate(dataloader):
            # Zero-initialized (h, c) LSTM states, one pair per batch
            states = [
                torch.zeros(config["MODEL"]['num_layers'], len(src_vec),
                            config["MODEL"]['hidden_size']).to(device=device)
                for _ in range(2)
            ]
            src_vec = torch.tensor(src_vec, dtype=torch.long).to(device=device)

            trg_input = [[trg_vocab['<SOS>']] + vec for vec in trg_vec]
            trg_vec = torch.tensor(trg_vec, dtype=torch.long).to(device=device)
            trg_input = torch.tensor(trg_input,
                                     dtype=torch.long).to(device=device)

            pbar.update(src_vec.shape[0])

            flatten_output = model(src_vec, src_len, trg_input, trg_len,
                                   states)
            # Greedy decode: argmax over raw logits (softmax is monotonic,
            # so applying it first would not change the argmax)
            output = flatten_output.argmax(dim=1).reshape(trg_vec.shape[0], -1)

            for s, t, o in zip(src_vec.tolist(), trg_vec.tolist(),
                               output.tolist()):
                tqdm.write("Source: {}".format(" ".join(
                    src_vocab.to_string(s))))
                tqdm.write("Target :{}".format(" ".join(
                    trg_vocab.to_string(t))))
                tqdm.write("Output: {}".format(" ".join(
                    trg_vocab.to_string(o))))
                tqdm.write("\n")
                break  # print only the first sentence of the batch
            break  # and only the first batch; remove both breaks to decode all
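
Both examples hand a collate_fn to DataLoader. From the way its outputs are
used above (plain Python lists that are turned into tensors, with <SOS>
prepended to the target lists), it must pad each batch and return the true
lengths. A minimal sketch under those assumptions; the real project helper
may differ, and the pad index is assumed to equal
Vocabulary.reversed_basic_tokens['<PAD>']:

def collate_fn(batch, pad_idx=0):
    # batch: list of (src_indices, trg_indices) pairs from SequenceDataset.
    # pad_idx is assumed to equal Vocabulary.reversed_basic_tokens['<PAD>'].
    src_batch, trg_batch = zip(*batch)
    src_len = [len(s) for s in src_batch]
    trg_len = [len(t) for t in trg_batch]
    # Pad every sequence up to the longest one in the batch
    max_src, max_trg = max(src_len), max(trg_len)
    src_vec = [s + [pad_idx] * (max_src - len(s)) for s in src_batch]
    trg_vec = [t + [pad_idx] * (max_trg - len(t)) for t in trg_batch]
    return src_vec, src_len, trg_vec, trg_len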
Example #2
def train(args, config, resume: Union[bool, str] = False):
    device = torch.device(
        "cuda:{}".format(args.cuda) if args.cuda is not None else 'cpu')
    # Language codes are taken from the file extensions (e.g. "corpus.en")
    source_language = args.source_file.split(".")[-1]
    target_language = args.target_file.split(".")[-1]

    src_path = os.path.join(ROOT_DIR, args.source_file)
    trg_path = os.path.join(ROOT_DIR, args.target_file)

    with open(src_path, "rt", encoding="utf8") as src, \
            open(trg_path, "rt", encoding="utf8") as trg:
        src_lines = [line.strip().split(" ") for line in src]
        trg_lines = [line.strip().split(" ") for line in trg]

    tqdm.write("Total sentences: {:,}".format(len(src_lines)))

    # Load pre-built vocabularies when both paths are given
    if args.source_vocab and args.target_vocab:
        src_vocab = Vocabulary.load(args.source_vocab)
        trg_vocab = Vocabulary.load(args.target_vocab)
    # Otherwise, build the vocabularies from the input files
    else:
        src_vocab = Vocabulary.build_vocabulary(corpus=src_lines,
                                                max_vocab_size=args.vocab_size,
                                                lang=source_language)
        trg_vocab = Vocabulary.build_vocabulary(corpus=trg_lines,
                                                max_vocab_size=args.vocab_size,
                                                lang=target_language)

        # If a log directory is given, save the newly built vocabularies
        if args.log_dir:
            src_vocab.save(
                os.path.join(ROOT_DIR, args.log_dir,
                             "{}.{}".format(len(src_vocab), source_language)))
            trg_vocab.save(
                os.path.join(ROOT_DIR, args.log_dir,
                             "{}.{}".format(len(trg_vocab), target_language)))

    dataset = SequenceDataset(src_lines, src_vocab, trg_lines, trg_vocab)
    dataloader = DataLoader(dataset,
                            batch_size=args.batch_size,
                            shuffle=True,
                            num_workers=args.num_workers,
                            collate_fn=collate_fn,
                            drop_last=True)

    model = S2S(src_vocab_size=len(src_vocab),
                trg_vocab_size=len(trg_vocab),
                input_size=config["MODEL"]['embedding_size'],
                num_layers=config["MODEL"]['num_layers'],
                hidden_size=config["MODEL"]['hidden_size']).to(device=device)

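    # Sum token-level losses, skipping <PAD> positions; the sum is averaged
    # per sentence after the forward pass (loss /= batch size below).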
    criterion = nn.CrossEntropyLoss(
        ignore_index=Vocabulary.reversed_basic_tokens['<PAD>'],
        reduction='sum')
    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)

    # Resume from a checkpoint when a path is given
    start_epoch = 0
    if resume:
        start_epoch, model, optimizer = load_checkpoint(
            os.path.join(ROOT_DIR, resume), model, optimizer)

    epochs = range(start_epoch, args.epoch)
    # Pre-allocate the (h, c) LSTM states once, shaped
    # (num_layers, batch_size, hidden_size), and reuse them across steps
    states = [
        torch.zeros(config["MODEL"]['num_layers'], args.batch_size,
                    config["MODEL"]['hidden_size']).to(device=device)
        for _ in range(2)
    ]
    for epoch in tqdm(epochs, desc='EPOCH', leave=True):
        with tqdm(total=len(src_lines), desc="Sents", leave=False) as pbar:
            for step, (src_vec, src_len, trg_vec,
                       trg_len) in enumerate(dataloader):
                # Zero the pre-allocated states in place rather than
                # re-allocating them (noted by the author as a CUDA speed-up)
                for state in states:
                    state.zero_()

                src_vec = torch.tensor(src_vec,
                                       dtype=torch.long).to(device=device)

                trg_input = [[trg_vocab['<SOS>']] + vec for vec in trg_vec]
                # Noted by the author as a further small CUDA speed-up
                trg_vec = torch.tensor(trg_vec,
                                       dtype=torch.long).to(device=device)
                trg_input = torch.tensor(trg_input,
                                         dtype=torch.long).to(device=device)

                pbar.update(args.batch_size)
                model.zero_grad()

                flatten_output = model(src_vec, src_len, trg_input, trg_len,
                                       states)
                flatten_trg = trg_vec.reshape(-1)

                loss = criterion(flatten_output, flatten_trg)
                loss /= src_vec.shape[0]  # average the summed loss per sentence
                loss.backward()
                clip_grad_norm_(model.parameters(), 5.0)
                optimizer.step()

                if step % 100 == 0:
                    tqdm.write("Step: {:,} Loss: {:,}".format(
                        step, float(loss)))

        # Exponential decay: lr = base_lr * 0.5**(epoch - 1)
        if epoch >= 1:
            lr = args.learning_rate * (0.5**(epoch - 1))
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

        if args.log_dir is not None:
            path = os.path.join(ROOT_DIR, args.log_dir)
            os.makedirs(path, exist_ok=True)

            state = {
                'epoch': epoch,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict()
            }
            torch.save(state, os.path.join(path, "{}.ckpt".format(epoch)))
            tqdm.write("[+] {}.ckpt saved".format(epoch))