Example 1
    def __init__(self, args):

        logger.info(args.dump())

        logger.info('initializing')
        self.args = args
        # args.gpu is a CUDA device index, so test against None (index 0 is falsy)
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            self.use_cuda = torch.cuda.is_available()
        else:
            self.use_cuda = False

        src, tgt = utils.get_corpus('./small_parallel_enja/train.src',
                                    './small_parallel_enja/train.tgt')

        # build or load vocab
        # load an existing vocab if a directory is given, else build and save one
        if getattr(args, 'load_vocab_dir', None):
            logger.info('loading vocab from {}'.format(args.load_vocab_dir))
            self.load_vocab()
        else:
            logger.info('building vocab')
            self.build_vocab(src, tgt)
            logger.info('saving vocab to {}'.format(args.output_dir))
            self.save_vocab()

        logger.info('building dataset')
        self.train_data = utils.build_dataset(src, tgt, self.s_w2i, self.t_w2i)
        src, tgt = utils.get_corpus('./small_parallel_enja/valid.src',
                                    './small_parallel_enja/valid.tgt')
        self.test_data = utils.build_dataset(src, tgt, self.s_w2i, self.t_w2i)

        logger.info('preparing encoder and decoder')
        encoder = models.Encoder(len(self.s_w2i), args.embedding_size,
                                 args.hidden_size, args.n_layers, args.bidirec)
        decoder = models.Decoder(len(self.t_w2i), args.embedding_size,
                                 args.hidden_size * 2, args.n_layers)
        logger.info('initializing weight')
        encoder.init_weight()
        decoder.init_weight()

        if self.use_cuda:
            logger.info('use cuda')
            self.encoder = encoder.cuda()
            self.decoder = decoder.cuda()
        else:
            logger.info('no cuda')
            self.encoder = encoder
            self.decoder = decoder

        logger.info('set loss function and optimizers')
        self.loss_function = nn.CrossEntropyLoss(ignore_index=0)
        self.enc_optim = optim.Adam(self.encoder.parameters(), lr=args.lr)
        self.dec_optim = optim.Adam(self.decoder.parameters(), lr=args.lr)

        self.bleus_es = [-10000]
        self.patient_es = 0

        del src, tgt
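
A minimal way to drive this constructor, assuming the enclosing class is
named Trainer and that args behaves like an argparse.Namespace with a
dump() method (as the logger.info(args.dump()) call suggests). Every name
and value below is illustrative, not taken from the original project.

from argparse import Namespace

class TrainerArgs(Namespace):
    def dump(self):
        # shim so that logger.info(args.dump()) works on a plain Namespace
        return str(vars(self))

args = TrainerArgs(
    gpu=None,              # CUDA device index, or None to stay on CPU
    embedding_size=256,
    hidden_size=512,
    n_layers=1,
    bidirec=True,          # the decoder is sized for a bidirectional encoder
    lr=1e-3,
    output_dir='./output',
    load_vocab_dir=None,   # empty: build and save a fresh vocab
)
trainer = Trainer(args)    # 'Trainer' is a hypothetical class name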
Example 2
    def __init__(self, args):
        self.args = args
        train_dataloader, test_dataloader = get_dataloaders(
            args.train_src,
            args.train_tgt,
            args.valid_src,
            args.valid_tgt,
            args.batch_size,
            args.src_vocab_size,
            args.tgt_vocab_size)
        self.train_dataloader = train_dataloader
        self.test_dataloader = test_dataloader
        self.sw2i = train_dataloader.dataset.sw2i
        self.si2w = train_dataloader.dataset.si2w
        self.tw2i = train_dataloader.dataset.tw2i
        self.ti2w = train_dataloader.dataset.ti2w

        encoder = models.Encoder(len(self.sw2i),
                                 args.src_embedding_size,
                                 args.encoder_hidden_n,
                                 n_layers=args.encoder_num_layers,
                                 bidirec=args.encoder_bidirectional,
                                 use_cuda=args.use_cuda)
        if args.decoder_bidirectional:
            decoder_hidden_size = args.decoder_hidden_n * 2
        else:
            decoder_hidden_size = args.decoder_hidden_n
        decoder = models.Decoder(len(self.tw2i),
                                 args.tgt_embedding_size,
                                 decoder_hidden_size,
                                 n_layers=args.decoder_num_layers,
                                 use_cuda=args.use_cuda)
        src_embedder = models.Embedder(len(self.sw2i),
                                       args.src_embedding_size,
                                       args.use_cuda)
        tgt_embedder = models.Embedder(len(self.tw2i),
                                       args.tgt_embedding_size,
                                       args.use_cuda)
        encoder.init_weight()
        decoder.init_weight()
        if args.use_cuda:
            encoder = encoder.cuda()
            decoder = decoder.cuda()
            src_embedder = src_embedder.cuda()
            tgt_embedder = tgt_embedder.cuda()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embedder = src_embedder
        self.tgt_embedder = tgt_embedder

        self.loss_func = nn.CrossEntropyLoss(ignore_index=0)
        self.enc_optim = optim.Adam(encoder.parameters(), lr=args.lr)
        self.dec_optim = optim.Adam(decoder.parameters(), lr=args.lr)
        self.src_embedder_optim = optim.Adam(self.src_embedder.parameters(),
                                             lr=args.lr)
        self.tgt_embedder_optim = optim.Adam(self.tgt_embedder.parameters(),
                                             lr=args.lr)
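
The four Adam optimizers above are all stepped against a single loss. Here
is a self-contained toy version of that pattern, with stand-in modules in
place of models.Encoder, models.Decoder and models.Embedder:

import torch
import torch.nn as nn
import torch.optim as optim

emb = nn.Embedding(100, 16, padding_idx=0)   # stand-in for an Embedder
rnn = nn.GRU(16, 32, batch_first=True)       # stand-in for the Encoder
head = nn.Linear(32, 100)                    # stand-in for the Decoder

# one optimizer per module, mirroring enc_optim / dec_optim / *_embedder_optim
optims = [optim.Adam(m.parameters(), lr=1e-3) for m in (emb, rnn, head)]
loss_func = nn.CrossEntropyLoss(ignore_index=0)  # index 0 is <PAD>

x = torch.randint(1, 100, (4, 7))        # (batch, seq) of token ids
out, _ = rnn(emb(x))
logits = head(out).reshape(-1, 100)
loss = loss_func(logits, x.reshape(-1))  # toy objective: reproduce the input
for o in optims:
    o.zero_grad()
loss.backward()
for o in optims:
    o.step()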
Example 3
    def __init__(self, args):
        self.args = args
        train_dataloader, test_dataloader = get_dataloaders(
            args.data_dir,
            args.src_lang,
            args.tgt_lang,
            args.batch_size,
            args.src_vocab_size,
            args.tgt_vocab_size)

        self.train_dataloader = train_dataloader
        self.test_dataloader = test_dataloader
        self.src_vocab = self.train_dataloader.dataset.src_vocab
        self.src_w2i = self.train_dataloader.dataset.src_w2i
        self.src_i2w = self.train_dataloader.dataset.src_i2w
        self.tgt_vocab = self.train_dataloader.dataset.tgt_vocab
        self.tgt_w2i = self.train_dataloader.dataset.tgt_w2i
        self.tgt_i2w = self.train_dataloader.dataset.tgt_i2w

        # model
        encoder = models.Encoder(len(self.src_vocab), args.src_embedding_size,
                                 self.src_w2i['<PAD>'], args.encoder_dropout_p,
                                 args.encoder_hidden_n,
                                 args.encoder_num_layers,
                                 args.encoder_bidirectional, args.use_cuda)
        decoder = models.Decoder(len(self.tgt_vocab), args.tgt_embedding_size,
                                 self.tgt_w2i['<PAD>'], args.decoder_dropout_p,
                                 args.decoder_hidden_n,
                                 args.decoder_num_layers,
                                 args.decoder_bidirectional, args.use_cuda)
        if args.use_cuda:
            encoder.cuda()
            decoder.cuda()
        self.encoder = encoder
        self.decoder = decoder

        # optimizer
        self.enc_optim = optim.SGD(self.encoder.parameters(), args.lr)
        self.dec_optim = optim.SGD(self.decoder.parameters(), args.lr)
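
This variant also passes self.src_w2i['<PAD>'] into the models, presumably
as an embedding padding index, and swaps Adam for plain optim.SGD (whose
second positional argument is the learning rate). A small self-contained
illustration of what padding_idx buys, with a toy vocabulary in place of
the one built by the dataloader:

import torch
import torch.nn as nn

src_w2i = {'<PAD>': 0, '<UNK>': 1, 'hello': 2}
pad = src_w2i['<PAD>']
emb = nn.Embedding(len(src_w2i), 8, padding_idx=pad)

batch = torch.tensor([[2, 1, pad, pad]])  # right-padded sequence
vecs = emb(batch)
# the <PAD> row is initialized to zeros and never receives gradient updates
assert vecs[0, 2].abs().sum().item() == 0.0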
Example 4
    def __init__(self, args):
        self.args = args
        train_dataloader, test_dataloader = get_dataloaders(
            args.data_dir,
            args.src_lang,
            args.tgt_lang,
            args.batch_size,
            args.src_vocab_size,
            args.tgt_vocab_size)
        self.train_dataloader = train_dataloader
        self.test_dataloader = test_dataloader
        self.sw2i = train_dataloader.dataset.sw2i
        self.si2w = train_dataloader.dataset.si2w
        self.tw2i = train_dataloader.dataset.tw2i
        self.ti2w = train_dataloader.dataset.ti2w
        self.converters = {
            'src': {'w2i': self.sw2i, 'i2w': self.si2w},
            'tgt': {'w2i': self.tw2i, 'i2w': self.ti2w},
        }

        vocab_size = max(len(self.sw2i), len(self.tw2i))
        print('global vocab size: %d' % vocab_size)

        encoder = models.Encoder(vocab_size,
                                 args.src_embedding_size,
                                 args.encoder_hidden_n,
                                 n_layers=args.encoder_num_layers,
                                 bidirec=args.encoder_bidirectional,
                                 use_cuda=args.use_cuda)
        if args.decoder_bidirectional:
            decoder_hidden_size = args.decoder_hidden_n * 2
        else:
            decoder_hidden_size = args.decoder_hidden_n
        decoder = models.Decoder(vocab_size,
                                 args.tgt_embedding_size,
                                 decoder_hidden_size,
                                 n_layers=args.decoder_num_layers,
                                 use_cuda=args.use_cuda)
        src_embedder = models.Embedder(vocab_size,
                                       args.src_embedding_size,
                                       args.use_cuda)
        tgt_embedder = models.Embedder(vocab_size,
                                       args.tgt_embedding_size,
                                       args.use_cuda)
        encoder.init_weight()
        decoder.init_weight()
        if args.use_cuda:
            encoder = encoder.cuda()
            decoder = decoder.cuda()
            src_embedder = src_embedder.cuda()
            tgt_embedder = tgt_embedder.cuda()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embedder = src_embedder
        self.tgt_embedder = tgt_embedder
        self.embedders = {'src': src_embedder, 'tgt': tgt_embedder}
        self.loss_func = nn.CrossEntropyLoss(ignore_index=0)
        self.enc_optim = optim.Adam(encoder.parameters(), lr=args.lr)
        self.dec_optim = optim.Adam(decoder.parameters(), lr=args.lr)
        self.src_embedder_optim = optim.Adam(self.src_embedder.parameters(),
                                             lr=args.lr)
        self.tgt_embedder_optim = optim.Adam(self.tgt_embedder.parameters(),
                                             lr=args.lr)
        self.optims = {
            'src': self.src_embedder_optim,
            'tgt': self.tgt_embedder_optim,
        }

        # Discriminator
        discriminator = models.Discriminator(args.encoder_hidden_n)
        self.disc_loss_func = nn.BCELoss()
        if args.use_cuda:
            self.discriminator = discriminator.cuda()
        else:
            self.discriminator = discriminator
        self.disc_optim = optim.Adam(self.discriminator.parameters(),
                                     lr=args.disc_lr)

        # Set bilingual dictionary
        self.bi_dict = bilingual_dictionary.Dictionary(
            args.bilingual_dict_path)
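
nn.BCELoss implies the discriminator outputs probabilities in [0, 1]. A
self-contained sketch of that adversarial signal, with a toy one-layer
discriminator standing in for models.Discriminator and random tensors in
place of real encoder states:

import torch
import torch.nn as nn
import torch.optim as optim

encoder_hidden_n = 32
disc = nn.Sequential(nn.Linear(encoder_hidden_n, 1), nn.Sigmoid())
disc_loss_func = nn.BCELoss()
disc_optim = optim.Adam(disc.parameters(), lr=5e-4)  # stands in for args.disc_lr

h_src = torch.randn(8, encoder_hidden_n)   # stand-ins for encoder states
h_tgt = torch.randn(8, encoder_hidden_n)
states = torch.cat([h_src, h_tgt])
labels = torch.cat([torch.zeros(8, 1), torch.ones(8, 1)])  # 0 = src, 1 = tgt

disc_optim.zero_grad()
loss = disc_loss_func(disc(states), labels)
loss.backward()
disc_optim.step()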
Example 5
def main(args):
    # load on CPU first; modules are moved to the chosen device below
    cp = torch.load(args.checkpoint, map_location='cpu')
    cargs = cp['args']
    sw2i = cp['sw2i']
    tw2i = cp['tw2i']
    ti2w = cp['ti2w']

    device = torch.device('cuda' if args.use_cuda else 'cpu')

    encoder = models.Encoder(len(sw2i),
                             cargs.src_embedding_size,
                             cargs.encoder_hidden_n,
                             n_layers=cargs.encoder_num_layers,
                             bidirec=cargs.encoder_bidirectional,
                             use_cuda=args.use_cuda)

    if cargs.decoder_bidirectional:
        decoder_hidden_size = cargs.decoder_hidden_n * 2
    else:
        decoder_hidden_size = cargs.decoder_hidden_n

    decoder = models.Decoder(len(tw2i),
                             cargs.tgt_embedding_size,
                             decoder_hidden_size,
                             n_layers=cargs.decoder_num_layers,
                             use_cuda=args.use_cuda)
    src_embedder = models.Embedder(len(sw2i), cargs.src_embedding_size,
                                   args.use_cuda)
    tgt_embedder = models.Embedder(len(tw2i), cargs.tgt_embedding_size,
                                   args.use_cuda)

    encoder.load_state_dict(cp['encoder_state_dict'])
    decoder.load_state_dict(cp['decoder_state_dict'])
    src_embedder.load_state_dict(cp['src_embedder'])
    tgt_embedder.load_state_dict(cp['tgt_embedder'])

    encoder.to(device)
    decoder.to(device)
    src_embedder.to(device)
    tgt_embedder.to(device)

    # data
    with open(args.src, 'r', encoding='utf-8') as f:
        lines = [line.strip() + ' </s>' for line in f]
    if 'ja' not in args.src:
        # source file is not Japanese: apply string normalization
        X = [utils.normalize_string(line).split() for line in lines]
    else:
        # Japanese source: tokens kept as-is, lowercased
        X = [line.lower().split() for line in lines]

    translated = []
    for x in X:
        idxs = [sw2i.get(w, sw2i['<UNK>']) for w in x]
        idxs = torch.tensor([idxs], device=device)
        length = idxs.size(1)
        output, hidden_c = encoder(src_embedder, idxs, [length])
        start_decode = torch.tensor([[tw2i['<s>']]], device=device)

        # preds: 50, V
        preds = decoder(tgt_embedder, start_decode, hidden_c, 50, output, None,
                        False)

        # preds_max: 50 (argmax token id at each step)
        preds_max = torch.max(preds, 1)[1]
        sent = ' '.join([ti2w[p] for p in preds_max.data.tolist()])
        translated.append(sent)

    return translated
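
A hypothetical command-line driver for main(); the flag names simply
mirror the attributes the function reads (checkpoint, src, use_cuda) and
are not confirmed by the snippet itself:

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--checkpoint', required=True)
    parser.add_argument('--src', required=True)
    parser.add_argument('--use_cuda', action='store_true')
    for sentence in main(parser.parse_args()):
        print(sentence)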