def __init__(self, args):
    logger.info(args.dump())
    logger.info('initializing')
    self.args = args
    if args.gpu:
        torch.cuda.set_device(args.gpu)
        self.use_cuda = torch.cuda.is_available()
    else:
        self.use_cuda = False

    src, tgt = utils.get_corpus('./small_parallel_enja/train.src',
                                './small_parallel_enja/train.tgt')

    # build or load vocab
    if getattr(args, 'load_vocab_dir', None):
        logger.info('loading vocab from {}'.format(args.load_vocab_dir))
        self.load_vocab()
    else:
        logger.info('building vocab')
        self.build_vocab(src, tgt)
        logger.info('saving vocab to {}'.format(args.output_dir))
        self.save_vocab()

    logger.info('building dataset')
    self.train_data = utils.build_dataset(src, tgt, self.s_w2i, self.t_w2i)
    src, tgt = utils.get_corpus('./small_parallel_enja/valid.src',
                                './small_parallel_enja/valid.tgt')
    self.test_data = utils.build_dataset(src, tgt, self.s_w2i, self.t_w2i)

    logger.info('preparing encoder and decoder')
    encoder = models.Encoder(len(self.s_w2i), args.embedding_size,
                             args.hidden_size, args.n_layers, args.bidirec)
    # the decoder hidden size is doubled to match a bidirectional encoder
    decoder = models.Decoder(len(self.t_w2i), args.embedding_size,
                             args.hidden_size * 2, args.n_layers)

    logger.info('initializing weight')
    encoder.init_weight()
    decoder.init_weight()

    if self.use_cuda:
        logger.info('use cuda')
        self.encoder = encoder.cuda()
        self.decoder = decoder.cuda()
    else:
        logger.info('no cuda')
        self.encoder = encoder
        self.decoder = decoder

    logger.info('set loss function and optimizers')
    # index 0 is the padding id, so padded positions do not contribute to the loss
    self.loss_function = nn.CrossEntropyLoss(ignore_index=0)
    self.enc_optim = optim.Adam(self.encoder.parameters(), lr=args.lr)
    self.dec_optim = optim.Adam(self.decoder.parameters(), lr=args.lr)

    # early-stopping state: BLEU history and patience counter
    self.bleus_es = [-10000]
    self.patient_es = 0
    del src, tgt
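The helpers `utils.get_corpus` and `utils.build_dataset` are not shown in this snippet. Below is a minimal sketch of what they might look like, inferred only from how they are called above (whitespace-tokenized parallel files, an `<UNK>` fallback, and `<PAD>` at index 0 to match `ignore_index=0`); the real `utils` module may differ.

def get_corpus(src_path, tgt_path):
    # hypothetical sketch: one whitespace-tokenized sentence per line
    with open(src_path, encoding='utf-8') as f:
        src = [line.strip().split() for line in f]
    with open(tgt_path, encoding='utf-8') as f:
        tgt = [line.strip().split() for line in f]
    return src, tgt

def build_dataset(src, tgt, s_w2i, t_w2i):
    # hypothetical sketch: map tokens to ids, falling back to <UNK>
    # for out-of-vocabulary words
    return [([s_w2i.get(w, s_w2i['<UNK>']) for w in s],
             [t_w2i.get(w, t_w2i['<UNK>']) for w in t])
            for s, t in zip(src, tgt)]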
def __init__(self, args):
    self.args = args
    train_dataloader, test_dataloader = get_dataloaders(
        args.train_src, args.train_tgt,
        args.valid_src, args.valid_tgt,
        args.batch_size,
        args.src_vocab_size, args.tgt_vocab_size)
    self.train_dataloader = train_dataloader
    self.test_dataloader = test_dataloader
    self.sw2i = train_dataloader.dataset.sw2i
    self.si2w = train_dataloader.dataset.si2w
    self.tw2i = train_dataloader.dataset.tw2i
    self.ti2w = train_dataloader.dataset.ti2w

    encoder = models.Encoder(len(self.sw2i), args.src_embedding_size,
                             args.encoder_hidden_n,
                             n_layers=args.encoder_num_layers,
                             bidirec=args.encoder_bidirectional,
                             use_cuda=args.use_cuda)
    if args.decoder_bidirectional:
        decoder_hidden_size = args.decoder_hidden_n * 2
    else:
        decoder_hidden_size = args.decoder_hidden_n
    decoder = models.Decoder(len(self.tw2i), args.tgt_embedding_size,
                             decoder_hidden_size,
                             n_layers=args.decoder_num_layers,
                             use_cuda=args.use_cuda)
    src_embedder = models.Embedder(len(self.sw2i), args.src_embedding_size,
                                   args.use_cuda)
    tgt_embedder = models.Embedder(len(self.tw2i), args.tgt_embedding_size,
                                   args.use_cuda)
    encoder.init_weight()
    decoder.init_weight()

    if args.use_cuda:
        encoder = encoder.cuda()
        decoder = decoder.cuda()
        src_embedder = src_embedder.cuda()
        tgt_embedder = tgt_embedder.cuda()
    self.encoder = encoder
    self.decoder = decoder
    self.src_embedder = src_embedder
    self.tgt_embedder = tgt_embedder

    self.loss_func = nn.CrossEntropyLoss(ignore_index=0)
    self.enc_optim = optim.Adam(encoder.parameters(), lr=args.lr)
    self.dec_optim = optim.Adam(decoder.parameters(), lr=args.lr)
    self.src_embedder_optim = optim.Adam(self.src_embedder.parameters(),
                                         lr=args.lr)
    self.tgt_embedder_optim = optim.Adam(self.tgt_embedder.parameters(),
                                         lr=args.lr)
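`models.Embedder` is only referenced here, never defined. Since it is passed into the encoder and decoder at call time rather than owned by them, it is presumably a thin wrapper around a single `nn.Embedding` so the lookup table can be swapped or shared. A minimal sketch under that assumption (the constructor signature matches the calls above; everything else is a guess):

import torch.nn as nn

class Embedder(nn.Module):
    """Sketch of models.Embedder: assumed to wrap one nn.Embedding.
    padding_idx=0 is an assumption matching ignore_index=0 in the loss."""
    def __init__(self, vocab_size, embedding_size, use_cuda=False):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size,
                                      padding_idx=0)
        self.use_cuda = use_cuda

    def forward(self, word_ids):
        # word_ids: (batch, seq_len) -> (batch, seq_len, embedding_size)
        return self.embedding(word_ids)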
def __init__(self, args):
    self.args = args
    train_dataloader, test_dataloader = get_dataloaders(
        args.data_dir, args.src_lang, args.tgt_lang,
        args.batch_size,
        args.src_vocab_size, args.tgt_vocab_size)
    self.train_dataloader = train_dataloader
    self.test_dataloader = test_dataloader
    self.src_vocab = self.train_dataloader.dataset.src_vocab
    self.src_w2i = self.train_dataloader.dataset.src_w2i
    self.src_i2w = self.train_dataloader.dataset.src_i2w
    self.tgt_vocab = self.train_dataloader.dataset.tgt_vocab
    self.tgt_w2i = self.train_dataloader.dataset.tgt_w2i
    self.tgt_i2w = self.train_dataloader.dataset.tgt_i2w

    # model
    encoder = models.Encoder(len(self.src_vocab), args.src_embedding_size,
                             self.src_w2i['<PAD>'], args.encoder_dropout_p,
                             args.encoder_hidden_n, args.encoder_num_layers,
                             args.encoder_bidirectional, args.use_cuda)
    decoder = models.Decoder(len(self.tgt_vocab), args.tgt_embedding_size,
                             self.tgt_w2i['<PAD>'], args.decoder_dropout_p,
                             args.decoder_hidden_n, args.decoder_num_layers,
                             args.decoder_bidirectional, args.use_cuda)
    if args.use_cuda:
        encoder.cuda()
        decoder.cuda()
    self.encoder = encoder
    self.decoder = decoder

    # optimizer
    self.enc_optim = optim.SGD(self.encoder.parameters(), args.lr)
    self.dec_optim = optim.SGD(self.decoder.parameters(), args.lr)
def __init__(self, args):
    self.args = args
    train_dataloader, test_dataloader = get_dataloaders(
        args.data_dir, args.src_lang, args.tgt_lang,
        args.batch_size,
        args.src_vocab_size, args.tgt_vocab_size)
    self.train_dataloader = train_dataloader
    self.test_dataloader = test_dataloader
    self.sw2i = train_dataloader.dataset.sw2i
    self.si2w = train_dataloader.dataset.si2w
    self.tw2i = train_dataloader.dataset.tw2i
    self.ti2w = train_dataloader.dataset.ti2w
    self.converters = {
        'src': {'w2i': self.sw2i, 'i2w': self.si2w},
        'tgt': {'w2i': self.tw2i, 'i2w': self.ti2w},
    }

    # both sides share one vocabulary size so the embedders are symmetric
    vocab_size = max(len(self.sw2i), len(self.tw2i))
    print('global vocab size: %d' % vocab_size)

    encoder = models.Encoder(vocab_size, args.src_embedding_size,
                             args.encoder_hidden_n,
                             n_layers=args.encoder_num_layers,
                             bidirec=args.encoder_bidirectional,
                             use_cuda=args.use_cuda)
    if args.decoder_bidirectional:
        decoder_hidden_size = args.decoder_hidden_n * 2
    else:
        decoder_hidden_size = args.decoder_hidden_n
    decoder = models.Decoder(vocab_size, args.tgt_embedding_size,
                             decoder_hidden_size,
                             n_layers=args.decoder_num_layers,
                             use_cuda=args.use_cuda)
    src_embedder = models.Embedder(vocab_size, args.src_embedding_size,
                                   args.use_cuda)
    tgt_embedder = models.Embedder(vocab_size, args.tgt_embedding_size,
                                   args.use_cuda)
    encoder.init_weight()
    decoder.init_weight()

    if args.use_cuda:
        encoder = encoder.cuda()
        decoder = decoder.cuda()
        src_embedder = src_embedder.cuda()
        tgt_embedder = tgt_embedder.cuda()
    self.encoder = encoder
    self.decoder = decoder
    self.src_embedder = src_embedder
    self.tgt_embedder = tgt_embedder
    self.embedders = {'src': src_embedder, 'tgt': tgt_embedder}

    self.loss_func = nn.CrossEntropyLoss(ignore_index=0)
    self.enc_optim = optim.Adam(encoder.parameters(), lr=args.lr)
    self.dec_optim = optim.Adam(decoder.parameters(), lr=args.lr)
    self.src_embedder_optim = optim.Adam(self.src_embedder.parameters(),
                                         lr=args.lr)
    self.tgt_embedder_optim = optim.Adam(self.tgt_embedder.parameters(),
                                         lr=args.lr)
    self.optims = {
        'src': self.src_embedder_optim,
        'tgt': self.tgt_embedder_optim,
    }

    # Discriminator
    discriminator = models.Discriminator(args.encoder_hidden_n)
    self.disc_loss_func = nn.BCELoss()
    if args.use_cuda:
        self.discriminator = discriminator.cuda()
    else:
        self.discriminator = discriminator
    self.disc_optim = optim.Adam(self.discriminator.parameters(),
                                 lr=args.disc_lr)

    # Set bilingual dictionary
    self.bi_dict = bilingual_dictionary.Dictionary(args.bilingual_dict_path)
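`models.Discriminator` is constructed from the encoder hidden size and trained with `nn.BCELoss`, so it must end in a sigmoid over a single output. A minimal sketch under that assumption; the layer widths and depth are inventions for illustration, not taken from the source:

import torch.nn as nn

class Discriminator(nn.Module):
    """Sketch of models.Discriminator: a binary classifier over encoder
    hidden states (e.g. which language a latent state came from).
    Layer sizes here are assumptions, not from the source."""
    def __init__(self, hidden_n):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(hidden_n, hidden_n),
            nn.ReLU(),
            nn.Linear(hidden_n, 1),
            nn.Sigmoid(),  # BCELoss expects probabilities in [0, 1]
        )

    def forward(self, h):
        # h: (batch, hidden_n) -> (batch,) probability of one domain
        return self.net(h).squeeze(-1)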
def main(args):
    cp = torch.load(args.checkpoint)
    cargs = cp['args']
    sw2i = cp['sw2i']
    tw2i = cp['tw2i']
    ti2w = cp['ti2w']
    device = torch.device('cuda' if args.use_cuda else 'cpu')

    encoder = models.Encoder(len(sw2i), cargs.src_embedding_size,
                             cargs.encoder_hidden_n,
                             n_layers=cargs.encoder_num_layers,
                             bidirec=cargs.encoder_bidirectional,
                             use_cuda=args.use_cuda)
    if cargs.decoder_bidirectional:
        decoder_hidden_size = cargs.decoder_hidden_n * 2
    else:
        decoder_hidden_size = cargs.decoder_hidden_n
    decoder = models.Decoder(len(tw2i), cargs.tgt_embedding_size,
                             decoder_hidden_size,
                             n_layers=cargs.decoder_num_layers,
                             use_cuda=args.use_cuda)
    src_embedder = models.Embedder(len(sw2i), cargs.src_embedding_size,
                                   args.use_cuda)
    tgt_embedder = models.Embedder(len(tw2i), cargs.tgt_embedding_size,
                                   args.use_cuda)
    encoder.load_state_dict(cp['encoder_state_dict'])
    decoder.load_state_dict(cp['decoder_state_dict'])
    src_embedder.load_state_dict(cp['src_embedder'])
    tgt_embedder.load_state_dict(cp['tgt_embedder'])
    encoder.to(device)
    decoder.to(device)
    src_embedder.to(device)
    tgt_embedder.to(device)

    # data
    with open(args.src, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    lines = [l.strip() + ' </s>' for l in lines]
    # Japanese input is assumed pre-tokenized; other languages are normalized
    if args.src.find('ja') == -1:
        X = [utils.normalize_string(line).split() for line in lines]
    else:
        X = [line.lower().split() for line in lines]

    translated = []
    for x in X:
        idxs = [sw2i.get(w, sw2i['<UNK>']) for w in x]
        idxs = torch.tensor([idxs], device=device)
        length = idxs.size(1)
        output, hidden_c = encoder(src_embedder, idxs, [length])
        # batch size 1, starting from the <s> token
        start_decode = torch.tensor([[tw2i['<s>']]], device=device)
        # preds: (50, V) -- decode for at most 50 steps
        preds = decoder(tgt_embedder, start_decode, hidden_c, 50,
                        output, None, False)
        # preds_max: (50,) -- argmax token id at each step
        preds_max = torch.max(preds, 1)[1]
        sent = ' '.join([ti2w[p] for p in preds_max.data.tolist()])
        translated.append(sent)
    return translated
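A sketch of how `main` might be driven from the command line. The flag names (`--checkpoint`, `--src`, `--use_cuda`) are assumptions inferred from the attribute accesses above, not confirmed by the source:

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # flag names inferred from args.checkpoint / args.src / args.use_cuda
    parser.add_argument('--checkpoint', required=True)
    parser.add_argument('--src', required=True)
    parser.add_argument('--use_cuda', action='store_true')
    args = parser.parse_args()

    for sent in main(args):
        print(sent)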