def load_dataset(data_path, train_batch_size=4096, dev_batch_size=1, max_len=100):
    """
    Assumes the data is already pre-processed with the Moses tokenizer.
    Returns iterators for the training/dev datasets.

    Arguments:
        data_path: path of the dataset
        train_batch_size: batch size of the training data (defined in terms of
            number of tokens or sentences, depending on the model_type)
        dev_batch_size: batch size of the dev data (usually one)
        max_len: max length of sequences in a batch
    """
    SRC = Field(tokenize=lambda s: s.split(), init_token="<s>",
                eos_token="</s>", batch_first=True, include_lengths=True)
    TRG = Field(tokenize=lambda s: s.split(), init_token="<s>",
                eos_token="</s>", batch_first=True, include_lengths=True)

    # create a TranslationDataset for both the train and dev set
    train_data = datasets.TranslationDataset(
        exts=("train.de", "train.en"), fields=(SRC, TRG), path=data_path,
        filter_pred=lambda x: len(vars(x)['src']) <= max_len and len(vars(x)['trg']) <= max_len)
    dev_data = datasets.TranslationDataset(
        exts=("dev.de", "dev.en"), fields=(SRC, TRG), path=data_path)

    # load in the test set
    test_examples = []
    with open(data_path + "test.de", "r") as f:
        for test_example in f.readlines():
            example = data.Example()
            setattr(example, "src", test_example.split())
            test_examples.append(example)
    test_data = data.Dataset(test_examples, fields=[("src", SRC)])

    # build a joint vocab over source and target using the training data
    SRC.build_vocab(train_data.src, train_data.trg)
    TRG.build_vocab(train_data.src, train_data.trg)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # use a custom DataIterator in order to minimize padding in a sequence
    # and to pack each batch fully, maximizing GPU utilization
    train_iterator = DataIterator(train_data, batch_size=train_batch_size, device=device,
                                  repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
                                  batch_size_fn=batch_size_fn, train=True,
                                  sort_within_batch=True, shuffle=True)

    # use a regular Iterator since we want to be able to compare our
    # translations to a gold-standard file; a DataIterator would return the
    # translations in shuffled/random order
    dev_iterator = Iterator(dev_data, batch_size=dev_batch_size, train=False,
                            sort=False, repeat=False, device=device)

    # create a test iterator for the test data
    test_iterator = Iterator(test_data, batch_size=1, train=False, sort=False,
                             repeat=False, device=device)
    print(len(test_iterator))
    return train_iterator, dev_iterator, test_iterator, SRC, TRG
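The snippet above passes a `batch_size_fn` into its iterator without defining it. A minimal sketch of such a helper, modeled on the token-count batching function from The Annotated Transformer (the project's actual helper may differ): it makes `batch_size` mean "tokens per batch, padding included" rather than "sentences per batch".

global max_src_in_batch, max_tgt_in_batch

def batch_size_fn(new, count, sofar):
    """Keep growing the batch; return the total token count including padding."""
    global max_src_in_batch, max_tgt_in_batch
    if count == 1:
        max_src_in_batch = 0
        max_tgt_in_batch = 0
    max_src_in_batch = max(max_src_in_batch, len(new.src))
    max_tgt_in_batch = max(max_tgt_in_batch, len(new.trg) + 2)  # +2 for <s> and </s>
    src_elements = count * max_src_in_batch
    tgt_elements = count * max_tgt_in_batch
    return max(src_elements, tgt_elements)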
def get_mt_datasets(exts, fields, train_path, val_path, test_path):
    train = datasets.TranslationDataset(path=train_path, exts=exts, fields=fields)
    val = datasets.TranslationDataset(path=val_path, exts=exts, fields=fields)
    test = datasets.TranslationDataset(path=test_path, exts=exts, fields=fields)
    return train, val, test
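A sketch of how these splits are typically wired up afterwards; the field settings, extensions, and paths here are assumptions for illustration, not from the original code.

from torchtext import data

SRC = data.Field()
TRG = data.Field(init_token='<s>', eos_token='</s>')
train, val, test = get_mt_datasets(
    exts=('.de', '.en'), fields=(SRC, TRG),
    train_path='data/train', val_path='data/val', test_path='data/test')
# vocabularies are built on the training split only
SRC.build_vocab(train, min_freq=2)
TRG.build_vocab(train, min_freq=2)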
def build_field_dataset_vocab(data_directory, src_name, trg_name, vocab):
    tokenize = lambda x: x.split()
    # define the Field; source and target share one vocab here
    source = data.Field(sequential=True,
                        tokenize=tokenize,
                        lower=True,
                        use_vocab=True,
                        init_token='<sos>',
                        eos_token='<eos>',
                        pad_token='<pad>',
                        unk_token='<unk>',
                        batch_first=True,
                        fix_length=50)  # include_lengths=True would make torch's pack_padded_sequence easier to use later
    # build the dataset
    train_data = datasets.TranslationDataset(
        path=data_directory,
        exts=(src_name, trg_name),
        fields=(source, source))  # source and target can share one Field, and hence one vocab
    # build the vocabulary
    if vocab is None:
        source.build_vocab(train_data, min_freq=2)
    else:
        source.vocab = vocab
    return source, train_data
def load_data(lang_dir, src_ext, tgt_ext, src_path=None, tgt_path=None):
    BOS_WORD = '<s>'
    EOS_WORD = '</s>'
    BLANK_WORD = '<pad>'
    SRC = data.Field(tokenize=tokenize_es, init_token=BOS_WORD, eos_token=EOS_WORD,
                     pad_token=BLANK_WORD) if src_path is None else load_field(src_path)
    TGT = data.Field(tokenize=tokenize_en, init_token=BOS_WORD, eos_token=EOS_WORD,
                     pad_token=BLANK_WORD) if tgt_path is None else load_field(tgt_path)
    print("Loading data...")
    dataset = datasets.TranslationDataset(
        lang_dir, (src_ext, tgt_ext), (SRC, TGT),
        filter_pred=lambda x: len(vars(x)['src']) <= 100 and len(vars(x)['trg']) <= 100)
    print("Data loaded!")
    train, valid, test = dataset.split(split_ratio=[0.7, 0.15, 0.15])
    if src_path is None:
        SRC.build_vocab(train.src, min_freq=2, max_size=39996)
    if tgt_path is None:
        TGT.build_vocab(train.trg, min_freq=2, max_size=39996)
    return SRC, TGT, train, valid, test
def load_data(lang1='de', lang2='en', directory=None):
    lang1_tokenizer = get_tokenizer(lang1)
    lang2_tokenizer = get_tokenizer(lang2)
    SRC = data.Field(tokenize=lang1_tokenizer, pad_token=BLANK_WORD)
    TGT = data.Field(tokenize=lang2_tokenizer, init_token=BOS_WORD,
                     eos_token=EOS_WORD, pad_token=BLANK_WORD)
    MAX_LEN = 100
    if directory:
        train, val = datasets.TranslationDataset(
            path=directory, exts=('.' + lang1, '.' + lang2), fields=(SRC, TGT),
            filter_pred=lambda x: len(vars(x)['src']) <= MAX_LEN and
            len(vars(x)['trg']) <= MAX_LEN).split()
    else:
        train, val, test = datasets.IWSLT.splits(
            exts=('.de', '.en'), fields=(SRC, TGT),
            filter_pred=lambda x: len(vars(x)['src']) <= MAX_LEN and
            len(vars(x)['trg']) <= MAX_LEN)
    MIN_FREQ = 2
    SRC.build_vocab(train.src, min_freq=MIN_FREQ)
    TGT.build_vocab(train.trg, min_freq=MIN_FREQ)
    return train, val, SRC, TGT  # TODO: find out exactly what each of these variables is
def __init__(self, path, SRC, TGT, exts=('.de', '.en'), UNK='<unk>',
             SOS='<s>', EOS='</s>', TMAX=100):
    '''Wrap a TranslationDataset, keeping the fields and special tokens around.'''
    self.ds = datasets.TranslationDataset(
        path=path, exts=exts, fields=(SRC, TGT),
        filter_pred=lambda x: len(x.src) < TMAX and len(x.trg) < TMAX)
    self.src = self.ds.src
    self.tgt = self.ds.trg
    self.SRC = SRC
    self.TGT = TGT
    self.sos = SOS
    self.unk = UNK
    self.eos = EOS
def get_data(file_path, MIN_FREQ=2, DEVICE_SET=None):
    BOS_WORD = '<s>'
    EOS_WORD = '</s>'
    PAD_WORD = "<pad>"
    field_en = data.Field(sequential=True, use_vocab=True, batch_first=True,
                          tokenize=str.split, init_token=BOS_WORD,
                          eos_token=EOS_WORD, pad_token=PAD_WORD)
    field_de = data.Field(sequential=True, use_vocab=True, batch_first=True,
                          tokenize=str.split, init_token=BOS_WORD,
                          eos_token=EOS_WORD, pad_token=PAD_WORD)
    trn = datasets.TranslationDataset(path=file_path, exts=('en', 'de'),
                                      fields=[('en', field_en), ('de', field_de)])
    field_en.build_vocab(trn.en, min_freq=MIN_FREQ)
    field_de.build_vocab(trn.de, min_freq=MIN_FREQ)
    return field_en, field_de, trn
def build_field_dataset_vocab(data_directory, src_name, trg_name, vocab,
                              field_include_length=True, oov=False):
    tokenize = lambda x: x.split()
    # define the Field; source and target share one vocab here
    source = data.Field(
        sequential=True,
        tokenize=tokenize,
        lower=True,
        use_vocab=True,
        init_token='<sos>',
        eos_token='<eos>',
        pad_token='<pad>',
        unk_token='<unk>',
        batch_first=True,
        fix_length=50,
        include_lengths=field_include_length  # include_lengths=True makes torch's pack_padded_sequence easier to use later
    )
    # build the dataset
    train_data = datasets.TranslationDataset(
        path=data_directory,
        exts=(src_name, trg_name),
        fields=(source, source))  # source and target can share one Field, and hence one vocab
    # build the vocabulary
    if vocab is None:
        source.build_vocab(train_data, min_freq=2)  # tokens with frequency < 2 are mapped to <unk>
    else:
        source.vocab = vocab
    # collect the OOV words in the training set
    if oov:
        oov_words = get_oov_words(train_data, source.vocab.stoi)
    # split into train and validation sets; note that splitting with
    # random_split instead would drop the fields attribute
    train_set, val_set = train_data.split(split_ratio=0.95, random_state=random.seed(1))
    BATCH_SIZE = 256
    # build iterators over the train and validation sets
    train_iterator, val_iterator = data.BucketIterator.splits(
        (train_set, val_set),
        batch_size=BATCH_SIZE,
        # shuffle=True,
        # device=device,
        sort_within_batch=True,        # if True, examples inside a batch are sorted by sort_key
        sort_key=lambda x: len(x.src)  # sort by src length, mainly for the later pack/pad operations
        # repeat=False
    )
    if oov:
        return source, train_iterator, val_iterator, oov_words
    else:
        return source, train_iterator, val_iterator
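Because the Field above sets `include_lengths=True` and the iterators sort within each batch by source length, every `batch.src` is a `(padded tensor, lengths)` pair that can feed `pack_padded_sequence` directly. A minimal consumption sketch; the embedding and RNN sizes here are hypothetical, not taken from the snippet.

import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

embedding = nn.Embedding(num_embeddings=10000, embedding_dim=128, padding_idx=1)  # hypothetical sizes
rnn = nn.GRU(input_size=128, hidden_size=256, batch_first=True)

for batch in train_iterator:
    src, src_lengths = batch.src  # include_lengths=True yields (padded batch, lengths)
    # lengths are already in descending order thanks to sort_within_batch
    packed = pack_padded_sequence(embedding(src), src_lengths, batch_first=True)
    outputs, hidden = rnn(packed)
    outputs, _ = pad_packed_sequence(outputs, batch_first=True)
    break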
def get_dataloader(train_data_base: str, val_data_base: str,
                   test_data_base: str, ext: Dict):
    # load English
    en_vocab = Vocab()
    train_data_en = train_data_base + "." + ext["en"]
    with Path(train_data_en).open("r", encoding="utf-8") as f:
        sentences = [line.strip().split() for line in f]
    en_vocab.build_vocab(sentences)
    # load Japanese
    ja_vocab = Vocab()
    train_data_ja = train_data_base + "." + ext["ja"]
    with Path(train_data_ja).open("r", encoding="utf-8") as f:
        sentences = [line.strip().split() for line in f]
    ja_vocab.build_vocab(sentences)
    src = CustomField(vocab=en_vocab, bos_token=None, eos_token=None, lower=True,
                      tokenize=lambda x: x.strip().split(), batch_first=True)
    tgt = CustomField(vocab=ja_vocab, lower=False,
                      tokenize=lambda x: x.strip().split(), batch_first=True)
    train_dataloader = datasets.TranslationDataset(
        path=train_data_base, exts=("." + ext["en"], "." + ext["ja"]), fields=(src, tgt))
    val_dataloader = datasets.TranslationDataset(
        path=val_data_base, exts=("." + ext["en"], "." + ext["ja"]), fields=(src, tgt))
    test_dataloader = data.TabularDataset(path=test_data_base + "." + ext["en"],
                                          format="tsv", fields=[('text', src)])
    return (train_dataloader, val_dataloader, test_dataloader), (en_vocab, ja_vocab)
def get_dataset(dataset):
    languages = {
        "antoloji": "tr",
        "tur": "tr",
        "cz": "cz",
        "turkish": "tr",
        "eng": "en",
        "tur-lower": "tr",
        "cz-lower": "cz",
        "turkish-lower": "tr",
        "eng-lower": "en"
    }
    language = languages[dataset]
    tokenizer = get_tokenizer(language)

    def tok(seq):
        return tokenizer.EncodeAsIds(seq)

    # the tokenizer already emits ids, so use_vocab=False and the special
    # tokens are given as ids directly
    src = data.Field(tokenize=tok, init_token=1, eos_token=2, pad_token=3, use_vocab=False)
    tgt = data.Field(tokenize=tok, init_token=1, eos_token=2, pad_token=3, use_vocab=False)
    mt_train = datasets.TranslationDataset(
        path='data/{}/{}.train'.format(language, dataset),
        exts=('.src', '.tgt'), fields=(src, tgt))
    mt_dev = datasets.TranslationDataset(
        path='data/{}/{}.dev'.format(language, dataset),
        exts=('.src', '.tgt'), fields=(src, tgt))
    mt_test = datasets.TranslationDataset(
        path='data/{}/{}.test'.format(language, dataset),
        exts=('.src', '.tgt'), fields=(src, tgt))
    return mt_train, mt_dev, mt_test
def init_dataloaders(self):
    batch_size = self.config.hp.batch_size
    project_path = self.config.firelab.project_path
    data_path_train = os.path.join(project_path, self.config.data.train)
    data_path_val = os.path.join(project_path, self.config.data.val)

    src = data.Field(batch_first=True, init_token='<bos>', eos_token='<eos>')
    trg = data.Field(batch_first=True, init_token='<bos>', eos_token='<eos>')

    mt_train = datasets.TranslationDataset(
        path=data_path_train, exts=('.en', '.fr'), fields=(src, trg))
    mt_val = datasets.TranslationDataset(
        path=data_path_val, exts=('.en', '.fr'), fields=(src, trg))

    src.build_vocab(mt_train.src)
    trg.build_vocab(mt_train.trg)

    self.vocab_src = src.vocab
    self.vocab_trg = trg.vocab

    self.train_dataloader = data.BucketIterator(mt_train, batch_size, repeat=False)
    self.val_dataloader = data.BucketIterator(mt_val, batch_size, repeat=False)
def get_dataset(dpath):
    BOS_WORD = '<s>'
    EOS_WORD = '</s>'
    BLANK_WORD = '<blank>'
    EN = data.Field(pad_token=BLANK_WORD)
    JA = data.Field(init_token=BOS_WORD, eos_token=EOS_WORD, pad_token=BLANK_WORD)
    train = datasets.TranslationDataset(
        path=os.path.join(dpath, 'train'), exts=('.en', '.ja'), fields=(EN, JA))
    val = datasets.TranslationDataset(
        path=os.path.join(dpath, 'dev'), exts=('.en', '.ja'), fields=(EN, JA))
    MIN_FREQ = 2
    EN.build_vocab(train.src, min_freq=MIN_FREQ)
    JA.build_vocab(train.trg, min_freq=MIN_FREQ)
    return train, val, EN, JA
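A typical way to consume the datasets returned above, sketched here; the data path and batch size are assumptions for illustration.

import torch
from torchtext import data

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train, val, EN, JA = get_dataset('data/en-ja')  # hypothetical path
train_iter, val_iter = data.BucketIterator.splits(
    (train, val), batch_size=64, device=device,
    sort_key=lambda x: len(x.src), sort_within_batch=True)
for batch in train_iter:
    src, trg = batch.src, batch.trg  # token-id tensors numericalized by EN/JA
    break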
def load_translation(self, data_path, exts, split_ratio=0.95, batch_size=64, dl_save_path=None):
    print("Loading parallel corpus [{}, {}]".format(data_path + exts[0], data_path + exts[1]))
    DATA = datasets.TranslationDataset(path=data_path, exts=exts,
                                       fields=(('src', self.SRC), ('trg', self.TGT)))
    print("Successful.")
    train, valid = DATA.split(split_ratio=split_ratio)
    print("Building src and tgt vocab ...")
    self.SRC.build_vocab(train)
    self.TGT.build_vocab(train)
    self._add_index()
    print("Successful.")
    torch.save(self, dl_save_path, pickle_module=dill)
    print("The dataloader is saved at {}".format(dl_save_path))
    train_iter = MyIterator(train, batch_size=batch_size, device=None, repeat=False,
                            sort_key=lambda x: (len(x.src), len(x.trg)),
                            batch_size_fn=batch_size_fn, train=True, shuffle=True)
    valid_iter = MyIterator(valid, batch_size=batch_size, device=None, repeat=False,
                            sort_key=lambda x: (len(x.src), len(x.trg)),
                            batch_size_fn=batch_size_fn, train=True, shuffle=True)
    return train_iter, valid_iter
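The `MyIterator` used above is not shown. A sketch matching the pool-batching iterator from The Annotated Transformer, which snippets like this one appear to follow (the project's own class may differ): it sorts examples within large pools so that each batch contains similarly sized sequences, then shuffles the batches.

from torchtext import data

class MyIterator(data.Iterator):
    def create_batches(self):
        if self.train:
            def pool(d, random_shuffler):
                # read 100 "batches" worth of examples at a time, sort them,
                # cut token-count batches, and shuffle the resulting batches
                for p in data.batch(d, self.batch_size * 100):
                    p_batch = data.batch(
                        sorted(p, key=self.sort_key),
                        self.batch_size, self.batch_size_fn)
                    for b in random_shuffler(list(p_batch)):
                        yield b
            self.batches = pool(self.data(), self.random_shuffler)
        else:
            self.batches = []
            for b in data.batch(self.data(), self.batch_size, self.batch_size_fn):
                self.batches.append(sorted(b, key=self.sort_key))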
def createVocab(datafile, output, exts=('.de', '.en'), UNK='<unk>',
                SOS='<s>', EOS='</s>', MIN_FREQ=2, TMAX=100):
    import spacy
    spacy_x = spacy.load(exts[0][1:])
    spacy_y = spacy.load(exts[1][1:])

    def split_x(text):
        return [tok.text for tok in spacy_x.tokenizer(text)]

    def split_y(text):
        return [tok.text for tok in spacy_y.tokenizer(text)]

    SRC = data.Field(tokenize=split_x, unk_token=UNK, pad_token=UNK)
    TGT = data.Field(tokenize=split_y, init_token=SOS, eos_token=EOS,
                     unk_token=UNK, pad_token=UNK)
    ds = datasets.TranslationDataset(
        path=datafile, exts=exts, fields=(SRC, TGT),
        filter_pred=lambda x: len(x.src) < TMAX and len(x.trg) < TMAX)
    SRC.build_vocab(ds.src, min_freq=MIN_FREQ)
    TGT.build_vocab(ds.trg, min_freq=MIN_FREQ)
    vocab_src = SRC.vocab.stoi
    vocab_tgt = TGT.vocab.stoi
    print('src vocab has length', len(vocab_src))
    print('tgt vocab has length', len(vocab_tgt))
    save_dict = {'src': vocab_src, 'tgt': vocab_tgt}
    import pickle
    with open(output, 'wb') as fs:
        pickle.dump(save_dict, fs)
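Reading the pickled vocab back is the mirror of the dump above; a sketch, where 'vocab.pkl' stands in for whatever `output` path was used.

import pickle

with open('vocab.pkl', 'rb') as fs:  # same path as `output` above
    vocabs = pickle.load(fs)
src_stoi, tgt_stoi = vocabs['src'], vocabs['tgt']  # string-to-index mappings
print('src vocab size:', len(src_stoi))
print('tgt vocab size:', len(tgt_stoi))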
def load_wmt_small_dataset(args: argparse.ArgumentParser) -> LoadedDatasetType:
    src = data.Field(
        include_lengths=True,
        init_token='<sos>',
        eos_token='<eos>',
        batch_first=True,
        fix_length=args.torchtext_src_fix_length,
    )
    trg = data.Field(
        include_lengths=True,
        init_token='<sos>',
        eos_token='<eos>',
        batch_first=True,
    )
    mt_train = datasets.TranslationDataset(
        path=constants.WMT14_EN_FR_SMALL_TRAIN,
        exts=('.en', '.fr'),
        fields=(src, trg))
    return mt_train, src, trg
# ---------- prepare dataset ----------
def len_filter(example):
    return len(example.src) <= opt.max_len and len(example.tgt) <= opt.max_len

EN = SentencePieceField(init_token=Constants.BOS_WORD,
                        eos_token=Constants.EOS_WORD,
                        batch_first=True,
                        include_lengths=True,
                        fix_length=opt.max_len + 1)
train = datasets.TranslationDataset(
    path='./data/dualgan/train', exts=('.billion.sp', '.use.sp'),
    fields=[('src', EN), ('tgt', EN)], filter_pred=len_filter)
val = datasets.TranslationDataset(
    path='./data/dualgan/val', exts=('.billion.sp', '.use.sp'),
    fields=[('src', EN), ('tgt', EN)], filter_pred=len_filter)
train_lang8, val_lang8 = Lang8.splits(
    exts=('.err.sp', '.cor.sp'), fields=[('src', EN), ('tgt', EN)],
    train='test', validation='test', test=None, filter_pred=len_filter)

# load the vocabulary (keep it consistent across datasets)
try:
    logging.info('Load vocab from %s' % opt.load_vocab_from)
    EN.load_vocab(opt.load_vocab_from)
except FileNotFoundError:
    EN.build_vocab_from(opt.build_vocab_from)
def main():
    parser = argparse.ArgumentParser()
    opt = options.train_options(parser)
    opt = parser.parse_args()

    opt.cuda = torch.cuda.is_available()
    opt.device = None if opt.cuda else -1

    # quick config overrides
    opt.exp_dir = './experiment/transformer-reinforce/use_billion'
    opt.load_vocab_from = './experiment/transformer/lang8-cor2err/vocab.pt'
    opt.build_vocab_from = './data/billion/billion.30m.model.vocab'
    opt.load_D_from = opt.exp_dir
    # opt.load_D_from = None

    # dataset params
    opt.max_len = 20

    # G params
    # opt.load_G_a_from = './experiment/transformer/lang8-err2cor/'
    # opt.load_G_b_from = './experiment/transformer/lang8-cor2err/'
    opt.d_word_vec = 300
    opt.d_model = 300
    opt.d_inner_hid = 600
    opt.n_head = 6
    opt.n_layers = 3
    opt.embs_share_weight = False
    opt.beam_size = 1
    opt.max_token_seq_len = opt.max_len + 2  # includes <BOS> and <EOS>
    opt.n_warmup_steps = 4000

    # D params
    opt.embed_dim = opt.d_model
    opt.num_kernel = 100
    opt.kernel_sizes = [3, 4, 5, 6, 7]
    opt.dropout_p = 0.25

    # train params
    opt.batch_size = 1
    opt.n_epoch = 10

    if not os.path.exists(opt.exp_dir):
        os.makedirs(opt.exp_dir)
    logging.basicConfig(filename=opt.exp_dir + '/.log', format=LOG_FORMAT, level=logging.DEBUG)
    logging.getLogger().addHandler(logging.StreamHandler())

    logging.info('Use CUDA? ' + str(opt.cuda))
    logging.info(opt)

    # ---------- prepare dataset ----------
    def len_filter(example):
        return len(example.src) <= opt.max_len and len(example.tgt) <= opt.max_len

    EN = SentencePieceField(init_token=Constants.BOS_WORD,
                            eos_token=Constants.EOS_WORD,
                            batch_first=True, include_lengths=True)
    train = datasets.TranslationDataset(path='./data/dualgan/train',
                                        exts=('.billion.sp', '.use.sp'),
                                        fields=[('src', EN), ('tgt', EN)],
                                        filter_pred=len_filter)
    val = datasets.TranslationDataset(path='./data/dualgan/val',
                                      exts=('.billion.sp', '.use.sp'),
                                      fields=[('src', EN), ('tgt', EN)],
                                      filter_pred=len_filter)
    train_lang8, val_lang8 = Lang8.splits(exts=('.err.sp', '.cor.sp'),
                                          fields=[('src', EN), ('tgt', EN)],
                                          train='test', validation='test',
                                          test=None, filter_pred=len_filter)

    # load the vocabulary (keep it consistent across datasets)
    try:
        logging.info('Load vocab from %s' % opt.load_vocab_from)
        EN.load_vocab(opt.load_vocab_from)
    except FileNotFoundError:
        EN.build_vocab_from(opt.build_vocab_from)
        EN.save_vocab(opt.load_vocab_from)
    logging.info('Vocab len: %d' % len(EN.vocab))

    # sanity-check the Constants
    assert EN.vocab.stoi[Constants.BOS_WORD] == Constants.BOS
    assert EN.vocab.stoi[Constants.EOS_WORD] == Constants.EOS
    assert EN.vocab.stoi[Constants.PAD_WORD] == Constants.PAD
    assert EN.vocab.stoi[Constants.UNK_WORD] == Constants.UNK

    # ---------- init model ----------
    # G = build_G(opt, EN, EN)
    hidden_size = 512
    bidirectional = True
    encoder = EncoderRNN(len(EN.vocab), opt.max_len, hidden_size,
                         n_layers=1, bidirectional=bidirectional)
    decoder = DecoderRNN(len(EN.vocab), opt.max_len,
                         hidden_size * 2 if bidirectional else 1,
                         n_layers=1, dropout_p=0.2, use_attention=True,
                         bidirectional=bidirectional,
                         eos_id=Constants.EOS, sos_id=Constants.BOS)
    G = Seq2seq(encoder, decoder)
    for param in G.parameters():
        param.data.uniform_(-0.08, 0.08)

    # optim_G = ScheduledOptim(optim.Adam(
    #     G.get_trainable_parameters(),
    #     betas=(0.9, 0.98), eps=1e-09),
    #     opt.d_model, opt.n_warmup_steps)
    optim_G = optim.Adam(G.parameters(), lr=1e-4, betas=(0.9, 0.98), eps=1e-09)
    loss_G = NLLLoss(size_average=False)
    if torch.cuda.is_available():
        loss_G.cuda()

    # pre-train D
    if opt.load_D_from:
        D = load_model(opt.load_D_from)
    else:
        D = build_D(opt, EN)
    optim_D = torch.optim.Adam(D.parameters(), lr=1e-4)

    def get_criterion(vocab_size):
        '''Cross-entropy with zero weight on the PAD token.'''
        weight = torch.ones(vocab_size)
        weight[Constants.PAD] = 0
        return nn.CrossEntropyLoss(weight, size_average=False)

    crit_G = get_criterion(len(EN.vocab))
    crit_D = nn.BCELoss()

    if opt.cuda:
        G.cuda()
        D.cuda()
        crit_G.cuda()
        crit_D.cuda()

    # ---------- train ----------
    trainer_D = trainers.DiscriminatorTrainer()

    if not opt.load_D_from:
        for epoch in range(1):
            logging.info('[Pretrain D Epoch %d]' % epoch)
            pool = helper.DiscriminatorDataPool(opt.max_len, D.min_len, Constants.PAD)
            # fill the pool with data
            train_iter = data.BucketIterator(dataset=train, batch_size=opt.batch_size,
                                             device=opt.device,
                                             sort_key=lambda x: len(x.src), repeat=False)
            pool.fill(train_iter)
            # train D
            trainer_D.train(D, train_iter=pool.batch_gen(), crit=crit_D, optimizer=optim_D)
            pool.reset()
        Checkpoint(model=D, optimizer=optim_D, epoch=0, step=0,
                   input_vocab=EN.vocab, output_vocab=EN.vocab).save(opt.exp_dir)

    def eval_D():
        pool = helper.DiscriminatorDataPool(opt.max_len, D.min_len, Constants.PAD)
        val_iter = data.BucketIterator(dataset=val, batch_size=opt.batch_size,
                                       device=opt.device,
                                       sort_key=lambda x: len(x.src), repeat=False)
        pool.fill(val_iter)
        trainer_D.evaluate(D, val_iter=pool.batch_gen(), crit=crit_D)
    # eval_D()

    # Train G
    ALPHA = 0
    for epoch in range(100):
        logging.info('[Epoch %d]' % epoch)
        train_iter = data.BucketIterator(dataset=train, batch_size=1,
                                         device=opt.device, sort_within_batch=True,
                                         sort_key=lambda x: len(x.src), repeat=False)
        for step, batch in enumerate(train_iter):
            src_seq = batch.src[0]
            src_length = batch.src[1]
            tgt_seq = src_seq[0].clone()
            # gold = tgt_seq[:, 1:]

            optim_G.zero_grad()
            loss_G.reset()
            decoder_outputs, decoder_hidden, other = G.rollout(src_seq, None, None, n_rollout=1)
            for i, step_output in enumerate(decoder_outputs):
                batch_size = tgt_seq.size(0)
                # print(step_output)
                # loss_G.eval_batch(step_output.contiguous().view(batch_size, -1), tgt_seq[:, i + 1])

            softmax_output = torch.exp(torch.cat([x for x in decoder_outputs], dim=0)).unsqueeze(0)
            softmax_output = helper.stack(softmax_output, 8)
            print(softmax_output)
            rollout = softmax_output.multinomial(1)
            print(rollout)

            tgt_seq = helper.pad_seq(tgt_seq.data, max_len=len(decoder_outputs) + 1,
                                     pad_value=Constants.PAD)
            tgt_seq = autograd.Variable(tgt_seq)
            for i, step_output in enumerate(decoder_outputs):
                batch_size = tgt_seq.size(0)
                loss_G.eval_batch(step_output.contiguous().view(batch_size, -1),
                                  tgt_seq[:, i + 1])

            G.zero_grad()
            loss_G.backward()
            optim_G.step()

            if step % 100 == 0:
                pred = torch.cat([x for x in other['sequence']], dim=1)
                print('[step %d] loss_rest %.4f' %
                      (epoch * len(train_iter) + step, loss_G.get_loss()))
                print('%s -> %s' % (EN.reverse(tgt_seq.data)[0], EN.reverse(pred.data)[0]))

    # Reinforce: train G
    for p in D.parameters():
        p.requires_grad = False
import dill
from torchtext import datasets, data
from translation.data_loader import tokenize_en, tokenize_es

if __name__ == "__main__":
    BOS_WORD = '<s>'
    EOS_WORD = '</s>'
    BLANK_WORD = '<pad>'
    SRC = data.Field(tokenize=tokenize_es, init_token=BOS_WORD,
                     eos_token=EOS_WORD, pad_token=BLANK_WORD)
    TGT = data.Field(tokenize=tokenize_en, init_token=BOS_WORD,
                     eos_token=EOS_WORD, pad_token=BLANK_WORD)
    dataset = datasets.TranslationDataset(
        'data/en-es/en-es_', ('en.txt', 'es.txt'), (SRC, TGT),
        filter_pred=lambda x: len(vars(x)['src']) <= 100 and len(vars(x)['trg']) <= 100)
    SRC.build_vocab(dataset.src, min_freq=2, max_size=39996)
    TGT.build_vocab(dataset.trg, min_freq=2, max_size=39996)
    src_file = open("data/SRC_Field.pt", "wb")
    tgt_file = open("data/TGT_Field.pt", "wb")
    dill.dump(SRC, src_file)
    dill.dump(TGT, tgt_file)
    print("Field files generated!")
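The matching load side, sketched under the assumption that the same file names written above are used:

import dill

with open("data/SRC_Field.pt", "rb") as f:
    SRC = dill.load(f)
with open("data/TGT_Field.pt", "rb") as f:
    TGT = dill.load(f)
print(len(SRC.vocab), len(TGT.vocab))  # fields come back with their vocabs attached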
def load_data(sum_num=30000, max_length=10):
    spacy_fr = spacy.load('fr_core_news_sm')
    spacy_en = spacy.load("en_core_web_sm")
    tokenize_eng = lambda text: [tok.text for tok in spacy_en.tokenizer(text)][::-1]  # TODO: why is the source reversed?
    tokenize_fren = lambda text: [tok.text for tok in spacy_fr.tokenizer(text)]
    build_new_data(sum_num=sum_num, max_length=max_length)
    temp_tokenizer = lambda x: x.strip().split()

    # eng_field = data.Field(tokenize=tokenize_eng,
    #                        init_token='<sos>',
    #                        eos_token='<eos>',
    #                        lower=True)
    # fren_field = data.Field(tokenize=tokenize_fren,
    #                         init_token='<sos>',
    #                         eos_token='<eos>',
    #                         lower=True)
    # train, val, test = datasets.Multi30k.splits(exts=('.de', '.en'),
    #                                             fields=(eng_field, fren_field))
    # eng_field.build_vocab(train.src, min_freq=3)
    # fren_field.build_vocab(train.trg, min_freq=3)
    # if True: return eng_field, fren_field, (train, val, test)

    eng_field = data.Field(tokenize=tokenize_eng, init_token=START_WORD, eos_token=END_WORD)
    fren_field = data.Field(tokenize=tokenize_fren, init_token=START_WORD, eos_token=END_WORD)
    # eng_field = data.Field(sequential=True,         # sequential data
    #                        use_vocab=True,          # build and use a vocab
    #                        init_token=START_WORD,
    #                        eos_token=END_WORD,
    #                        fix_length=max_length,   # maximum length
    #                        tokenize=tokenize,       # tokenization function
    #                        unk_token=UNKNOWN_WORD,  # token for unseen words
    #                        batch_first=True,        # put the batch dimension first
    #                        include_lengths=True     # return (padded minibatch, list of lengths)
    #                        )
    # fren_field = data.Field(sequential=True,
    #                         use_vocab=True,
    #                         init_token=START_WORD,
    #                         eos_token=END_WORD,
    #                         fix_length=max_length,
    #                         tokenize=tokenize,
    #                         unk_token=UNKNOWN_WORD,
    #                         batch_first=True,
    #                         include_lengths=True
    #                         )
    dataset = datasets.TranslationDataset(path='./data/small', exts=('.en', '.fr'),
                                          fields=(eng_field, fren_field))
    train, val, test = dataset.splits(exts=('.en', '.fr'),
                                      fields=(eng_field, fren_field), path='./data/')
    print('len(train.examples)', len(train.examples))
    print('len(val.examples)', len(val.examples))
    print('len(test.examples)', len(test.examples))
    eng_field.build_vocab(train.src, min_freq=2)
    fren_field.build_vocab(train.trg, min_freq=2)
    print('len(src_field.vocab)', len(eng_field.vocab))
    print('len(trg_field.vocab)', len(fren_field.vocab))
    return eng_field, fren_field, (train, val, test)
def main() -> None:
    parser = get_arg_parser()
    args = parser.parse_args()
    device = "cuda" if torch.cuda.is_available() and args.cuda else "cpu"
    print('using device {}'.format(device))

    print('loading datasets...')
    src = data.Field(include_lengths=True, init_token='<sos>',
                     eos_token='<eos>', batch_first=True, fix_length=200)
    trg = data.Field(include_lengths=True, init_token='<sos>',
                     eos_token='<eos>', batch_first=True)

    if args.dataset == 'WMT':
        mt_train = datasets.TranslationDataset(
            path=constants.WMT14_EN_FR_SMALL_TRAIN, exts=('.en', '.fr'),
            fields=(src, trg))
        src_vocab, trg_vocab = utils.load_torchtext_wmt_small_vocab()
        src.vocab = src_vocab
        trg.vocab = trg_vocab
        mt_valid = None
    else:
        if args.dataset == 'Multi30k':
            mt_train, mt_valid, mt_test = datasets.Multi30k.splits(
                exts=('.en', '.de'), fields=(src, trg))
        elif args.dataset == 'IWSLT':
            mt_train, mt_valid, mt_test = datasets.IWSLT.splits(
                exts=('.en', '.de'), fields=(src, trg))
        else:
            raise Exception("Unknown dataset: {}".format(args.dataset))

        print('loading vocabulary...')
        # mt_dev shares the fields, so it shares their vocab objects
        src.build_vocab(mt_train, min_freq=args.torchtext_unk,
                        max_size=args.torchtext_src_max_vocab)
        trg.build_vocab(mt_train, max_size=args.torchtext_trg_max_vocab)
        print('loaded vocabulary')

    # determine the correct dataset to evaluate
    eval_dataset = mt_train if args.eval_train else mt_valid
    eval_dataset = mt_test if args.eval_test else eval_dataset

    train_loader = data.BucketIterator(
        dataset=eval_dataset, batch_size=1,
        sort_key=lambda x: len(x.src),  # data.interleave_keys(len(x.src), len(x.trg))
        sort_within_batch=True, device=device)

    print('model type: {}'.format(args.model_type))
    model = utils.build_model(parser, src.vocab, trg.vocab)
    if args.load_path is not None:
        model.load_state_dict(torch.load(args.load_path))
    model = model.eval()

    if args.binarize:
        print('binarizing model')
        binarized_model = Binarize(model)
        binarized_model.binarization()

    print(model)
    model_size = size_metrics.get_model_size(model)
    print("64 bit float: {}".format(size_metrics.get_model_size(model, 64, args.binarize)))
    print("32 bit float: {}".format(size_metrics.get_model_size(model, 32, args.binarize)))
    print("16 bit float: {}".format(size_metrics.get_model_size(model, 16, args.binarize)))
def main():
    src_dir = "data/src"
    model_dir = "data/model"
    eval_dir = "data/eval"
    corpus = "lang8_small"
    en_emb = "glove"
    de_emb = "glove"
    seq_train = False
    emb_dim = 200
    batch_size = 1500

    # Data Loading
    vocab_file = os.path.join(model_dir, "%s.vocab" % (corpus))
    model_file = os.path.join(model_dir, "%s.%s.%s.transformer.pt" % (corpus, en_emb, de_emb))
    if not os.path.exists(eval_dir):
        os.makedirs(eval_dir)

    # Computing Unit
    device = torch.device("cpu")

    # Loading Data
    bos_word = '<s>'
    eos_word = '</s>'
    blank_word = '<blank>'
    min_freq = 2
    spacy_en = spacy.load('en')

    def tokenize(text):
        return [tkn.text for tkn in spacy_en.tokenizer(text)]

    TEXT = data.Field(tokenize=tokenize, init_token=bos_word,
                      eos_token=eos_word, pad_token=blank_word)
    test = datasets.TranslationDataset(path=os.path.join(src_dir, corpus),
                                       exts=('.test.src', '.test.trg'),
                                       fields=(TEXT, TEXT))
    # use the same order as the original data
    test_iter = data.Iterator(test, batch_size=batch_size, device=device,
                              sort=False, repeat=False, train=False)

    random_idx = random.randint(0, len(test) - 1)
    print(test[random_idx].src)
    print(test[random_idx].trg)

    # Vocabulary
    TEXT.vocab = torch.load(vocab_file)
    pad_idx = TEXT.vocab.stoi["<blank>"]
    print("Load %s vocabulary; vocab size = %d" % (corpus, len(TEXT.vocab)))

    # Word Embedding
    encoder_emb, decoder_emb = get_emb(en_emb, de_emb, TEXT.vocab, device, d_model=emb_dim)

    # Translation
    model = BuildModel(len(TEXT.vocab), encoder_emb, decoder_emb, d_model=emb_dim).to(device)
    model.load_state_dict(torch.load(model_file))
    model.eval()

    print("Predicting %s ..." % (corpus))
    src, trg, pred = [], [], []
    for batch in (rebatch(pad_idx, b) for b in test_iter):
        out = greedy_decode(model, TEXT.vocab, batch.src, batch.src_mask)
        # print("SRC OUT: ", src.shape, out.shape)
        probs = model.generator(out)
        _, prediction = torch.max(probs, dim=-1)
        source = [[TEXT.vocab.itos[word] for word in words[1:]] for words in batch.src]
        target = [[TEXT.vocab.itos[word] for word in words[1:]] for words in batch.trg]
        translation = [[TEXT.vocab.itos[word] for word in words] for words in prediction]
        for i in range(len(translation)):
            src.append(' '.join(source[i]).split('</s>')[0])
            trg.append(' '.join(target[i]).split('</s>')[0])
            pred.append(' '.join(translation[i]).split('</s>')[0])
            # eliminate data with unknown words in src or trg
            if '<unk>' in src[-1] or '<unk>' in trg[-1]:
                continue
            print("Source:", src[-1])
            print("Target:", trg[-1])
            print("Translation:", pred[-1])
            print()

    prefix = os.path.join(eval_dir, '%s.%s.%s.eval' % (corpus, en_emb, de_emb))
    for sentences, ext in zip([src, trg, pred], ['.src', '.trg', '.pred']):
        with open(prefix + ext, 'w+') as f:
            f.write('\n'.join(sentences))
resource = torch.load('data/tfm-40768.pt')
model_dict, field = resource['model'], resource['field']
vocab_size = len(field.vocab.stoi)
pad_index = field.vocab.stoi['<pad>']
model = ParallelTransformer(
    module=Transformer(vocab_size).to(device),
    device_ids=device_ids,
    output_device=device,
    dim=1
)
model.load_state_dict(model_dict)
test = datasets.TranslationDataset(
    path='data/bpe.test',
    exts=('.tgl', '.en'),
    fields=(('src', field), ('trg', field))
)
test_iter = data.BucketIterator(
    test,
    batch_size=256,
    batch_size_fn=lambda ex, bs, sz: sz + len(ex.src),
    device=device,
    train=False
)
result = run_epoch(test_iter, model, field, device)
with open(file_name, "w") as file:
    for line in hypotheses:
        file.write(line + "\n")

bleu = sacrebleu.raw_corpus_bleu(hypotheses, [references], .01).score
print(bleu)

for error in range(1, 2):
    my_data = {}
    num_batches = 100
    error_per = error / 10.
    for split in ["train", "val", "test"]:
        my_data[split] = datasets.TranslationDataset(path="data/new_" + split,
                                                     exts=('.nl', '.amr'),
                                                     fields=(SRC, TRG))
    MIN_FREQ = 5
    SRC.build_vocab(my_data["train"].src, min_freq=MIN_FREQ)
    TRG.build_vocab(my_data["train"].trg, min_freq=MIN_FREQ)
    PAD_INDEX = TRG.vocab.stoi[PAD_TOKEN]
    print_data_info(my_data, SRC, TRG)
    train_iter = data.BucketIterator(my_data["train"], batch_size=100, train=True,
                                     sort_within_batch=True,
                                     sort_key=lambda x: (len(x.src), len(x.trg)),
                                     repeat=False,
                 )  # unk=0, pad=1
tgt = data.Field(sequential=True,
                 use_vocab=True,
                 pad_token=PAD,
                 tokenize=tokenizer_de,
                 lower=True,
                 init_token=BOS,
                 eos_token=EOS,
                 include_lengths=True,
                 )  # unk=0, pad=1, <s>=2, </s>=3

prefix_f = './escape.en-de.tok.100k'
parallel_dataset = datasets.TranslationDataset(path=prefix_f, exts=('.en', '.de'),
                                               fields=[('src', src), ('tgt', tgt)])
src.build_vocab(parallel_dataset, min_freq=5, max_size=15000)
tgt.build_vocab(parallel_dataset, min_freq=5, max_size=15000)
train, valid = parallel_dataset.split(split_ratio=0.97)
# the fields are named 'src' and 'tgt' above, so the sort key must use x.tgt (not x.trg)
train_iter, valid_iter = data.BucketIterator.splits(
    (train, valid), batch_size=32,
    sort_key=lambda x: data.interleave_keys(len(x.src), len(x.tgt)),
    device='cuda')


class Encoder(nn.Module):
    def __init__(self, hidden_dim: int, dropout: float, pad_idx: int):
        super().__init__()
        self.dim = hidden_dim
import torch
from torchtext import data, datasets, vocab
import os

dataDir = '/Users/xinyi.ye/Documents/machine_translate/experiments/train4/'
BOS_WORD = '<s>'
EOS_WORD = '</s>'
BLANK_WORD = "<blank>"

# a Field defines how the raw data is processed
SRC = data.Field(pad_token=BLANK_WORD)  # tokenize defaults to string.split
TGT = data.Field(init_token=BOS_WORD, eos_token=EOS_WORD, pad_token=BLANK_WORD)
traindataset = datasets.TranslationDataset(path=dataDir + 'train-infoq',
                                           exts=('.en', '.zh'),
                                           fields=(SRC, TGT))
pwd = os.getcwd()
SRC.build_vocab(traindataset, vectors=vocab.Vectors(name='cc.en.300.vec',
                                                    cache=pwd + '/.vector_cache'))
TGT.build_vocab(traindataset, vectors=vocab.Vectors(name='cc.zh.300.vec',
                                                    cache=pwd + '/.vector_cache'))
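One common follow-up, sketched here rather than taken from the original script: initialize embedding layers from the FastText vectors attached to the vocabs above.

import torch.nn as nn

# after build_vocab(..., vectors=...), SRC.vocab.vectors / TGT.vocab.vectors
# are (vocab_size, 300) float tensors aligned with the vocab indices
src_emb = nn.Embedding.from_pretrained(SRC.vocab.vectors, freeze=False)
tgt_emb = nn.Embedding.from_pretrained(TGT.vocab.vectors, freeze=False)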
    return [tok.text for tok in spacy_en.tokenizer(text)]

BOS_WORD = '<s>'
EOS_WORD = '</s>'
BLANK_WORD = "<blank>"
SRC = data.Field(tokenize=tokenize_de, pad_token=BLANK_WORD)
TGT = data.Field(tokenize=tokenize_en, init_token=BOS_WORD,
                 eos_token=EOS_WORD, pad_token=BLANK_WORD)
MAX_LEN = 40
dataset = datasets.TranslationDataset(
    path='WMT14/europarl-v7', exts=('.de', '.en'), fields=(SRC, TGT),
    filter_pred=lambda x: len(vars(x)['src']) <= MAX_LEN and len(vars(x)['trg']) <= MAX_LEN)
MIN_SRC_FREQ = 9  # tokens below this frequency are dropped; setting it too low makes the vocab too large and the embedding fails
MIN_TGT_FREQ = 3
SRC.build_vocab(dataset.src, min_freq=MIN_SRC_FREQ)
TGT.build_vocab(dataset.trg, min_freq=MIN_TGT_FREQ)
len1 = SRC.vocab.__len__()
len2 = TGT.vocab.__len__()
print('build_vocab is successful')
'''
# load the data
import spacy
from torchtext import datasets, data

spacy_de = spacy.load('de')
# nlp = spacy.load('de_core_news_sm')
def main():
    args = parse_args()
    SRC_DIR = args.SRC_DIR
    MODEL_DIR = args.MODEL_DIR
    DATA = args.DATA
    EN_EMB = args.EN_EMB
    DE_EMB = args.DE_EMB
    SEQ_TRAIN = True if DE_EMB == 'elmo' else False

    # TODO: currently the hidden size is fixed; it should be adjustable based
    # on the src and trg embeddings respectively. EMB_DIM should be a multiple
    # of h (default 8); see MultiHeadedAttention.
    if 'glove' in EN_EMB:
        EMB_DIM = 200
    elif 'elmo' in EN_EMB:
        EMB_DIM = 1024
    else:
        EMB_DIM = 512

    BATCH_SIZE = 1500
    EPOCHES = 100

    options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
    weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

    vocab_file = os.path.join(MODEL_DIR, '%s.vocab' % (DATA))
    model_file = os.path.join(MODEL_DIR, '%s.%s.%s.transformer.pt' % (DATA, EN_EMB, DE_EMB))
    if not os.path.exists(MODEL_DIR):
        os.makedirs(MODEL_DIR)

    # GPU to use
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # device = ("cpu")
    # devices = [0, 1, 2, 3]

    #####################
    #   Data Loading    #
    #####################
    BOS_WORD = '<s>'
    EOS_WORD = '</s>'
    BLANK_WORD = "<blank>"
    MIN_FREQ = 2

    spacy_en = spacy.load('en')

    def tokenize_en(text):
        return [tok.text for tok in spacy_en.tokenizer(text)]

    TEXT = data.Field(tokenize=tokenize_en, init_token=BOS_WORD,
                      eos_token=EOS_WORD, pad_token=BLANK_WORD)
    train = datasets.TranslationDataset(path=os.path.join(SRC_DIR, DATA),
                                        exts=('.train.src', '.train.trg'),
                                        fields=(TEXT, TEXT))
    val = datasets.TranslationDataset(path=os.path.join(SRC_DIR, DATA),
                                      exts=('.val.src', '.val.trg'),
                                      fields=(TEXT, TEXT))
    train_iter = MyIterator(train, batch_size=BATCH_SIZE, device=device, repeat=False,
                            sort_key=lambda x: (len(x.src), len(x.trg)),
                            batch_size_fn=batch_size_fn, train=True)
    valid_iter = MyIterator(val, batch_size=BATCH_SIZE, device=device, repeat=False,
                            sort_key=lambda x: (len(x.src), len(x.trg)),
                            batch_size_fn=batch_size_fn, train=False)

    random_idx = random.randint(0, len(train) - 1)
    print(train[random_idx].src)
    print(train[random_idx].trg)

    ###############
    #  Vocabulary #
    ###############
    if os.path.exists(vocab_file):
        TEXT.vocab = torch.load(vocab_file)
    else:
        print("Save %s vocabulary..." % (DATA), end='\t')
        TEXT.build_vocab(train.src, min_freq=MIN_FREQ, vectors='glove.6B.200d')
        print("vocab size = %d" % (len(TEXT.vocab)))
        torch.save(TEXT.vocab, vocab_file)
    pad_idx = TEXT.vocab.stoi["<blank>"]

    #####################
    #  Word Embedding   #
    #####################
    encoder_emb, decoder_emb = get_emb(EN_EMB, DE_EMB, TEXT.vocab, device,
                                       d_model=EMB_DIM, elmo_options=options_file,
                                       elmo_weights=weight_file)

    ##########################
    #  Training the System   #
    ##########################
    model = make_model(len(TEXT.vocab), encoder_emb, decoder_emb, d_model=EMB_DIM).to(device)
    if os.path.exists(model_file):
        print("Restart from last checkpoint...")
        model.load_state_dict(torch.load(model_file))

    criterion = LabelSmoothing(size=len(TEXT.vocab), padding_idx=pad_idx,
                               smoothing=0.1).to(device)
    model_opt = NoamOpt(EMB_DIM, 1, 2000,
                        torch.optim.Adam(model.parameters(), lr=0,
                                         betas=(0.9, 0.98), eps=1e-9))

    # count parameters
    total_params = sum(p.numel() for p in model.parameters()) // 1000000
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) // 1000000
    rate = trainable_params / total_params
    print("Model parameters trainable (%d M) / total (%d M) = %f"
          % (trainable_params, total_params, rate))

    print("Training %s %s %s..." % (DATA, EN_EMB, DE_EMB))

    ### SINGLE GPU
    for epoch in range(EPOCHES):
        model.train()
        loss_compute = SimpleLossCompute(model.generator, criterion, opt=model_opt)
        run_epoch((rebatch(pad_idx, b) for b in train_iter), model, loss_compute,
                  TEXT.vocab, seq_train=SEQ_TRAIN)

        model.eval()
        total_loss, total_tokens = 0, 0
        for batch in (rebatch(pad_idx, b) for b in valid_iter):
            out = greedy_decode(model, TEXT.vocab, batch.src, batch.src_mask, trg=batch.trg)
            loss = loss_compute(out, batch.trg_y, batch.ntokens)
            total_loss += loss
            total_tokens += batch.ntokens

        print("Save model...")
        torch.save(model.state_dict(), model_file)
        print("Epoch %d/%d - Loss: %f" % (epoch + 1, EPOCHES, total_loss / total_tokens))
def main():
    args_parser = argparse.ArgumentParser(description='Tuning with graph-based parsing')
    args_parser.add_argument('--cuda', action='store_true', help='using GPU')
    args_parser.add_argument('--num_epochs', type=int, default=200,
                             help='Number of training epochs')
    args_parser.add_argument('--batch_size', type=int, default=64,
                             help='Number of sentences in each batch')
    args_parser.add_argument('--hidden_size', type=int, default=256,
                             help='Number of hidden units in RNN')
    args_parser.add_argument('--num_layers', type=int, default=1,
                             help='Number of layers of RNN')
    args_parser.add_argument('--opt', choices=['adam', 'sgd', 'adamax'],
                             help='optimization algorithm')
    args_parser.add_argument('--objective', choices=['cross_entropy', 'crf'],
                             default='cross_entropy',
                             help='objective function of the training procedure')
    args_parser.add_argument('--learning_rate', type=float, default=0.01,
                             help='Learning rate')
    args_parser.add_argument('--decay_rate', type=float, default=0.05,
                             help='Decay rate of learning rate')
    args_parser.add_argument('--clip', type=float, default=5.0,
                             help='gradient clipping')
    args_parser.add_argument('--gamma', type=float, default=0.0,
                             help='weight for regularization')
    args_parser.add_argument('--epsilon', type=float, default=1e-8,
                             help='epsilon for adam or adamax')
    args_parser.add_argument('--p_rnn', nargs=2, type=float, default=0.1,
                             help='dropout rate for RNN')
    args_parser.add_argument('--p_in', type=float, default=0.33,
                             help='dropout rate for input embeddings')
    args_parser.add_argument('--p_out', type=float, default=0.33,
                             help='dropout rate for output layer')
    args_parser.add_argument('--schedule', type=int,
                             help='schedule for learning rate decay')
    args_parser.add_argument('--unk_replace', type=float, default=0.,
                             help='The rate to replace a singleton word with UNK')
    # args_parser.add_argument('--punctuation', nargs='+', type=str, help='List of punctuations')
    args_parser.add_argument('--word_path', help='path for word embedding dict')
    args_parser.add_argument('--freeze', action='store_true',
                             help='freeze the word embedding (disable fine-tuning)')
    # args_parser.add_argument('--char_path', help='path for character embedding dict')
    args_parser.add_argument('--train')  # "data/POS-penn/wsj/split1/wsj1.train.original"
    args_parser.add_argument('--dev')    # "data/POS-penn/wsj/split1/wsj1.dev.original"
    args_parser.add_argument('--test')   # "data/POS-penn/wsj/split1/wsj1.test.original"
    args_parser.add_argument('--model_path', help='path for saving model file.',
                             default='models/temp')
    args_parser.add_argument('--model_name', help='name for saving model file.',
                             default='generator')
    args_parser.add_argument('--seq2seq_save_path', default='checkpoints/seq2seq_save_model',
                             type=str, help='seq2seq_save_path')
    args_parser.add_argument('--seq2seq_load_path', default='checkpoints/seq2seq_save_model',
                             type=str, help='seq2seq_load_path')
    # args_parser.add_argument('--rl_finetune_seq2seq_save_path', default='models/rl_finetune/seq2seq_save_model',
    #                          type=str, help='rl_finetune_seq2seq_save_path')
    # args_parser.add_argument('--rl_finetune_network_save_path', default='models/rl_finetune/network_save_model',
    #                          type=str, help='rl_finetune_network_save_path')
    # args_parser.add_argument('--rl_finetune_seq2seq_load_path', default='models/rl_finetune/seq2seq_save_model',
    #                          type=str, help='rl_finetune_seq2seq_load_path')
    # args_parser.add_argument('--rl_finetune_network_load_path', default='models/rl_finetune/network_save_model',
    #                          type=str, help='rl_finetune_network_load_path')
    args_parser.add_argument('--direct_eval', action='store_true',
                             help='direct eval without generation process')
    args = args_parser.parse_args()

    spacy_en = spacy.load('en_core_web_sm')  # python -m spacy download en_core_web_sm
    spacy_de = spacy.load('de_core_news_sm')  # python -m spacy download de_core_news_sm
    spacy_fr = spacy.load('fr_core_news_sm')  # python -m spacy download fr_core_news_sm

    SEED = 0
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)

    device = torch.device('cuda:2')  # torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def tokenizer_en(text):  # create a tokenizer function
        return [tok.text for tok in spacy_en.tokenizer(text)]

    def tokenizer_de(text):
        return [tok.text for tok in spacy_de.tokenizer(text)]

    def tokenizer_fr(text):
        return [tok.text for tok in spacy_fr.tokenizer(text)]

    en_field = data.Field(sequential=True, tokenize=tokenizer_en, lower=True,
                          fix_length=150, include_lengths=True, batch_first=True)  # use_vocab=False
    de_field = data.Field(sequential=True, tokenize=tokenizer_de, lower=True,
                          fix_length=150, include_lengths=True, batch_first=True)
    fr_field = data.Field(sequential=True, tokenize=tokenizer_fr, lower=True,
                          fix_length=150, include_lengths=True, batch_first=True)

    print('begin loading training data-----')
    print('time: ', time.asctime(time.localtime(time.time())))
    seq2seq_train_data = MultiSourceTranslationDataset(
        path='wmt14_3/sample', exts=('.de', '.fr', '.en'),
        fields=(de_field, fr_field, en_field))

    print('begin loading validation data-----')
    print('time: ', time.asctime(time.localtime(time.time())))
    seq2seq_dev_data = MultiSourceTranslationDataset(
        path='wmt14_3/sample', exts=('.de', '.fr', '.en'),
        fields=(de_field, fr_field, en_field))

    print('end loading data-----')
    print('time: ', time.asctime(time.localtime(time.time())))

    fr_train_data = datasets.TranslationDataset(path='wmt14_3/train',
                                                exts=('.fr', '.fr'),
                                                fields=(fr_field, fr_field))
    print('end fr data add-----')
    print('time: ', time.asctime(time.localtime(time.time())))

    fr_field.build_vocab(fr_train_data, max_size=80000)  # vectors="glove.6B.100d"
    with open('vocab_fr.pickle', 'wb') as f:
        pickle.dump(fr_field.vocab, f)
    print('end fr vocab save-----')
    print('time: ', time.asctime(time.localtime(time.time())))

    with open('vocab_fr.pickle', 'rb') as f:
        fr_field.vocab = pickle.load(f)
    print('end fr vocab load-----')
    print('time: ', time.asctime(time.localtime(time.time())))
def main():
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--embedding-size', type=int, dest="embedding_size",
                        help="Embedding size", default=EMBEDDING_DIM)
    parser.add_argument('--hidden-size', type=int, dest="hidden_size",
                        help="Hidden size", default=HIDDEN_SIZE)
    parser.add_argument('--nlayers', type=int, dest="nlayers",
                        help="Number of RNN layers", default=NUM_LAYER)
    parser.add_argument('--dropout', type=float, help="Dropout", default=DROPOUT)
    parser.add_argument('-b', '--batch-size', type=int, dest="batch_size",
                        help="Batch size", default=DEFAULT_BATCH_SIZE)
    parser.add_argument('--learning-rate', type=float, dest="learning_rate",
                        help="Initial learning rate", default=0.1)
    parser.add_argument('--learning-rate-decay', type=float, dest="learning_rate_decay",
                        help="Learning rate decay", default=0.5)
    parser.add_argument('--epochs', type=int, default=10,
                        help="Number of training epochs.")
    parser.add_argument('--start-decay-at', dest="start_decay_at", type=int, default=3,
                        help="Start decaying every epoch after and including this epoch.")
    parser.add_argument('--batches-per-print', type=int, dest="batches_per_print",
                        help="Number of batches per print", default=100)
    parser.add_argument('-m', '--model', help="Path to the model file to load", default=None)
    parser.add_argument('--data', help="train or test", default="train")
    cmd_args = parser.parse_args()

    src = data.Field(include_lengths=True, tokenize=list)
    tgt = data.Field(include_lengths=True, tokenize=list)
    mt_train = datasets.TranslationDataset(path='data/%s' % cmd_args.data,
                                           exts=('.src', '.tgt'), fields=(src, tgt))
    mt_dev = datasets.TranslationDataset(path='data/dev',
                                         exts=('.src', '.tgt'), fields=(src, tgt))
    print("Building vocabularies..")
    src.build_vocab(mt_train)
    tgt.build_vocab(mt_train)
    print("Making batches..")
    # should the sort-key sign be negative on GPU and positive on CPU?
    SIGN = -1 if CUDA_AVAILABLE else 1
    train_iter = data.BucketIterator(dataset=mt_train, batch_size=cmd_args.batch_size,
                                     device=(None if CUDA_AVAILABLE else -1),
                                     repeat=False,
                                     sort_key=lambda x: len(x.src) * SIGN)
    dev_iter = data.BucketIterator(dataset=mt_dev, batch_size=cmd_args.batch_size,
                                   device=(None if CUDA_AVAILABLE else -1),
                                   repeat=False, train=False,
                                   sort_key=lambda x: len(x.src) * SIGN)

    print("Creating model..")
    from spacer import Spacer
    num_classes = len(tgt.vocab)
    padding_idx = tgt.vocab.stoi["<pad>"]
    model = Spacer(len(src.vocab), num_classes, cmd_args.embedding_size,
                   cmd_args.hidden_size, cmd_args.nlayers, cmd_args.dropout,
                   BIDIRECTIONAL, padding_idx=padding_idx)
    if CUDA_AVAILABLE:
        model.cuda(0)

    if cmd_args.model:
        print("Loading model: {}".format(cmd_args.model))
        state_dict = torch.load(cmd_args.model)
        model.load_state_dict(state_dict)

    criterion = torch.nn.CrossEntropyLoss(ignore_index=padding_idx)
    learning_rate = cmd_args.learning_rate
    loss_history = []
    for epoch in range(1, cmd_args.epochs + 1):
        if epoch >= cmd_args.start_decay_at and len(loss_history) > 1 \
                and loss_history[-1] > loss_history[-2]:
            learning_rate *= cmd_args.learning_rate_decay
        optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

        train_losses = []
        correct_answer_count = 0
        total_question_count = 0
        total_processing_chars = 0
        start_time = time.time()
        for batch_idx, batch in enumerate(train_iter, 1):
            optimizer.zero_grad()
            inputs, src_length = batch.src
            y_ = model(inputs, src_length)
            y_ = y_.view(-1, num_classes)
            y = batch.trg[0]
            y = y.view(-1)
            loss = criterion(y_, y)
            loss.backward()
            optimizer.step()
            train_losses.append(loss.data[0])

            _, prediction = torch.max(y_, dim=1)
            total_question_count += prediction.size()[0]
            correct_answer_count += (prediction == y).float().sum().data[0]
            total_processing_chars += torch.sum(src_length)

            if batch_idx % cmd_args.batches_per_print == 0:
                average_loss = np.mean(train_losses)
                end_time = time.time()
                cps = int(total_processing_chars / (end_time - start_time))
                print("{}-{}(BS: {}), TrainLoss: {:.4f}, Accuracy: {:.4f}, LR:{:.4f}, "
                      "Time: {:.2f} s, Speed: {} chars/s".format(
                          epoch, batch_idx, cmd_args.batch_size, average_loss,
                          correct_answer_count / total_question_count,
                          learning_rate, end_time - start_time, cps))
                print("Sentence:   {}".format("".join(
                    src.vocab.itos[x[0]] for x in batch.src[0].data)))
                prediction = prediction.view(-1, batch.batch_size)
                print("Prediction: {}".format("".join(
                    tgt.vocab.itos[x[0]] for x in prediction.data)))
                y = y.view(-1, batch.batch_size)
                print("Answer    : {}".format("".join(
                    tgt.vocab.itos[x[0]] for x in y.data)))
                train_losses = []
                correct_answer_count = 0
                total_question_count = 0
                total_processing_chars = 0
                start_time = end_time

        model.train(False)
        cv_losses = []
        for cv_batch in dev_iter:
            inputs, src_length = cv_batch.src
            y_ = model(inputs, src_length)
            y_ = y_.view(-1, num_classes)
            y = cv_batch.trg[0]
            y = y.view(-1)
            loss = criterion(y_, y)
            cv_losses.append(loss.data[0])
        cv_average_loss = np.mean(cv_losses)
        loss_history.append(cv_average_loss)
        model.train(True)

        filename = "models/spacer_{:02d}_{:.4f}.pth".format(epoch, cv_average_loss)
        print("Saving a file: {}".format(filename))
        torch.save(model.state_dict(), filename)

    print("== Summary ==")
    for i, l in enumerate(loss_history, start=1):
        print("Epoch: {}, CV Loss: {}".format(i, l))
    print("")
    print('done')
print("WARNING: You have a CUDA device, so you should probably run with --cuda") else: torch.cuda.manual_seed(args.seed) ############################ # Load data ############################ print ("Loading data...") PAD_WORD = '<blank>' eval_batch_size = args.eval_batch_size src = data.Field(pad_token=PAD_WORD) trg = data.Field(pad_token=PAD_WORD) train_data = datasets.TranslationDataset(path=args.data + '/train', exts=('.en', '.de'), fields=(src, trg)) val_data = datasets.TranslationDataset(path=args.data + '/valid', exts=('.en', '.de'), fields=(src, trg)) test_data = datasets.TranslationDataset(path=args.data + '/test', exts=('.en', '.de'), fields=(src, trg)) print ("DONE\n") ############################ # Load vocab ############################ print ("Loading vocab...") vocab = dict(torch.load(args.dict_path, "text")) v = vocab['tgt'] v.stoi = defaultdict(lambda: 0, v.stoi) src.vocab = v; trg.vocab = v