def run(args):
    writer = SummaryWriter()
    src, tgt, _, _ = build_dataset(args)
    print('Loading test data split.')
    _, _, test_gen = datasets.Multi30k.splits(
        exts=(build_file_extension(args.src_language),
              build_file_extension(args.tgt_language)),
        fields=(('src', src), ('tgt', tgt)),
        filter_pred=lambda x: len(vars(x)['src']) <= args.max_seq_length
        and len(vars(x)['tgt']) <= args.max_seq_length)
    print('Finished loading test data split.')
    src_vocab_size = len(src.vocab.itos)
    tgt_vocab_size = len(tgt.vocab.itos)
    _, _, test_iterator = data.Iterator.splits(
        (_, _, test_gen),
        sort_key=lambda x: len(x.src),
        batch_sizes=(args.batch_size, args.batch_size, args.batch_size))
    print('Instantiating model...')
    device = args.device
    model = Transformer(src_vocab_size,
                        tgt_vocab_size,
                        device,
                        p_dropout=args.dropout)
    model = model.to(device)
    model.load_state_dict(torch.load(args.model))
    print('Model instantiated!')
    print('Starting testing...')
    test(model, test_iterator, src.vocab, tgt.vocab, args, writer)
    print('Finished testing.')
def __init__(self, src_vocab, tgt_vocab, src_vocab_size, tgt_vocab_size, args):
    self.max_seq_length = args.max_seq_length
    self.device = args.device
    self.src_vocab = src_vocab
    self.tgt_vocab = tgt_vocab
    self.beam_size = args.beam_size
    model = Transformer(src_vocab_size, tgt_vocab_size, args.device)
    model.load_state_dict(torch.load(args.model))
    model = model.to(args.device)
    self.model = model
    self.model.eval()
def do_predict():
    train_iterator, valid_iterator, test_iterator, SRC, TGT = prepare_data_multi30k()
    src_pad_idx = SRC.vocab.stoi[SRC.pad_token]
    tgt_pad_idx = TGT.vocab.stoi[TGT.pad_token]
    src_vocab_size = len(SRC.vocab)
    tgt_vocab_size = len(TGT.vocab)
    model = Transformer(n_src_vocab=src_vocab_size,
                        n_trg_vocab=tgt_vocab_size,
                        src_pad_idx=src_pad_idx,
                        trg_pad_idx=tgt_pad_idx,
                        d_word_vec=256,
                        d_model=256,
                        d_inner=512,
                        n_layer=3,
                        n_head=8,
                        dropout=0.1,
                        n_position=200)
    model.cuda()

    model_dir = "./checkpoint/transformer"
    model_path = os.path.join(model_dir, "model_9.pt")
    state_dict = torch.load(model_path)
    model.load_state_dict(state_dict)
    model.eval()

    pre_sents = []
    gth_sents = []
    for batch_idx, batch in enumerate(test_iterator):
        if batch_idx % 10 == 0:
            print("[TIME] --- time: {} --- [TIME]".format(time.ctime(time.time())))
        # src_seq / tgt_seq: [batch_size, seq_len] (the code below indexes samples along dim 0)
        src_seq, src_len = batch.src
        tgt_seq, tgt_len = batch.trg
        batch_size = src_seq.size(0)

        pre_tokens = []
        with torch.no_grad():
            for idx in range(batch_size):
                tokens = translate_tokens(src_seq[idx], SRC, TGT, model, max_len=32)
                pre_tokens.append(tokens)

        # gth_tokens: [batch_size, seq_len]
        gth_tokens = tgt_seq.cpu().detach().numpy().tolist()
        for tokens, gth_ids in zip(pre_tokens, gth_tokens):
            gth = [TGT.vocab.itos[idx] for idx in gth_ids]
            pre_sents.append(" ".join(tokens))
            gth_sents.append(" ".join(gth))

    pre_path = os.path.join(model_dir, "pre.json")
    gth_path = os.path.join(model_dir, "gth.json")
    with open(pre_path, "w", encoding="utf-8") as writer:
        json.dump(pre_sents, writer, ensure_ascii=False, indent=4)
    with open(gth_path, "w", encoding="utf-8") as writer:
        json.dump(gth_sents, writer, ensure_ascii=False, indent=4)
def main():
    pprint(arg)

    # load dataset
    train_loader, valid_loader, test_loader = prepare_dataloaders(arg)
    print("Data loaded. Instances: {} train / {} dev / {} test".format(
        len(train_loader), len(valid_loader), len(test_loader)))

    # prepare model
    device = torch.device('cuda' if arg["cuda"] == True else 'cpu')
    # print(len(train_loader.dataset.w2i))
    # nice, we can index internal properties of CNNDMDataset from the loader!
    print()
    transformer_network = Transformer(
        len(train_loader.dataset.w2i),  # src_vocab_size
        len(train_loader.dataset.w2i),  # tgt_vocab_size, is equal to src size
        train_loader.dataset.conf["max_sequence_len"],  # max_token_seq_len, from the preprocess config
        tgt_emb_prj_weight_sharing=True,  # opt.proj_share_weight
        emb_src_tgt_weight_sharing=True,  # opt.embs_share_weight
        d_k=arg["d_k"],
        d_v=arg["d_v"],
        d_model=arg["d_model"],
        d_word_vec=arg["d_model"],  # d_word_vec
        d_inner=arg["d_inner_hid"],
        n_layers=arg["n_layers"],
        n_head=arg["n_head"],
        dropout=arg["dropout"]).to(device)
    print("Transformer model initialized.")
    print()

    # train model
    optimizer = transformer.optimizers.ScheduledOptim(
        optim.Adam(
            # apply only to parameters that require_grad
            filter(lambda x: x.requires_grad, transformer_network.parameters()),
            betas=(0.9, 0.98),
            eps=1e-09),
        arg["d_model"], arg["n_warmup_steps"])

    train(transformer_network, train_loader, valid_loader, test_loader,
          optimizer, device, arg)
def __init__(self, iterator, params, mode):
    """Initialize model, build graph.

    Args:
        iterator: input iterator providing src, tgt and label tensors.
        params: parameters.
        mode: train | eval | predict mode defined with tf.estimator.ModeKeys.
    """
    # Build graph.
    tf.logging.info("Initializing model, building graph...")
    # Predict single product embedding.
    if mode == tf.estimator.ModeKeys.PREDICT:
        self.encode = Transformer(params, False)(iterator.src)
    else:
        logits = Transformer(params, True)(iterator.src, iterator.tgt)
        with tf.name_scope("loss"):
            self.loss = contrastive_loss(iterator.label, logits)
        with tf.name_scope("accuracy"):
            self.accuracy = compute_accuracy(iterator.label, logits)
    self.model_stats()
def test_bert_trans():
    if args.bert is True:
        sample_transformer = TransformerBert(num_layers=2,
                                             d_model=512,
                                             num_heads=8,
                                             dff=2048,
                                             input_vocab_size=8500,
                                             target_vocab_size=8000,
                                             model_dir=args.bert_model_dir,
                                             pe_input=10000,
                                             pe_target=6000)
    else:
        sample_transformer = Transformer(num_layers=2,
                                         d_model=512,
                                         num_heads=8,
                                         dff=2048,
                                         input_vocab_size=8500,
                                         target_vocab_size=8000,
                                         pe_input=10000,
                                         pe_target=6000)

    temp_input = tf.random.uniform((64, 38), dtype=tf.int64, minval=0, maxval=200)
    temp_seg = tf.ones((64, 38), dtype=tf.int64)
    temp_target = tf.random.uniform((64, 36), dtype=tf.int64, minval=0, maxval=200)

    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
        temp_input, temp_target)

    if args.bert is True:
        fn_out, _ = sample_transformer(temp_input,
                                       temp_seg,
                                       temp_target,
                                       training=True,
                                       enc_padding_mask=enc_padding_mask,
                                       look_ahead_mask=combined_mask,
                                       dec_padding_mask=dec_padding_mask)
    else:
        fn_out, _ = sample_transformer(temp_input,
                                       temp_target,
                                       training=False,
                                       enc_padding_mask=None,
                                       look_ahead_mask=None,
                                       dec_padding_mask=None)

    tf.compat.v1.logging.info(fn_out.shape)  # (batch_size, tar_seq_len, target_vocab_size)
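# create_masks is not defined in this snippet. A minimal sketch, following the
# standard TensorFlow Transformer tutorial and assuming token id 0 is the
# padding id (an assumption, not confirmed by the source), would be:
def create_padding_mask(seq):
    # 1.0 at padding positions, broadcastable to (batch, heads, seq_q, seq_k)
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]


def create_look_ahead_mask(size):
    # upper-triangular mask that hides future target positions
    return 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)


def create_masks(inp, tar):
    enc_padding_mask = create_padding_mask(inp)   # encoder self-attention
    dec_padding_mask = create_padding_mask(inp)   # decoder's second attention block
    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
    dec_target_padding_mask = create_padding_mask(tar)
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
    return enc_padding_mask, combined_mask, dec_padding_mask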
def run(args):
    writer = SummaryWriter()
    src, tgt, train_iterator, val_iterator = build_dataset(args)
    src_vocab_size = len(src.vocab.itos)
    tgt_vocab_size = len(tgt.vocab.itos)
    print('Instantiating model...')
    device = args.device
    model = Transformer(src_vocab_size,
                        tgt_vocab_size,
                        device,
                        p_dropout=args.dropout)
    model = model.to(device)
    if args.checkpoint is not None:
        model.load_state_dict(torch.load(args.checkpoint))
    else:
        for p in model.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
    print('Model instantiated!')
    optimizer = optim.Adam(model.parameters(),
                           lr=args.lr,
                           betas=(0.9, 0.98),
                           eps=1e-9)
    print('Starting training...')
    for epoch in range(args.epochs):
        acc = train(model, epoch + 1, train_iterator, optimizer, src.vocab,
                    tgt.vocab, args, writer)
        model_file = 'models/model_' + str(epoch) + '_' + str(acc) + '.pth'
        torch.save(model.state_dict(), model_file)
        print('Saved model to ' + model_file)
        validate(model, epoch + 1, val_iterator, src.vocab, tgt.vocab, args, writer)
    print('Finished training.')
def __init__(self,
             tokenizer: Tokenizer,
             maximum_position_encoding=1000,
             num_layers=6,
             d_model=512,
             num_heads=8,
             dff=2048,
             dropout_rate=0.1):
    self.tokenizer = tokenizer
    self.d_model = d_model
    vocab_size = tokenizer.get_num_tokens()

    self.learning_rate = CustomSchedule(d_model)
    self.optimizer = tf.keras.optimizers.Adam(self.learning_rate,
                                              beta_1=0.9,
                                              beta_2=0.98,
                                              epsilon=1e-9)
    self.loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')
    self.train_loss = tf.keras.metrics.Mean(name='train_loss')
    self.train_accuracy = tf.keras.metrics.Mean(name='train_accuracy')

    self.transformer = Transformer(num_layers, d_model, num_heads, dff,
                                   vocab_size, maximum_position_encoding,
                                   dropout_rate)

    self.checkpoint_path = './checkpoints/train'
    self.ckpt = tf.train.Checkpoint(transformer=self.transformer,
                                    optimizer=self.optimizer)
    self.ckpt_manager = tf.train.CheckpointManager(self.ckpt,
                                                   self.checkpoint_path,
                                                   max_to_keep=5)
    if self.ckpt_manager.latest_checkpoint:
        self.ckpt.restore(self.ckpt_manager.latest_checkpoint)
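# CustomSchedule is not defined in these snippets. A minimal sketch, assuming it
# follows the warmup schedule from the original Transformer paper (as in the
# TensorFlow tutorial); the real implementation in the source project may differ:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """lr = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)"""

    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        step = tf.cast(step, tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)
        return tf.math.rsqrt(self.d_model) * tf.minimum(arg1, arg2)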
def get_model_gec():
    global args, transformer, tokenizer_ro
    vocab_size = args.dict_size + 2

    learning_rate = CustomSchedule(args.d_model)
    optimizer = tf.keras.optimizers.Adam(learning_rate,
                                         beta_1=0.9,
                                         beta_2=0.98,
                                         epsilon=1e-9)
    if args.bert is True:
        transformer = TransformerBert(args.num_layers,
                                      args.d_model,
                                      args.num_heads,
                                      args.dff,
                                      vocab_size,
                                      vocab_size,
                                      model_dir=args.bert_model_dir,
                                      pe_input=vocab_size,
                                      pe_target=vocab_size,
                                      rate=args.dropout,
                                      args=args)
        tf.compat.v1.logging.info('transformer bert loaded')
    else:
        transformer = Transformer(args.num_layers,
                                  args.d_model,
                                  args.num_heads,
                                  args.dff,
                                  vocab_size,
                                  vocab_size,
                                  pe_input=vocab_size,
                                  pe_target=vocab_size,
                                  rate=args.dropout)
        tf.compat.v1.logging.info('transformer model constructed')
    return transformer, optimizer
def do_train():
    train_iterator, valid_iterator, test_iterator, SRC, TGT = prepare_data_multi30k()
    src_pad_idx = SRC.vocab.stoi[SRC.pad_token]
    tgt_pad_idx = TGT.vocab.stoi[TGT.pad_token]
    src_vocab_size = len(SRC.vocab)
    tgt_vocab_size = len(TGT.vocab)
    model = Transformer(n_src_vocab=src_vocab_size,
                        n_trg_vocab=tgt_vocab_size,
                        src_pad_idx=src_pad_idx,
                        trg_pad_idx=tgt_pad_idx,
                        d_word_vec=256,
                        d_model=256,
                        d_inner=512,
                        n_layer=3,
                        n_head=8,
                        dropout=0.1,
                        n_position=200)
    model.cuda()

    optimizer = Adam(model.parameters(), lr=5e-4)
    num_epoch = 10
    results = []
    model_dir = os.path.join("./checkpoint/transformer")
    for epoch in range(num_epoch):
        train_loss, train_accuracy = train_epoch(model, optimizer, train_iterator,
                                                 tgt_pad_idx, smoothing=False)
        eval_loss, eval_accuracy = eval_epoch(model, valid_iterator, tgt_pad_idx,
                                              smoothing=False)
        os.makedirs(model_dir, exist_ok=True)
        model_path = os.path.join(model_dir, f"model_{epoch}.pt")
        torch.save(model.state_dict(), model_path)
        results.append({"epoch": epoch,
                        "train_loss": train_loss,
                        "eval_loss": eval_loss})
        print("[TIME] --- {} --- [TIME]".format(time.ctime(time.time())))
        print("epoch: {}, train_loss: {}, eval_loss: {}".format(
            epoch, train_loss, eval_loss))
        print("epoch: {}, train_accuracy: {}, eval_accuracy: {}".format(
            epoch, train_accuracy, eval_accuracy))

    result_path = os.path.join(model_dir, "result.json")
    with open(result_path, "w", encoding="utf-8") as writer:
        json.dump(results, writer, ensure_ascii=False, indent=4)
import pickle
import random
import time

import numpy as np
import torch

from config import device, logger, data_file, vocab_file
from transformer.transformer import Transformer

if __name__ == '__main__':
    # filename = 'transformer.pt'
    filename = 'BEST'
    print('loading {}...'.format(filename))
    start = time.time()
    model = Transformer()
    model.load_state_dict(torch.load(filename))
    print('elapsed {} sec'.format(time.time() - start))
    model = model.to(device)
    model.eval()

    assert (1 == 0)

    logger.info('loading samples...')
    start = time.time()
    with open(data_file, 'rb') as file:
        data = pickle.load(file)
    samples = data['valid']
    elapsed = time.time() - start
    logger.info('elapsed: {:.4f} seconds'.format(elapsed))

    logger.info('loading vocab...')
    start = time.time()
def train_net(args):
    torch.manual_seed(7)
    np.random.seed(7)
    checkpoint = args.checkpoint
    start_epoch = 0
    best_loss = float('inf')
    writer = SummaryWriter()
    epochs_since_improvement = 0

    # Initialize / load checkpoint
    if checkpoint is None:
        # model
        encoder = Encoder(n_src_vocab, args.n_layers_enc, args.n_head,
                          args.d_k, args.d_v, args.d_model, args.d_inner,
                          dropout=args.dropout, pe_maxlen=args.pe_maxlen)
        decoder = Decoder(
            sos_id, eos_id, n_tgt_vocab, args.d_word_vec, args.n_layers_dec,
            args.n_head, args.d_k, args.d_v, args.d_model, args.d_inner,
            dropout=args.dropout,
            tgt_emb_prj_weight_sharing=args.tgt_emb_prj_weight_sharing,
            pe_maxlen=args.pe_maxlen)
        model = Transformer(encoder, decoder)
        # print(model)
        # model = nn.DataParallel(model)

        # optimizer
        optimizer = TransformerOptimizer(
            torch.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09))
    else:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint['epoch'] + 1
        epochs_since_improvement = checkpoint['epochs_since_improvement']
        model = checkpoint['model']
        optimizer = checkpoint['optimizer']

    # Move to GPU, if available
    model = model.to(device)

    # Custom dataloaders
    train_dataset = AiChallenger2017Dataset('train')
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               collate_fn=pad_collate,
                                               shuffle=True,
                                               num_workers=args.num_workers)
    valid_dataset = AiChallenger2017Dataset('valid')
    valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                               batch_size=args.batch_size,
                                               collate_fn=pad_collate,
                                               shuffle=False,
                                               num_workers=args.num_workers)

    # Epochs
    for epoch in range(start_epoch, args.epochs):
        # One epoch's training
        train_loss = train(train_loader=train_loader,
                           model=model,
                           optimizer=optimizer,
                           epoch=epoch,
                           logger=logger,
                           writer=writer)
        writer.add_scalar('epoch/train_loss', train_loss, epoch)
        writer.add_scalar('epoch/learning_rate', optimizer.lr, epoch)

        print('\nLearning rate: {}'.format(optimizer.lr))
        print('Step num: {}\n'.format(optimizer.step_num))

        # One epoch's validation
        valid_loss = valid(valid_loader=valid_loader, model=model, logger=logger)
        writer.add_scalar('epoch/valid_loss', valid_loss, epoch)

        # Check if there was an improvement
        is_best = valid_loss < best_loss
        best_loss = min(valid_loss, best_loss)
        if not is_best:
            epochs_since_improvement += 1
            print("\nEpochs since last improvement: %d\n" % (epochs_since_improvement,))
        else:
            epochs_since_improvement = 0

        # Save checkpoint
        save_checkpoint(epoch, epochs_since_improvement, model, optimizer,
                        best_loss, is_best)
def main():
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='batch size in training (default: 32)')
    parser.add_argument('--workers', type=int, default=4,
                        help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs', type=int, default=10,
                        help='number of max epochs in training (default: 10)')
    parser.add_argument('--lr', type=float, default=0.0001,
                        help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing', type=float, default=0.5,
                        help='teacher forcing ratio in decoder (default: 0.5)')
    parser.add_argument('--max_len', type=int, default=WORD_MAXLEN,
                        help='maximum characters of sentence (default: 80)')
    parser.add_argument('--no_cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--save_name', type=str, default='model',
                        help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument("--pause", type=int, default=0)
    parser.add_argument('--word', action='store_true',
                        help='Train/Predict model using word based label (default: False)')
    parser.add_argument('--gen_label_index', action='store_true',
                        help='Generate word label index map (default: False)')
    parser.add_argument('--iteration', type=str, help='Iteration')
    parser.add_argument('--premodel_session', type=str,
                        help='Session name of premodel')

    # transformer model parameters
    parser.add_argument('--d_model', type=int, default=128,
                        help='transformer_d_model')
    parser.add_argument('--n_head', type=int, default=8,
                        help='transformer_n_head')
    parser.add_argument('--num_encoder_layers', type=int, default=4,
                        help='num_encoder_layers')
    parser.add_argument('--num_decoder_layers', type=int, default=4,
                        help='transformer_num_decoder_layers')
    parser.add_argument('--dim_feedforward', type=int, default=2048,
                        help='transformer_dim_feedforward')
    parser.add_argument('--dropout', type=float, default=0.1,
                        help='transformer_dropout')

    # transformer warmup parameters
    parser.add_argument('--warmup_multiplier', type=int, default=3,
                        help='transformer_warmup_multiplier')
    parser.add_argument('--warmup_epoch', type=int, default=10,
                        help='transformer_warmup_epoch')

    args = parser.parse_args()

    char_loader = CharLabelLoader()
    char_loader.load_char2index('./hackathon.labels')
    label_loader = char_loader
    if args.word:
        if args.gen_label_index:
            generate_word_label_index_file(char_loader, TRAIN_LABEL_CHAR_PATH)
            from subprocess import call
            call(f'cat {TRAIN_LABEL_CHAR_PATH}', shell=True)
        word_loader = CharLabelLoader()
        word_loader.load_char2index('./hackathon.pos.labels')
        label_loader = word_loader
        if os.path.exists(TRAIN_LABEL_CHAR_PATH):
            generate_word_label_file(char_loader, word_loader,
                                     TRAIN_LABEL_POS_PATH, TRAIN_LABEL_CHAR_PATH)
    char2index = label_loader.char2index
    index2char = label_loader.index2char
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    ############ model
    print("model: transformer")
    # model = Transformer(d_model=args.d_model, n_head=args.n_head,
    #                     num_encoder_layers=args.num_encoder_layers,
    #                     num_decoder_layers=args.num_decoder_layers,
    #                     dim_feedforward=args.dim_feedforward,
    #                     dropout=args.dropout, vocab_size=len(char2index),
    #                     sound_maxlen=SOUND_MAXLEN, word_maxlen=WORD_MAXLEN)

    encoder = Encoder(d_input=128,
                      n_layers=6,
                      n_head=4,
                      d_k=128,
                      d_v=128,
                      d_model=128,
                      d_inner=2048,
                      dropout=0.1,
                      pe_maxlen=SOUND_MAXLEN)
    decoder = Decoder(sos_id=SOS_token,
                      eos_id=EOS_token,
                      n_tgt_vocab=len(char2index),
                      d_word_vec=128,
                      n_layers=6,
                      n_head=4,
                      d_k=128,
                      d_v=128,
                      d_model=128,
                      d_inner=2048,
                      dropout=0.1,
                      tgt_emb_prj_weight_sharing=True,
                      pe_maxlen=SOUND_MAXLEN)
    model = Transformer(encoder, decoder)

    optimizer = TransformerOptimizer(
        torch.optim.Adam(model.parameters(), lr=0.0004, betas=(0.9, 0.98), eps=1e-09))
    ############/

    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)

    model = nn.DataParallel(model).to(device)

    """
    optimizer = optim.Adam(model.module.parameters(), lr=args.lr)
    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.max_epochs)
    scheduler_warmup = GradualWarmupScheduler(optimizer,
                                              multiplier=args.warmup_multiplier,
                                              total_epoch=args.warmup_epoch,
                                              after_scheduler=scheduler_cosine)
    criterion = nn.CrossEntropyLoss(reduction='sum', ignore_index=PAD_token).to(device)
    """

    bind_model(model, optimizer)

    if args.pause == 1:
        nsml.paused(scope=locals())

    if args.mode != "train":
        return

    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
    wav_paths = list()
    script_paths = list()

    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"
            wav_path, script_path = line.strip().split(',')
            wav_paths.append(os.path.join(DATASET_PATH, 'train_data', wav_path))
            script_paths.append(os.path.join(DATASET_PATH, 'train_data', script_path))

    best_loss = 1e10
    begin_epoch = 0

    # load all target scripts for reducing disk i/o
    # target_path = os.path.join(DATASET_PATH, 'train_label')
    target_path = TRAIN_LABEL_CHAR_PATH
    if args.word:
        target_path = TRAIN_LABEL_POS_PATH
    load_targets(target_path)

    train_batch_num, train_dataset_list, valid_dataset = split_dataset(
        args, wav_paths, script_paths, valid_ratio=0.05)

    if args.iteration:
        if args.premodel_session:
            nsml.load(args.iteration, session=args.premodel_session)
            logger.info(f'Load {args.premodel_session} {args.iteration}')
        else:
            nsml.load(args.iteration)
            logger.info(f'Load {args.iteration}')

    logger.info('start')
    train_begin = time.time()

    for epoch in range(begin_epoch, args.max_epochs):
        # learning rate scheduler
        train_queue = queue.Queue(args.workers * 2)
        train_loader = MultiLoader(train_dataset_list, train_queue,
                                   args.batch_size, args.workers)
        train_loader.start()

        train_loss, train_cer = train(model, train_batch_num, train_queue,
                                      optimizer, device, train_begin,
                                      args.workers, 10, args.teacher_forcing)
        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f' %
                    (epoch, train_loss, train_cer))
        train_loader.join()

        print("~~~~~~~~~~~~")

        if epoch == 10 or (epoch > 48 and epoch % 10 == 9):
            valid_queue = queue.Queue(args.workers * 2)
            valid_loader = BaseDataLoader(valid_dataset, valid_queue,
                                          args.batch_size, 0)
            valid_loader.start()

            eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue,
                                           device, args.max_len, args.batch_size)
            logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' %
                        (epoch, eval_loss, eval_cer))
            valid_loader.join()

            nsml.report(False,
                        step=epoch,
                        train_epoch__loss=train_loss,
                        train_epoch__cer=train_cer,
                        eval__loss=eval_loss,
                        eval__cer=eval_cer)

            best_model = (eval_loss < best_loss)
            nsml.save(args.save_name)
            if best_model:
                nsml.save('best')
                best_loss = eval_loss
def train_net(args):
    # Set random seeds so that runs are reproducible
    torch.manual_seed(7)
    np.random.seed(7)
    checkpoint = args.checkpoint
    start_epoch = 0
    writer = SummaryWriter()

    if checkpoint is None:
        # model
        encoder = Encoder(Config.vocab_size, args.n_layers_enc, args.n_head,
                          args.d_k, args.d_v, args.d_model, args.d_inner,
                          dropout=args.dropout, pe_maxlen=args.pe_maxlen)
        decoder = Decoder(Config.sos_id, Config.eos_id, Config.vocab_size,
                          args.d_word_vec, args.n_layers_dec, args.n_head,
                          args.d_k, args.d_v, args.d_model, args.d_inner,
                          dropout=args.dropout,
                          tgt_emb_prj_weight_sharing=args.tgt_emb_prj_weight_sharing,
                          pe_maxlen=args.pe_maxlen)
        model = Transformer(encoder, decoder)
        # optimizer
        optimizer = TransformerOptimizer(
            torch.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09))
    else:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint['epoch'] + 1
        model = checkpoint['model']
        optimizer = checkpoint['optimizer']

    # Move to GPU, if available
    model = model.to(Config.device)

    # Custom dataloaders; collate_fn is given because the batches need padding
    train_dataset = TranslateDataset()
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               collate_fn=pad_collate,
                                               shuffle=True,
                                               num_workers=args.num_workers)

    # Epochs
    Loss_list = []
    for epoch in range(start_epoch, args.epochs):
        # One epoch's training
        train_loss = train(train_loader=train_loader,
                           model=model,
                           optimizer=optimizer,
                           epoch=epoch,
                           logger=logger,
                           writer=writer)
        l = str(train_loss)
        Loss_list.append(l)
        l_temp = l + '\n'
        with open('loss_epoch.txt', 'a+') as f:
            f.write(l_temp)

        writer.add_scalar('epoch/train_loss', train_loss, epoch)
        writer.add_scalar('epoch/learning_rate', optimizer.lr, epoch)

        print('\nLearning rate: {}'.format(optimizer.lr))
        print('Step num: {}\n'.format(optimizer.step_num))

        # Save checkpoint
        save_checkpoint(epoch, model, optimizer, train_loss)

    with open('loss.txt', 'w') as f:
        f.write('\n'.join(Loss_list))
def main() -> None:
    # Configure command line flags.
    parser = argparse.ArgumentParser(
        description='Validate TZ zone files with ZoneSpecifier.')

    # Extractor flags.
    parser.add_argument('--input_dir',
                        help='Location of the input directory',
                        required=True)

    # Transformer flags.
    parser.add_argument(
        '--scope',
        # basic: 241 of the simpler time zones for BasicZoneSpecifier
        # extended: all 348 time zones for ExtendedZoneSpecifier
        choices=['basic', 'extended'],
        help='Size of the generated database (basic|extended)',
        required=True,
    )
    parser.add_argument(
        '--start_year',
        help='Start year of Zone Eras (default: 2000)',
        type=int,
        default=2000,
    )
    parser.add_argument(
        '--until_year',
        help='Until year of Zone Eras (default: 2038)',
        type=int,
        default=2038,
    )
    parser.add_argument(
        '--granularity',
        help=(
            'If given, overrides the other granularity flags to '
            'truncate UNTIL, AT, STDOFF (offset), SAVE (delta) and '
            'RULES (rulesDelta) fields to this many seconds (default: None)'),
        type=int,
    )
    parser.add_argument(
        '--until_at_granularity',
        help=(
            'Truncate UNTIL and AT fields to this many seconds (default: 60)'),
        type=int,
    )
    parser.add_argument(
        '--offset_granularity',
        help=('Truncate STDOFF (offset) fields to this many seconds '
              '(default: 900 (basic), 60 (extended))'),
        type=int,
    )
    parser.add_argument(
        '--delta_granularity',
        help=('Truncate SAVE (delta) and RULES (rulesDelta) fields to this '
              'many seconds (default: 900)'),
        type=int,
    )
    parser.add_argument(
        '--strict',
        help='Remove zones and rules not aligned at granularity time boundary',
        action='store_true',
        default=True,
    )
    parser.add_argument(
        '--nostrict',
        help='Retain zones and rules not aligned at granularity time boundary',
        action='store_false',
        dest='strict',
    )

    # Validator flags.
    parser.add_argument(
        '--zone',
        help='Name of time zone to validate (default: all zones)',
    )
    parser.add_argument(
        '--year',
        help='Year to validate (default: start_year, until_year)',
        type=int,
    )
    parser.add_argument('--validate_buffer_size',
                        help='Validate the transition buffer size',
                        action="store_true")
    parser.add_argument('--validate_test_data',
                        help='Validate the TestDataGenerator with pytz',
                        action="store_true")
    parser.add_argument(
        '--validate_dst_offset',
        # Not enabled by default because pytz DST seems to be buggy.
        help='Validate the DST offset as well as the total UTC offset',
        action="store_true")
    parser.add_argument('--debug_validator',
                        help='Enable debug output from Validator',
                        action="store_true")

    # ZoneSpecifier flags
    parser.add_argument(
        '--viewing_months',
        help='Number of months to use for calculations (13, 14, 36)',
        type=int,
        default=14)
    parser.add_argument('--debug_specifier',
                        help='Enable debug output from ZoneSpecifier',
                        action="store_true")
    parser.add_argument(
        '--in_place_transitions',
        help='Use in-place Transition array to determine Active Transitions',
        action="store_true")
    parser.add_argument('--optimize_candidates',
                        help='Optimize the candidate transitions',
                        action='store_true')

    # TestDataGenerator flags.
    #
    # pytz cannot handle dates after the end of 32-bit Unix time_t type
    # (2038-01-19T03:14:07Z), see
    # https://answers.launchpad.net/pytz/+question/262216, so the
    # validation_until_year cannot be greater than 2038.
    parser.add_argument(
        '--validation_start_year',
        help='Start year of ZoneSpecifier validation (default: start_year)',
        type=int,
        default=0)
    parser.add_argument(
        '--validation_until_year',
        help='Until year of ZoneSpecifier validation (default: 2038)',
        type=int,
        default=0)

    # Parse the command line arguments
    args = parser.parse_args()

    # Configure logging. This should normally be executed after the
    # parser.parse_args() because it allows us to set the logging.level using
    # a flag.
    logging.basicConfig(level=logging.INFO)

    # Define scope-dependent granularity if not overridden by flag
    if args.granularity:
        until_at_granularity = args.granularity
        offset_granularity = args.granularity
        delta_granularity = args.granularity
    else:
        if args.until_at_granularity:
            until_at_granularity = args.until_at_granularity
        else:
            until_at_granularity = 60

        if args.offset_granularity:
            offset_granularity = args.offset_granularity
        else:
            if args.scope == 'basic':
                offset_granularity = 900
            else:
                offset_granularity = 60

        if args.delta_granularity:
            delta_granularity = args.delta_granularity
        else:
            delta_granularity = 900

    logging.info('Granularity for UNTIL/AT: %d', until_at_granularity)
    logging.info('Granularity for STDOFF (offset): %d', offset_granularity)
    logging.info(
        'Granularity for RULES (rulesDelta) and SAVE (delta): %d',
        delta_granularity,
    )

    # Extract the TZ files
    logging.info('======== Extracting TZ Data files')
    extractor = Extractor(args.input_dir)
    extractor.parse()
    extractor.print_summary()
    policies_map, zones_map, links_map = extractor.get_data()

    # Create initial TransformerResult
    tresult = TransformerResult(
        zones_map=zones_map,
        policies_map=policies_map,
        links_map=links_map,
        removed_zones={},
        removed_policies={},
        removed_links={},
        notable_zones={},
        notable_policies={},
        notable_links={},
        zone_ids={},
        letters_per_policy={},
        letters_map={},
        formats_map={},
    )

    # Transform the TZ zones and rules
    logging.info('======== Transforming Zones and Rules')
    logging.info('Extracting years [%d, %d)', args.start_year, args.until_year)
    transformer = Transformer(
        tresult=tresult,
        scope=args.scope,
        start_year=args.start_year,
        until_year=args.until_year,
        until_at_granularity=until_at_granularity,
        offset_granularity=offset_granularity,
        delta_granularity=delta_granularity,
        strict=args.strict,
    )
    transformer.transform()
    transformer.print_summary()
    tresult = transformer.get_data()

    # Generate internal versions of zone_infos and zone_policies
    # so that ZoneSpecifier can be created.
    logging.info('======== Generating inlined zone_infos and zone_policies')
    inline_zone_info = InlineZoneInfo(tresult.zones_map, tresult.policies_map)
    zone_infos, zone_policies = inline_zone_info.generate_zonedb()
    logging.info('Inlined zone_infos=%d; zone_policies=%d', len(zone_infos),
                 len(zone_policies))

    # Set the defaults for validation_start_year and validation_until_year
    # if they were not specified.
    validation_start_year = (args.start_year
                             if args.validation_start_year == 0
                             else args.validation_start_year)
    validation_until_year = (args.until_year
                             if args.validation_until_year == 0
                             else args.validation_until_year)

    validate(
        zone_infos=zone_infos,
        zone_policies=zone_policies,
        zone=args.zone,
        year=args.year,
        start_year=validation_start_year,
        until_year=validation_until_year,
        validate_buffer_size=args.validate_buffer_size,
        validate_test_data=args.validate_test_data,
        viewing_months=args.viewing_months,
        validate_dst_offset=args.validate_dst_offset,
        debug_validator=args.debug_validator,
        debug_specifier=args.debug_specifier,
        in_place_transitions=args.in_place_transitions,
        optimize_candidates=args.optimize_candidates,
    )

    logging.info('======== Finished processing TZ Data files.')
def sample_output(
        model: transformer.Transformer,
        input_seq: torch.LongTensor,
        eos_index: int,
        pad_index: int,
        max_len: int
) -> torch.LongTensor:
    """Samples an output sequence based on the provided input.

    Args:
        model (:class:`transformer.Transformer`): The model to use.
        input_seq (torch.LongTensor): The input sequence to be provided to the model. This has to be a
            (batch-size x input-seq-len)-tensor.
        eos_index (int): The index that indicates the end of a sequence.
        pad_index (int): The index that indicates a padding token in a sequence.
        max_len (int): The maximum length of the generated output.

    Returns:
        torch.LongTensor: The generated output sequence as (batch-size x output-seq-len)-tensor.
    """
    # sanitize args
    if not isinstance(model, transformer.Transformer):
        raise TypeError("The <model> has to be a transformer.Transformer!")
    if not isinstance(input_seq, torch.LongTensor) and not isinstance(input_seq, torch.cuda.LongTensor):
        raise TypeError("The <input_seq> has to be a LongTensor!")
    if input_seq.dim() != 2:
        raise ValueError("<input_seq> has to be a matrix!")
    if not isinstance(eos_index, int):
        raise TypeError("The <eos_index> has to be an integer!")
    if eos_index < 0 or eos_index >= model.output_size:
        raise ValueError("The <eos_index> is not a legal index in the vocabulary used by <model>!")
    if not isinstance(pad_index, int):
        raise TypeError("The <pad_index> has to be an integer!")
    if pad_index < 0 or pad_index >= model.output_size:
        raise ValueError("The <pad_index> is not a legal index in the vocabulary used by <model>!")
    if max_len is not None:
        if not isinstance(max_len, int):
            raise TypeError("<max_len> has to be an integer!")
        if max_len < 1:
            raise ValueError("<max_len> has to be > 0!")

    original_mode = model.training  # the original mode (train/eval) of the provided model
    batch_size = input_seq.size(0)  # number of samples in the provided input sequence

    # put model in evaluation mode
    model.eval()

    output_seq = []  # used to store the generated outputs for each position
    finished = [False] * batch_size

    for _ in range(max_len):
        # prepare the target to provide to the model
        # this is the current output with an additional final entry that is supposed to be predicted next
        # (which is why the concrete value does not matter)
        current_target = torch.cat(output_seq + [input_seq.new(batch_size, 1).zero_()], dim=1)

        # run the model
        probs = model(input_seq, current_target)[:, -1, :]

        # sample the next output from the computed probabilities
        output = torch.multinomial(probs, 1)

        # determine which samples have been finished, and replace the sampled output with padding
        # for those that are finished already
        for sample_idx in range(batch_size):
            if finished[sample_idx]:
                output[sample_idx, 0] = pad_index
            elif output[sample_idx, 0].item() == eos_index:
                finished[sample_idx] = True

        # store created output
        output_seq.append(output)

        # check whether generation has been finished
        if all(finished):
            break

    # restore original mode of the model
    model.train(mode=original_mode)

    return torch.cat(output_seq, dim=1)
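# Usage sketch (not part of the original source): `model` is assumed to be an
# already-trained transformer.Transformer, and the token indices used here
# (0 = padding, 2 = end-of-sequence) are hypothetical.
def _sampling_example(model: transformer.Transformer) -> torch.LongTensor:
    # two input sequences of length 4, each ending in the assumed EOS index 2
    input_seq = torch.LongTensor([[5, 7, 9, 2],
                                  [4, 4, 6, 2]])
    # returns a (batch-size x output-seq-len) tensor, padded after EOS
    return sample_output(model, input_seq, eos_index=2, pad_index=0, max_len=20)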
if __name__ == '__main__':
    MAX_SEQUENCE_LENGTH = 10  # [number of tokens]

    # for reproducible results:
    make_results_reproducible()

    print("\nA Toy Source-to-Target Copy Task")

    # ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
    # initializing the model:
    model = Transformer(src_vocabulary_dimension=11,
                        tgt_vocabulary_dimension=11,
                        n_encoder_blocks=6,
                        n_decoder_blocks=6,
                        representation_dimension=512,
                        feedforward_dimension=2048,
                        n_attention_heads=8,
                        max_sequence_length=MAX_SEQUENCE_LENGTH,
                        dropout_prob=0.1)
    # ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~

    # evaluating a single prediction before training:
    print("\nEvaluating a single prediction before training:")
    src_sequence = torch_unsqueeze(
        input=tensor(list(range(1, MAX_SEQUENCE_LENGTH + 1))),  # noqa: E501 pylint: disable=not-callable
        # input=tensor([2] * MAX_SEQUENCE_LENGTH),  # TODO  # noqa: E501 pylint: disable=not-callable
        dim=0)
    src_sequence_mask = torch_ones((1, 1, MAX_SEQUENCE_LENGTH))
    tgt_sequence_prediction = model.predict(src_sequences=src_sequence,
                                            src_masks=src_sequence_mask,
                  pe_maxlen=args.pe_maxlen)
decoder = Decoder(
    sos_id, eos_id, vocab_size, args.d_word_vec, args.n_layers_dec,
    args.n_head, args.d_k, args.d_v, args.d_model, args.d_inner,
    dropout=args.dropout,
    tgt_emb_prj_weight_sharing=args.tgt_emb_prj_weight_sharing,
    pe_maxlen=args.pe_maxlen)
model = Transformer(encoder, decoder)

for i in range(3):
    print("\n***** Utt", i + 1)
    Ti = i + 20
    input = torch.randn(Ti, D)
    length = torch.tensor([Ti], dtype=torch.int)
    nbest_hyps = model.recognize(input, length, char_list, args)

file_path = "./temp.pth"
optimizer = torch.optim.Adam(model.parameters())
torch.save(model.serialize(model, optimizer, 1, LFR_m=1, LFR_n=1), file_path)

model, LFR_m, LFR_n = Transformer.load_model(file_path)
print(model)
def eval_probability(
        model: transformer.Transformer,
        input_seq: torch.LongTensor,
        target_seq: torch.LongTensor,
        pad_index: int = None
) -> torch.FloatTensor:
    """Computes the probability that the provided model computes a target sequence given an input sequence.

    Args:
        model (:class:`transformer.Transformer`): The model to use.
        input_seq (torch.LongTensor): The input sequence to be provided to the model. This has to be a
            (batch-size x input-seq-len)-tensor.
        target_seq (torch.LongTensor): The target sequence whose probability is being evaluated. This has to be a
            (batch-size x target-seq-len)-tensor.
        pad_index (int, optional): The index that indicates a padding token in a sequence. If ``target_seq`` is
            padded, then the ``pad_index`` has to be provided in order to allow for computing the probabilities for
            relevant parts of the target sequence only.

    Returns:
        torch.FloatTensor: A 1D-tensor of size (batch-size), which contains one probability for each sample in
            ``input_seq`` and ``target_seq``, respectively.
    """
    if not isinstance(model, transformer.Transformer):
        raise TypeError("The <model> has to be a transformer.Transformer!")
    if not isinstance(input_seq, torch.LongTensor) and not isinstance(input_seq, torch.cuda.LongTensor):
        raise TypeError("The <input_seq> has to be a LongTensor!")
    if input_seq.dim() != 2:
        raise ValueError("<input_seq> has to be a 2D-tensor!")
    if input_seq.is_cuda:
        if not isinstance(target_seq, torch.cuda.LongTensor):
            raise TypeError("The <target_seq> has to be of the same type as <input_seq>, i.e., cuda.LongTensor!")
    elif not isinstance(target_seq, torch.LongTensor):
        raise TypeError("The <target_seq> has to be of the same type as <input_seq>, i.e., LongTensor!")
    if target_seq.dim() != 2:
        raise ValueError("<target_seq> has to be a 2D-tensor!")
    if input_seq.size(0) != target_seq.size(0):
        raise ValueError("<input_seq> and <target_seq> use different batch sizes!")
    if pad_index is not None and not isinstance(pad_index, int):
        raise TypeError("The <pad_index>, if provided, has to be an integer!")

    batch_size = input_seq.size(0)
    max_seq_len = input_seq.size(1)

    # put model in evaluation mode
    original_mode = model.training  # store original mode (train/eval) to be restored eventually
    model.eval()

    # run the model to compute the needed probabilities
    predictions = model(input_seq, target_seq)

    # determine the lengths of the target sequences
    if pad_index is not None:
        mask = util.create_padding_mask(target_seq, pad_index)[:, 0, :]
        seq_len = mask.sum(dim=1).cpu().numpy().tolist()
    else:
        seq_len = (np.ones(batch_size, dtype=np.long) * max_seq_len).tolist()

    # compute the probabilities for each of the provided samples
    sample_probs = torch.ones(batch_size)
    for sample_idx in range(batch_size):  # iterate over each sample
        for token_idx in range(seq_len[sample_idx]):  # iterate over each position in the output sequence
            sample_probs[sample_idx] *= predictions[sample_idx, token_idx, target_seq[sample_idx, token_idx]].item()

    # restore original mode of the model
    model.train(mode=original_mode)

    return sample_probs
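# Usage sketch (not part of the original source): `model` is assumed to be a
# trained transformer.Transformer; the sequences and the padding index 0 are
# hypothetical.
def _probability_example(model: transformer.Transformer) -> torch.FloatTensor:
    input_seq = torch.LongTensor([[5, 7, 9, 2],
                                  [4, 4, 6, 2]])
    target_seq = torch.LongTensor([[8, 3, 2, 0],
                                   [1, 9, 5, 2]])  # first sample padded with 0
    # returns a 1D tensor with one probability per sample in the batch
    return eval_probability(model, input_seq, target_seq, pad_index=0)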
""" import time import torch from transformer.transformer import Transformer if __name__ == '__main__': ''' 从tar中提取模型 整理成pt文件 ''' checkpoint = 'BEST_Model.tar' print('loading {}...'.format(checkpoint)) start = time.time() checkpoint = torch.load(checkpoint) print('elapsed {} sec'.format(time.time() - start)) model = checkpoint['model'] print(type(model)) filename = 'reading_comprehension.pt' print('saving {}...'.format(filename)) start = time.time() torch.save(model.state_dict(), filename) print('elapsed {} sec'.format(time.time() - start)) print('loading {}...'.format(filename)) start = time.time() model = Transformer() model.load_state_dict(torch.load(filename)) print('elapsed {} sec'.format(time.time() - start))
def main() -> None:
    """
    Main driver for TZ Database compiler which parses the IANA TZ Database
    files located at the --input_dir and generates zoneinfo files and
    validation datasets for unit tests at --output_dir.

    Usage:
        tzcompiler.py [flags...]
    """
    # Configure command line flags.
    parser = argparse.ArgumentParser(description='Generate Zone Info.')

    # Extractor flags.
    parser.add_argument('--input_dir',
                        help='Location of the input directory',
                        required=True)

    # Transformer flags.
    parser.add_argument(
        '--scope',
        # basic: 241 of the simpler time zones for BasicZoneSpecifier
        # extended: all 348 time zones for ExtendedZoneSpecifier
        choices=['basic', 'extended'],
        help='Size of the generated database (basic|extended)',
        required=True)
    parser.add_argument('--start_year',
                        help='Start year of Zone Eras (default: 2000)',
                        type=int,
                        default=2000)
    parser.add_argument('--until_year',
                        help='Until year of Zone Eras (default: 2038)',
                        type=int,
                        default=2038)
    parser.add_argument(
        '--granularity',
        help=(
            'If given, overrides the other granularity flags to '
            'truncate UNTIL, AT, STDOFF (offset), SAVE (delta) and '
            'RULES (rulesDelta) fields to this many seconds (default: None)'),
        type=int,
    )
    parser.add_argument(
        '--until_at_granularity',
        help=(
            'Truncate UNTIL and AT fields to this many seconds (default: 60)'),
        type=int,
    )
    parser.add_argument(
        '--offset_granularity',
        help=('Truncate STDOFF (offset) fields to this many seconds '
              '(default: 900 (basic), 60 (extended))'),
        type=int,
    )
    parser.add_argument(
        '--delta_granularity',
        help=('Truncate SAVE (delta) and RULES (rulesDelta) fields to this '
              'many seconds (default: 900)'),
        type=int,
    )

    # Make --strict the default, --nostrict optional.
    parser.add_argument(
        '--strict',
        help='Remove zones and rules not aligned at granularity time boundary',
        action='store_true',
        default=True,
    )
    parser.add_argument(
        '--nostrict',
        help='Retain zones and rules not aligned at granularity time boundary',
        action='store_false',
        dest='strict',
    )

    # Data pipeline selectors. Reduced down to a single 'zonedb' option which
    # is the default.
    parser.add_argument(
        '--action',
        help='Action to perform (zonedb)',
        default='zonedb',
    )

    # Language selector (for --action zonedb).
    parser.add_argument(
        '--language',
        help='Comma-separated list of target languages '
             '(arduino|python|json|zonelist)',
        default='',
    )

    # C++ namespace names for '--language arduino'. If not specified, it will
    # automatically be set to 'zonedb' or 'zonedbx' depending on the 'scope'.
    parser.add_argument(
        '--db_namespace',
        help='C++ namespace for the zonedb files (default: zonedb or zonedbx)',
    )

    # For language=json, specify the output file.
    parser.add_argument(
        '--json_file',
        help='The JSON output file (default: zonedb.json)',
        default='zonedb.json',
    )

    # The tz_version does not affect any data processing. Its value is
    # copied into the various generated files and usually placed in the
    # comments section to describe the source of the data that generated the
    # various files.
    parser.add_argument(
        '--tz_version',
        help='Version string of the TZ files',
        required=True,
    )

    # Target location of the generated files.
    parser.add_argument(
        '--output_dir',
        help='Location of the output directory',
        default='',
    )

    # Flag to ignore max_buf_size check. Needed on ExtendedHinnantDateTest if
    # we want to test the extended year range from 1974 to 2050, because one
    # of the zones requires a buf_size=9, but ExtendedZoneProcessor only
    # supports 8.
    parser.add_argument(
        '--ignore_buf_size_too_large',
        help='Ignore transition buf size too large',
        action='store_true',
    )

    # Parse the command line arguments
    args = parser.parse_args()

    # Manually parse the comma-separated --language flag.
    languages = set(args.language.split(','))
    allowed_languages = set(['arduino', 'python', 'json', 'zonelist'])
    if not languages.issubset(allowed_languages):
        print(f'Invalid --language: {languages - allowed_languages}')
        sys.exit(1)

    # Configure logging. This should normally be executed after the
    # parser.parse_args() because it allows us to set the logging.level using
    # a flag.
    logging.basicConfig(level=logging.INFO)

    # How the script was invoked
    invocation = ' '.join(sys.argv)

    # Define scope-dependent granularity if not overridden by flag
    if args.granularity:
        until_at_granularity = args.granularity
        offset_granularity = args.granularity
        delta_granularity = args.granularity
    else:
        if args.until_at_granularity:
            until_at_granularity = args.until_at_granularity
        else:
            until_at_granularity = 60

        if args.offset_granularity:
            offset_granularity = args.offset_granularity
        else:
            if args.scope == 'basic':
                offset_granularity = 900
            else:
                offset_granularity = 60

        if args.delta_granularity:
            delta_granularity = args.delta_granularity
        else:
            delta_granularity = 900

    logging.info('======== TZ Compiler settings')
    logging.info(f'Scope: {args.scope}')
    logging.info(
        f'Start year: {args.start_year}; Until year: {args.until_year}')
    logging.info(f'Strict: {args.strict}')
    logging.info(f'TZ Version: {args.tz_version}')
    logging.info('Ignore too large transition buf_size: '
                 f'{args.ignore_buf_size_too_large}')
    logging.info('Granularity for UNTIL/AT: %d', until_at_granularity)
    logging.info('Granularity for STDOFF (offset): %d', offset_granularity)
    logging.info(
        'Granularity for RULES (rulesDelta) and SAVE (delta): %d',
        delta_granularity,
    )

    # Extract the TZ files
    logging.info('======== Extracting TZ Data files')
    extractor = Extractor(args.input_dir)
    extractor.parse()
    extractor.print_summary()
    policies_map, zones_map, links_map = extractor.get_data()

    # Create initial TransformerResult
    tresult = TransformerResult(
        zones_map=zones_map,
        policies_map=policies_map,
        links_map=links_map,
        removed_zones={},
        removed_policies={},
        removed_links={},
        notable_zones={},
        notable_policies={},
        notable_links={},
        zone_ids={},
        letters_per_policy={},
        letters_map={},
        formats_map={},
    )

    # Transform the TZ zones and rules
    logging.info('======== Transforming Zones and Rules')
    logging.info('Extracting years [%d, %d)', args.start_year, args.until_year)
    transformer = Transformer(
        tresult=tresult,
        scope=args.scope,
        start_year=args.start_year,
        until_year=args.until_year,
        until_at_granularity=until_at_granularity,
        offset_granularity=offset_granularity,
        delta_granularity=delta_granularity,
        strict=args.strict,
    )
    transformer.transform()
    transformer.print_summary()
    tresult = transformer.get_data()

    # Generate the fields for the Arduino zoneinfo data.
    logging.info('======== Transforming to Arduino Zones and Rules')
    arduino_transformer = ArduinoTransformer(
        tresult=tresult,
        scope=args.scope,
        start_year=args.start_year,
        until_year=args.until_year,
    )
    arduino_transformer.transform()
    arduino_transformer.print_summary()
    tresult = arduino_transformer.get_data()

    # Estimate the buffer size of ExtendedZoneProcessor.TransitionStorage.
    logging.info('======== Estimating transition buffer sizes')
    logging.info('Checking years in [%d, %d)', args.start_year, args.until_year)
    estimator = BufSizeEstimator(
        zones_map=tresult.zones_map,
        policies_map=tresult.policies_map,
        start_year=args.start_year,
        until_year=args.until_year,
    )
    buf_size_info: BufSizeInfo = estimator.estimate()

    # Check if the estimated buffer size is too big
    if buf_size_info['max_buf_size'] > EXTENDED_ZONE_PROCESSOR_MAX_TRANSITIONS:
        msg = (f"Max buffer size={buf_size_info['max_buf_size']} "
               f"is larger than ExtendedZoneProcessor.kMaxTransitions="
               f"{EXTENDED_ZONE_PROCESSOR_MAX_TRANSITIONS}")
        if args.ignore_buf_size_too_large:
            logging.warning(msg)
        else:
            raise Exception(msg)

    # Collect TZ DB data into a single JSON-serializable object.
    zidb = create_zone_info_database(
        tz_version=args.tz_version,
        tz_files=Extractor.ZONE_FILES,
        scope=args.scope,
        start_year=args.start_year,
        until_year=args.until_year,
        until_at_granularity=until_at_granularity,
        offset_granularity=offset_granularity,
        delta_granularity=delta_granularity,
        strict=args.strict,
        tresult=tresult,
        buf_size_info=buf_size_info,
    )

    if args.action == 'zonedb':
        logging.info('======== Generating zonedb files')
        for language in languages:
            generate_zonedb(
                invocation=invocation,
                db_namespace=args.db_namespace,
                language=language,
                output_dir=args.output_dir,
                zidb=zidb,
                json_file=args.json_file,
            )
    else:
        logging.error(f"Unrecognized action '{args.action}'")
        sys.exit(1)

    logging.info('======== Finished processing TZ Data files.')
""" # flat_shape = hp["numcep"] * hp["nb_time"] d_input = hp["numcep"] label_shape = len(train_speaker_list) # model d_m = hp["d_m"] encoder = Encoder(d_input=d_input, n_layers=2, d_k=d_m, d_v=d_m, d_m=d_m, d_ff=hp["d_ff"], dropout=0.1).to(device) pooling = SelfAttentionPooling(d_m, dropout=0.1).to(device) model = Transformer(encoder, pooling, d_m, label_shape, dropout=0.2).to(device) opt = torch.optim.Adam(model.parameters(), lr=hp["lr"], weight_decay=hp["weight_decay"]) loss_func = torch.nn.CrossEntropyLoss() best_eer = 99. if hp["comet"]: with experiment.train(): for epoch in tqdm(range(epochs)): cce_loss = fit(model, loss_func, opt, train_ds_gen, device) experiment.log_metric("cce", cce_loss, epoch=epoch) val_eer = test(model,
                  pe_maxlen=args.pe_maxlen)
decoder = Decoder(
    sos_id, eos_id, vocab_size, args.d_word_vec, args.n_layers_dec,
    args.n_head, args.d_k, args.d_v, args.d_model, args.d_inner,
    dropout=args.dropout,
    tgt_emb_prj_weight_sharing=args.tgt_emb_prj_weight_sharing,
    pe_maxlen=args.pe_maxlen)
model = Transformer(encoder, decoder)

optimizer = TransformerOptimizer(
    torch.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09),
    args.k, args.d_model, args.warmup_steps)
print(args.k)
print(args.d_model)
print(args.warmup_steps)

lr_list = []
for step_num in range(1, 50000):
    # print(step_num)
    lr_1 = k * init_lr * min(step_num ** (-0.5),
                             step_num * (warmup_steps ** (-1.5)))
    optimizer.step()
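# TransformerOptimizer is not shown in these snippets. A minimal sketch that is
# consistent with the lr formula used above (init_lr is assumed to be
# d_model^-0.5, as in the Noam schedule); the real wrapper in the source
# project may differ:
class TransformerOptimizer:
    def __init__(self, optimizer, k=1.0, d_model=512, warmup_steps=4000):
        self.optimizer = optimizer
        self.k = k
        self.init_lr = d_model ** (-0.5)
        self.warmup_steps = warmup_steps
        self.step_num = 0
        self.lr = 0.0

    def zero_grad(self):
        self.optimizer.zero_grad()

    def step(self):
        # increase the step count, update the learning rate of the wrapped
        # optimizer, then take an optimization step
        self.step_num += 1
        self.lr = self.k * self.init_lr * min(
            self.step_num ** (-0.5),
            self.step_num * self.warmup_steps ** (-1.5))
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = self.lr
        self.optimizer.step()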
import pickle
import time

import numpy as np
import torch

from config import Config, logger
from transformer.transformer import Transformer

if __name__ == '__main__':
    # Run export.py first to export the model.
    filename = 'reading_comprehension.pt'  # location of the exported model
    print('loading {}...'.format(filename))
    start = time.time()
    model = Transformer()
    model.load_state_dict(torch.load(filename))
    print('elapsed {} sec'.format(time.time() - start))
    model = model.to(Config.device)
    model.eval()

    # Load the test set
    logger.info('loading samples...')
    start = time.time()
    with open(Config.data_file, 'rb') as file:
        data = pickle.load(file)
    samples = data
    elapsed = time.time() - start
    logger.info('elapsed: {:.4f} seconds'.format(elapsed))
class Translator(nn.Module):
    def __init__(self, vocabulary_size_in, vocabulary_size_out, constants, hyperparams):
        super(Translator, self).__init__()
        self.Transformer = Transformer(vocabulary_size_in, vocabulary_size_out,
                                       constants, hyperparams)
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = optim.Adam(self.Transformer.parameters(),
                                    betas=(0.9, 0.98), eps=1e-9)
        self.scheduler = Scheduler(d_model=hyperparams.D_MODEL,
                                   warmup_steps=hyperparams.WARMUP_STEPS)
        self.constants = constants
        self.hyperparams = hyperparams

    def count_parameters(self):
        return sum(p.numel() for p in self.parameters() if p.requires_grad)

    def fit(self, training_steps, data_training, data_eval=None):
        '''
        Arg:
            data_training: iterator which gives two batches: one for the source
                language and one for the target language
        '''
        writer = SummaryWriter()
        training_loss, gradient_norm = [], []
        for i in tqdm(range(training_steps)):
            X, Y = next(data_training)
            batch_size = X.shape[0]
            bos = torch.zeros(batch_size, 1).fill_(self.constants.BOS_IDX).to(
                self.constants.DEVICE, torch.long)
            translation = torch.cat((bos, Y[:, :-1]), dim=1)
            output = self.Transformer(X, translation)
            output = output.contiguous().view(-1, output.size(-1))
            target = Y.contiguous().view(-1)

            lr = self.scheduler.step()
            for p in self.optimizer.param_groups:
                p['lr'] = lr
            self.optimizer.zero_grad()
            loss = self.criterion(output, target)
            training_loss.append(loss.item())
            loss.backward()
            self.optimizer.step()

            temp = 0
            for p in self.Transformer.parameters():
                temp += torch.sum(p.grad.data ** 2)
            temp = np.sqrt(temp.cpu())
            gradient_norm.append(temp)

            if ((i + 1) % self.hyperparams.EVAL_EVERY_TIMESTEPS) == 0:
                torch.save(self.state_dict(), self.constants.WEIGHTS_FILE)
                writer.add_scalar('0_training_set/loss',
                                  np.mean(training_loss), i)
                writer.add_scalar('0_training_set/gradient_norm',
                                  np.mean(gradient_norm), i)
                writer.add_scalar('2_other/lr', lr, i)
                training_loss, gradient_norm = [], []

                if data_eval:
                    eval_references = []
                    eval_hypotheses = []
                    for l, (X_batch, Y_batch) in enumerate(data_eval):
                        for i in range(Y_batch.shape[0]):
                            eval_references.append(data_eval.itotok(Y_batch[i]))
                        hypotheses = self.translate(X_batch)
                        for i in range(len(hypotheses)):
                            eval_hypotheses.append(data_eval.itotok(hypotheses[i]))

                    def subwords_to_string(subwords):
                        string = ""
                        for subword in subwords:
                            if subword[-2:] == "@@":
                                string += subword[:-2]
                            elif subword != self.constants.PADDING_WORD:
                                string += subword + " "
                        return string

                    for i, (ref, hyp) in enumerate(
                            zip(eval_references, eval_hypotheses)):
                        eval_references[i] = subwords_to_string(ref)
                        eval_hypotheses[i] = subwords_to_string(hyp)

                    ex_phrases = ''
                    for i, (ref, hyp) in enumerate(
                            zip(eval_references, eval_hypotheses)):
                        ex_phrases = (ex_phrases + "\n truth: " + ref +
                                      "\n prediction: " + hyp + "\n")
                        if i == 4:
                            break

                    BLEU = nltk.translate.bleu_score.corpus_bleu(
                        eval_references, eval_hypotheses)
                    writer.add_scalar('1_eval_set/BLEU', BLEU, i)
                    writer.add_text('examples', ex_phrases, i)

    def translate(self, X):
        '''
        Arg:
            X: batch of phrases to translate: tensor(nb_texts, nb_tokens)
        '''
        self.train(False)
        batch_size, max_seq = X.shape
        max_seq += 10  # TODO: remove hard code
        temp = torch.zeros(batch_size, max_seq).type(torch.LongTensor).to(
            self.constants.DEVICE)
        temp[:, 0] = self.constants.BOS_IDX
        enc = self.Transformer.forward_encoder(X)
        for j in range(1, max_seq):
            output = self.Transformer.forward_decoder(X, enc, temp)
            output = torch.argmax(output, dim=-1)
            temp[:, j] = output[:, j - 1]

        # remove padding
        translations = []
        for translation in temp:
            temp2 = []
            for i in range(max_seq):
                if translation[i] == self.constants.PADDING_IDX:
                    break
                if i != 0:
                    temp2.append(translation[i])
            translations.append(temp2)
        return translations