def __init__(self, config_path, model_path, model_type):
    """
    Restore a trained NMT model and keep it, its configs and both vocabularies
    on the instance.

    :param config_path: configuration file handed to ``prepare_configs``
    :param model_path: parameter file handed to ``load_model_parameters``
    :param model_type: stored unchanged as ``self.model_type``
    """
    # Echo the constructor arguments, one per line.
    for argument in (config_path, model_path, model_type):
        print(argument)
    self.model_type = model_type

    all_configs = prepare_configs(config_path)
    data_section = all_configs['data_configs']
    model_section = all_configs['model_configs']

    # Entry 0 of "vocabularies" is used for the source side, entry 1 for the target side.
    source_vocab = Vocabulary.build_from_file(
        **data_section['vocabularies'][0])
    target_vocab = Vocabulary.build_from_file(
        **data_section['vocabularies'][1])

    network = build_model(
        n_src_vocab=source_vocab.max_n_words,
        n_tgt_vocab=target_vocab.max_n_words,
        padding_idx=source_vocab.pad,
        **model_section
    )

    # Parameters are mapped to the CPU while loading; the module is moved to
    # the GPU afterwards and switched to evaluation mode.
    restored_params = load_model_parameters(model_path, map_location="cpu")
    network.load_state_dict(restored_params)
    network.cuda()
    network.eval()

    self.model = network
    self.data_configs = data_section
    self.model_configs = model_section
    self.vocab_src = source_vocab
    self.vocab_tgt = target_vocab
def get_avg_UNK_dist(config_path, model_path, batch_size=50, reload=True):
    """
    Measure how far the UNK embedding is from the other source embeddings.

    Note that, despite the function name, no averaging happens: the value
    returned is the largest absolute element-wise difference between any row
    of the source embedding matrix and the UNK row.

    :param config_path: YAML configuration of the victim model; surrounding
        whitespace in the path is stripped before opening
    :param model_path: checkpoint read with ``torch.load``; the encoder
        embedding is taken from
        ``["model"]["encoder.embeddings.embeddings.weight"]``
    :param batch_size: not used by this function
    :param reload: not used by this function
    :return: zero-dimensional tensor holding the maximum absolute difference
    """
    # Read the configuration file.
    with open(config_path.strip()) as config_file:
        parsed = yaml.load(config_file, Loader=yaml.FullLoader)
    data_section = parsed["data_configs"]
    model_section = parsed["model_configs"]

    # Source vocabulary: only its size is needed here.
    source_vocab = Vocabulary(**data_section["vocabularies"][0])

    # Rebuild the embedding table and fill it with the trained weights;
    # strict loading makes a shape mismatch fail loudly.
    table = nn.Embedding(
        num_embeddings=source_vocab.max_n_words,
        embedding_dim=model_section["d_word_vec"],
        padding_idx=PAD,
    )
    checkpoint = torch.load(model_path, map_location="cpu")
    trained_weight = checkpoint["model"]["encoder.embeddings.embeddings.weight"]
    table.load_state_dict({"weight": trained_weight}, strict=True)

    # Offset of every embedding row from the UNK row, then its largest
    # absolute component.
    offsets = table.weight - table.weight[UNK]
    return torch.abs(offsets).max()
def __init__(self, vocab_file, corpus_dir, video_path, phase, DEBUG=False):
    """
    Set up the dataset for one phase and build its image transforms.

    :param vocab_file: file used to construct the ``Vocabulary``
    :param corpus_dir: stored on the instance as ``corpus_dir``
    :param video_path: stored on the instance as ``video_path``
    :param phase: 'train', 'dev', 'test'; selects the entry of the loaded
        video list that becomes ``self.data_dict``
    :param DEBUG: when equal to ``True`` only the first 101 samples are kept
    """
    # Plain settings first: ``load_video_list`` runs at the end, after all of
    # them have been assigned.
    self.vocab_file = vocab_file
    self.image_type = 'png'
    self.max_video_len = 300
    self.corpus_dir = corpus_dir
    self.video_path = video_path
    self.phase = phase
    self.sample = True
    self.input_shape = 112
    self.alignment = {}
    self.vocab = Vocabulary(self.vocab_file)

    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    def build_pipeline(crop):
        # Resize to 128x128, crop to the input size, tensorise, normalise.
        return transforms.Compose([
            transforms.Resize((128, 128)),
            crop,
            transforms.ToTensor(),
            normalize,
        ])

    # Random crop for the default transform, centre crop for the test transform.
    self.transform = build_pipeline(transforms.RandomCrop(self.input_shape))
    self.test_transform = build_pipeline(transforms.CenterCrop(self.input_shape))

    self.phoenix_dataset = self.load_video_list()
    samples = self.phoenix_dataset[phase]
    # Equality with True (not truthiness) is what the original check did.
    if DEBUG == True:  # noqa: E712
        samples = samples[:101]
    self.data_dict = samples

    message = '[DATASET: {:s}]: total {:d} samples.'.format(
        phase, len(self.data_dict))
    logging.info(message)
def train(config_path, model_path, model_type, src_filename, trg_filename):
    """
    Score every sentence pair of a parallel corpus with a pre-trained model.

    Despite its name this function never updates parameters: it runs the
    model over the given files and records, for every sentence pair, the
    negative log-likelihood of the target divided by the number of non-pad
    target tokens.

    :param config_path: configuration file handed to ``prepare_configs``
    :param model_path: pre-trained parameters handed to ``load_pretrained_model``
    :param model_type: only printed
    :param src_filename: source-side text file
    :param trg_filename: target-side text file
    :return: dict mapping the sequence number yielded by the data iterator to
        the per-token loss (a Python float) of that sentence pair
    """
    # ================================================================================== #
    # Device initialisation (single GPU, rank 0)
    Constants.USE_GPU = True
    print(config_path)
    print(model_path)
    print(model_type)

    world_size = 1
    rank = 0
    local_rank = 0

    if Constants.USE_GPU:
        torch.cuda.set_device(local_rank)
        Constants.CURRENT_DEVICE = "cuda:{0}".format(local_rank)
    else:
        Constants.CURRENT_DEVICE = "cpu"

    # ================================================================================== #
    # Parsing configuration files
    configs = prepare_configs(config_path)

    data_configs = configs['data_configs']
    model_configs = configs['model_configs']
    training_configs = configs['training_configs']

    INFO(pretty_configs(configs))

    Constants.SEED = training_configs['seed']
    set_seed(Constants.SEED)

    timer = Timer()

    # ================================================================================== #
    # Load Data
    INFO('Loading data...')
    timer.tic()

    vocab_src = Vocabulary.build_from_file(**data_configs['vocabularies'][0])
    vocab_tgt = Vocabulary.build_from_file(**data_configs['vocabularies'][1])

    Constants.EOS = vocab_src.eos
    Constants.PAD = vocab_src.pad
    Constants.BOS = vocab_src.bos

    valid_bitext_dataset = ZipDataset(
        TextLineDataset(
            data_path=src_filename,
            vocabulary=vocab_src,
            max_len=100,
            is_train_dataset=False,
        ),
        TextLineDataset(
            data_path=trg_filename,
            vocabulary=vocab_tgt,
            is_train_dataset=False,
            max_len=100,
        ))

    # numbering=True makes every batch carry the sequence numbers used as
    # keys of the returned dict.
    valid_iterator = DataIterator(dataset=valid_bitext_dataset,
                                  batch_size=20,
                                  use_bucket=training_configs['use_bucket'],
                                  buffer_size=training_configs['buffer_size'],
                                  numbering=True,
                                  world_size=world_size,
                                  rank=rank)

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # ================================ Begin ======================================== #
    # Build the model, move it to the GPU and load the pre-trained weights
    INFO('Building model...')
    timer.tic()
    nmt_model = build_model(n_src_vocab=vocab_src.max_n_words,
                            n_tgt_vocab=vocab_tgt.max_n_words,
                            padding_idx=vocab_src.pad,
                            vocab_src=vocab_src,
                            **model_configs)
    INFO(nmt_model)

    if Constants.USE_GPU:
        nmt_model = nmt_model.cuda()

    load_pretrained_model(nmt_model,
                          model_path,
                          device=Constants.CURRENT_DEVICE)

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # Scores must not depend on dropout noise, so the model is put in
    # evaluation mode before the scoring loop.
    nmt_model.eval()

    # ================================================================================== #
    # Scoring loop
    # These three values only feed the progress-bar postfix and never change.
    train_loss = 0.0
    valid_loss = best_valid_loss = float('inf')

    INFO('Begin training...')
    eidx = 0
    uidx = 0
    score_result = dict()

    training_iter = valid_iterator.build_generator()
    training_progress_bar = tqdm(desc=' - (Epc {}, Upd {}) '.format(eidx, uidx),
                                 total=len(valid_iterator),
                                 unit="sents")

    for batch in training_iter:
        seqs_numbers, seqs_x, seqs_y = batch
        batch_size = len(seqs_x)

        try:
            # Prepare data
            x, y = prepare_data(seqs_x, seqs_y, cuda=Constants.USE_GPU)
            y_inp = y[:, :-1].contiguous()
            y_label = y[:, 1:].contiguous()  # [batch_size, seq_len]

            # Nothing is back-propagated here, so no graph needs to be kept.
            with torch.no_grad():
                # [batch_size, seq_len, vocab_size]
                log_probs = nmt_model(x, y_inp, log_probs=True)

            _, seq_len = y_label.shape
            flat_log_probs = log_probs.view(-1, vocab_tgt.max_n_words)

            # reduction='none' keeps one loss value per target position
            # (it replaces the deprecated reduce=False).
            token_loss = F.nll_loss(flat_log_probs,
                                    y_label.view(-1),
                                    reduction='none',
                                    ignore_index=vocab_tgt.pad)
            sentence_loss = token_loss.view(batch_size, seq_len).sum(-1)

            # Normalise by the number of non-pad target tokens per sentence.
            valid_token = (y_label != vocab_tgt.pad).sum(-1)
            sentence_loss = sentence_loss.double().div(valid_token.double())

            for seq_num, l in zip(seqs_numbers, sentence_loss):
                assert seq_num not in score_result
                score_result[seq_num] = l.item()

            uidx += 1

        except RuntimeError as e:
            if 'out of memory' in str(e):
                print('| WARNING: ran out of memory, skipping batch')
            else:
                raise e

        training_progress_bar.update(batch_size)
        training_progress_bar.set_description(
            ' - (Epc {}, Upd {}) '.format(eidx, uidx))

        postfix_str = 'TrainLoss: {:.2f}, ValidLoss(best): {:.2f} ({:.2f}), '.format(
            train_loss, valid_loss, best_valid_loss)
        training_progress_bar.set_postfix_str(postfix_str)

    training_progress_bar.close()
    return score_result
def test_data(flags):
    """
    Encode the validation set and return one mean-pooled vector per sentence.

    The encoder of a pre-trained model is run over ``data_configs['valid_data']``
    and, for every source sentence, the encoder states of the non-masked
    positions are averaged.

    :param flags: object providing ``use_gpu``, ``config_path``,
        ``predefined_config``, ``pretrain_path`` and ``pretrain_exclude_prefix``
    :return: ``(all_mean_encoder_hidden, all_seq_numbers)``; the first item is
        the row-wise concatenation of the per-batch mean vectors (``None`` when
        no batch was encoded), the second the list of sequence numbers in the
        same order. Batches skipped because of an out-of-memory error
        contribute to neither.
    """
    Constants.USE_GPU = flags.use_gpu

    world_size = 1
    rank = 0
    local_rank = 0

    if Constants.USE_GPU:
        torch.cuda.set_device(local_rank)
        Constants.CURRENT_DEVICE = "cuda:{0}".format(local_rank)
    else:
        Constants.CURRENT_DEVICE = "cpu"

    # ================================================================================== #
    # Parsing configuration files
    configs = prepare_configs(flags.config_path, flags.predefined_config)

    data_configs = configs['data_configs']
    model_configs = configs['model_configs']
    training_configs = configs['training_configs']
    bt_configs = configs['bt_configs'] if 'bt_configs' in configs else None
    if bt_configs is not None:
        print("btconfigs ", bt_configs)
        if 'bt_attribute_data' not in bt_configs:
            Constants.USE_BT = False
            bt_configs = None
        else:
            Constants.USE_BT = True
            Constants.USE_BTTAG = bt_configs['use_bttag']
            Constants.USE_CONFIDENCE = bt_configs['use_confidence']

    INFO(pretty_configs(configs))

    Constants.SEED = training_configs['seed']
    set_seed(Constants.SEED)

    timer = Timer()

    # ================================================================================== #
    # Load Data
    INFO('Loading data...')
    timer.tic()

    vocab_src = Vocabulary.build_from_file(**data_configs['vocabularies'][0])
    vocab_tgt = Vocabulary.build_from_file(**data_configs['vocabularies'][1])

    Constants.EOS = vocab_src.eos
    Constants.PAD = vocab_src.pad
    Constants.BOS = vocab_src.bos

    valid_bitext_dataset = ZipDataset(
        TextLineDataset(data_path=data_configs['valid_data'][0],
                        vocabulary=vocab_src,
                        is_train_dataset=False,
                        ),
        TextLineDataset(data_path=data_configs['valid_data'][1],
                        vocabulary=vocab_tgt,
                        is_train_dataset=False
                        )
    )

    valid_iterator = DataIterator(dataset=valid_bitext_dataset,
                                  batch_size=training_configs['valid_batch_size'],
                                  use_bucket=True,
                                  buffer_size=100000,
                                  numbering=True,
                                  world_size=world_size,
                                  rank=rank,
                                  shuffle=False)

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # ================================ Begin ======================================== #
    # Build the model, move it to the GPU if requested, load pre-trained weights
    INFO('Building model...')
    timer.tic()
    nmt_model = build_model(n_src_vocab=vocab_src.max_n_words,
                            n_tgt_vocab=vocab_tgt.max_n_words,
                            padding_idx=vocab_src.pad,
                            vocab_src=vocab_src,
                            vocab_tgt=vocab_tgt,
                            **model_configs)
    INFO(nmt_model)

    if Constants.USE_GPU:
        nmt_model = nmt_model.cuda()

    load_pretrained_model(nmt_model,
                          flags.pretrain_path,
                          exclude_prefix=flags.pretrain_exclude_prefix,
                          device=Constants.CURRENT_DEVICE)
    # Only the encoder is needed from here on.
    nmt_model = nmt_model.encoder
    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    INFO('Begin training...')

    nmt_model.eval()

    # Compute the representation of every sentence in the set: mean pool.
    valid_iter = valid_iterator.build_generator()
    all_seq_numbers = []
    mean_hidden_chunks = []

    for batch in valid_iter:
        bt_attrib = None
        seq_numbers, seqs_x, seqs_y = batch

        x = prepare_data(seqs_x, seqs_y=None, cuda=Constants.USE_GPU, bt_attrib=bt_attrib)

        try:
            with torch.no_grad():
                encoder_hidden, mask = nmt_model(x)
        except RuntimeError as e:
            if 'out of memory' in str(e):
                print('| WARNING: ran out of memory, skipping batch')
                # Really skip the batch: without this the code below would
                # reuse the previous batch's tensors (or hit undefined names).
                continue
            raise e

        # Positions where the mask is False are the ones that get averaged.
        # The comparison is kept as `== False` because the dtype of `mask`
        # is not visible here.
        keep = (mask == False)  # noqa: E712
        # Follow the encoder output's device instead of forcing CUDA, so
        # the function also works with use_gpu=False.
        valid_hidden = keep.float().to(encoder_hidden.device)
        sum_encoder_hidden = (encoder_hidden * valid_hidden.unsqueeze(-1)).sum(dim=1)
        valid_tokens = keep.sum(-1)
        mean_encoder_hidden = sum_encoder_hidden.float().div(valid_tokens.unsqueeze(1))

        # Record the sequence numbers only for batches that were encoded, so
        # both return values stay aligned.
        all_seq_numbers.extend(seq_numbers)
        mean_hidden_chunks.append(mean_encoder_hidden)

    # One concatenation at the end instead of re-copying on every batch.
    if mean_hidden_chunks:
        all_mean_encoder_hidden = torch.cat(mean_hidden_chunks, dim=0)
    else:
        all_mean_encoder_hidden = None

    return all_mean_encoder_hidden, all_seq_numbers
def train2(flags):
    """
    Encode the training set and dump one mean-pooled vector per sentence to disk.

    The encoder of a pre-trained model is run over ``data_configs['train_data']``.
    For every source sentence the encoder states of the non-masked positions
    are averaged and written as one space-separated line to
    ``/home/wangdq/encoder.mean.output``; the matching sequence numbers are
    written, space-separated, to ``/home/wangdq/seq_numbers.output``.
    Batches skipped because of an out-of-memory error appear in neither file.

    :param flags: object providing ``use_gpu``, ``config_path``,
        ``predefined_config``, ``pretrain_path`` and ``pretrain_exclude_prefix``
    :return: None
    """
    # ================================================================================== #
    # Device initialisation (single process, rank 0)
    Constants.USE_GPU = flags.use_gpu

    world_size = 1
    rank = 0
    local_rank = 0

    if Constants.USE_GPU:
        torch.cuda.set_device(local_rank)
        Constants.CURRENT_DEVICE = "cuda:{0}".format(local_rank)
    else:
        Constants.CURRENT_DEVICE = "cpu"

    # ================================================================================== #
    # Parsing configuration files
    configs = prepare_configs(flags.config_path, flags.predefined_config)

    data_configs = configs['data_configs']
    model_configs = configs['model_configs']
    training_configs = configs['training_configs']
    bt_configs = configs['bt_configs'] if 'bt_configs' in configs else None
    if bt_configs is not None:
        print("btconfigs ", bt_configs)
        if 'bt_attribute_data' not in bt_configs:
            Constants.USE_BT = False
            bt_configs = None
        else:
            Constants.USE_BT = True
            Constants.USE_BTTAG = bt_configs['use_bttag']
            Constants.USE_CONFIDENCE = bt_configs['use_confidence']

    INFO(pretty_configs(configs))

    Constants.SEED = training_configs['seed']
    set_seed(Constants.SEED)

    timer = Timer()

    # ================================================================================== #
    # Load Data
    INFO('Loading data...')
    timer.tic()

    vocab_src = Vocabulary.build_from_file(**data_configs['vocabularies'][0])
    vocab_tgt = Vocabulary.build_from_file(**data_configs['vocabularies'][1])

    Constants.EOS = vocab_src.eos
    Constants.PAD = vocab_src.pad
    Constants.BOS = vocab_src.bos

    # The BT tag id is published before any dataset is constructed.
    if Constants.USE_BT and Constants.USE_BTTAG:
        Constants.BTTAG = vocab_src.bttag

    # Source and target text are always zipped; the BT attribute stream is
    # added as a third component only when back-translation data is in use.
    zipped_parts = [
        TextLineDataset(data_path=data_configs['train_data'][0],
                        vocabulary=vocab_src,
                        max_len=data_configs['max_len'][0],
                        is_train_dataset=True
                        ),
        TextLineDataset(data_path=data_configs['train_data'][1],
                        vocabulary=vocab_tgt,
                        max_len=data_configs['max_len'][1],
                        is_train_dataset=True
                        ),
    ]
    if Constants.USE_BT:
        zipped_parts.append(
            AttributeDataset(data_path=bt_configs['bt_attribute_data'],
                             is_train_dataset=True))
    train_bitext_dataset = ZipDataset(*zipped_parts)

    training_iterator = DataIterator(dataset=train_bitext_dataset,
                                     batch_size=training_configs["batch_size"],
                                     use_bucket=training_configs['use_bucket'],
                                     buffer_size=training_configs['buffer_size'],
                                     batching_func=training_configs['batching_key'],
                                     world_size=world_size,
                                     numbering=True,
                                     rank=rank)

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # ================================ Begin ======================================== #
    # Build the model, move it to the GPU if requested, load pre-trained weights
    INFO('Building model...')
    timer.tic()
    nmt_model = build_model(n_src_vocab=vocab_src.max_n_words,
                            n_tgt_vocab=vocab_tgt.max_n_words,
                            padding_idx=vocab_src.pad,
                            vocab_src=vocab_src,
                            vocab_tgt=vocab_tgt,
                            **model_configs)
    INFO(nmt_model)

    if Constants.USE_GPU:
        nmt_model = nmt_model.cuda()

    load_pretrained_model(nmt_model,
                          flags.pretrain_path,
                          exclude_prefix=flags.pretrain_exclude_prefix,
                          device=Constants.CURRENT_DEVICE)
    # Only the encoder is needed from here on.
    nmt_model = nmt_model.encoder
    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    INFO('Begin training...')

    # Compute the representation of every training sentence: mean pool.
    training_iter = training_iterator.build_generator()
    nmt_model.eval()

    all_seq_numbers = []
    # NOTE(review): output locations are hard-coded absolute paths.
    encoder_filename = "/home/wangdq/encoder.mean.output"
    seq_numbers_filename = '/home/wangdq/seq_numbers.output'
    processed = 0

    with open(encoder_filename, 'w') as f_encoder, open(seq_numbers_filename, 'w') as f_seq_numbers:
        for batch in training_iter:
            bt_attrib = None
            if Constants.USE_BT:
                # Sequence numbers start from 0.
                seq_numbers, seqs_x, seqs_y, bt_attrib = batch
            else:
                seq_numbers, seqs_x, seqs_y = batch

            x = prepare_data(seqs_x, seqs_y=None, cuda=Constants.USE_GPU, bt_attrib=bt_attrib)

            try:
                with torch.no_grad():
                    encoder_hidden, mask = nmt_model(x)
            except RuntimeError as e:
                if 'out of memory' in str(e):
                    print('| WARNING: ran out of memory, skipping batch')
                    # Really skip the batch: otherwise the previous batch's
                    # vectors would be written again under new numbers.
                    continue
                raise e

            # Positions where the mask is False are the ones that get averaged.
            # Kept as `== False` because the dtype of `mask` is not visible here.
            keep = (mask == False)  # noqa: E712
            # Follow the encoder output's device instead of forcing CUDA, so
            # the function also works with use_gpu=False.
            valid_hidden = keep.float().to(encoder_hidden.device)
            sum_encoder_hidden = (encoder_hidden * valid_hidden.unsqueeze(-1)).sum(dim=1)
            valid_tokens = keep.sum(-1)
            mean_encoder_hidden = sum_encoder_hidden.float().div(valid_tokens.unsqueeze(1))

            all_seq_numbers.extend(seq_numbers)

            # One line per sentence, components separated by single spaces.
            rows = mean_encoder_hidden.cpu().numpy().tolist()
            f_encoder.writelines(
                ' '.join(str(value) for value in row) + '\n' for row in rows)

            processed += len(seq_numbers)
            print(processed)

        # Sequence numbers go out once, after all batches, in encoding order.
        f_seq_numbers.write(' '.join(str(number) for number in all_seq_numbers))
def __init__(self, n_src_vocab, n_tgt_vocab, n_layers=6, n_head=8, d_word_vec=512, d_model=512,
             d_inner_hid=1024, dim_per_head=None, dropout=0.1, tie_input_output_embedding=True,
             tie_source_target_embedding=False, padding_idx=PAD, layer_norm_first=True,
             positional_embedding="sin", generator_bias=False, ffn_activation="relu",
             vocab_src=None, **kwargs):
    """
    Build the encoder, decoder and generator of the character-aware Transformer.

    :param n_src_vocab: source vocabulary size, passed to the encoder
    :param n_tgt_vocab: target vocabulary size, passed to decoder and generator
    :param n_layers, n_head, d_word_vec, d_model, d_inner_hid, dim_per_head,
        dropout, layer_norm_first, positional_embedding, ffn_activation:
        forwarded unchanged to both ``Encoder`` and ``Decoder``
    :param tie_input_output_embedding: share the decoder embedding matrix with
        the generator
    :param tie_source_target_embedding: share the decoder embedding matrix with
        the encoder; requires ``n_src_vocab == n_tgt_vocab``
    :param padding_idx: padding index for encoder and decoder
    :param generator_bias: forwarded to ``Generator`` as ``add_bias``
    :param vocab_src: stored as ``self.bpe_vocab``
    :param kwargs: must contain ``char_vocab``, the keyword arguments for
        ``Vocabulary.build_from_file`` describing the character vocabulary
    :raises KeyError: if ``char_vocab`` is missing from ``kwargs``
    :raises AssertionError: if ``d_model != d_word_vec``, or if source/target
        embeddings are tied while the vocabulary sizes differ
    """
    super(Transformer_Char, self).__init__()

    # Fail with an explicit message rather than a bare KeyError('char_vocab').
    if 'char_vocab' not in kwargs:
        raise KeyError(
            "Transformer_Char requires a 'char_vocab' entry in its model configs")
    self.char_vocab = Vocabulary.build_from_file(**kwargs['char_vocab'])

    # Settings shared verbatim by the encoder and the decoder.
    shared_settings = dict(
        n_layers=n_layers,
        n_head=n_head,
        d_word_vec=d_word_vec,
        d_model=d_model,
        d_inner_hid=d_inner_hid,
        dropout=dropout,
        dim_per_head=dim_per_head,
        padding_idx=padding_idx,
        layer_norm_first=layer_norm_first,
        positional_embedding=positional_embedding,
        ffn_activation=ffn_activation,
    )

    self.encoder = Encoder(n_src_vocab,
                           char_src_vocab=self.char_vocab.max_n_words,
                           **shared_settings)
    self.decoder = Decoder(n_tgt_vocab, **shared_settings)

    self.dropout = nn.Dropout(dropout)

    # Implicit string concatenation keeps the message free of the indentation
    # whitespace a backslash continuation inside the literal would embed.
    assert d_model == d_word_vec, (
        'To facilitate the residual connections, '
        'the dimensions of all module output shall be the same.')

    if tie_source_target_embedding:
        assert n_src_vocab == n_tgt_vocab, \
            "source and target vocabulary should have equal size when tying source&target embedding"
        self.encoder.embeddings.embeddings.weight = self.decoder.embeddings.embeddings.weight

    # NOTE(review): the generator is given the module-level PAD rather than the
    # `padding_idx` argument; kept as in the original - confirm this is intended.
    if tie_input_output_embedding:
        self.generator = Generator(
            n_words=n_tgt_vocab,
            hidden_size=d_word_vec,
            shared_weight=self.decoder.embeddings.embeddings.weight,
            padding_idx=PAD,
            add_bias=generator_bias)
    else:
        self.generator = Generator(n_words=n_tgt_vocab,
                                   hidden_size=d_word_vec,
                                   padding_idx=PAD,
                                   add_bias=generator_bias)

    self.bpe_vocab = vocab_src
def train(flags):
    """
    Train an NMT model, with periodic loss/BLEU validation, checkpointing,
    early stopping and optional teacher refresh (ODC).

    flags:
        use_gpu: bool
        multi_gpu: bool
        shared_dir: str, used for distributed initialisation
        saveto: str
        reload: store_true
        config_path: str
        predefined_config: str
        pretrain_path: str, default=""
        model_name: str
        log_path: str
        valid_path: str
        display_loss_detail: bool
        debug: bool

    The function returns when ``eidx`` exceeds ``training_configs["max_epochs"]``
    and raises ``SystemExit(0)`` on early stop.
    """

    # ================================================================================== #
    # Initialization for training on different devices
    # - CPU/GPU
    # - Single/Distributed
    Constants.USE_GPU = flags.use_gpu

    if flags.multi_gpu:
        dist.distributed_init(flags.shared_dir)
        world_size = dist.get_world_size()
        rank = dist.get_rank()
        local_rank = dist.get_local_rank()
    else:
        world_size = 1
        rank = 0
        local_rank = 0

    if Constants.USE_GPU:
        torch.cuda.set_device(local_rank)
        Constants.CURRENT_DEVICE = "cuda:{0}".format(local_rank)
    else:
        Constants.CURRENT_DEVICE = "cpu"

    # If not root_rank, close logging
    # else write log of training to file.
    if rank == 0:
        write_log_to_file(
            os.path.join(flags.log_path,
                         "%s.log" % time.strftime("%Y%m%d-%H%M%S")))
    else:
        close_logging()

    # ================================================================================== #
    # Parsing configuration files
    # - Load default settings
    # - Load pre-defined settings
    # - Load user-defined settings
    configs = prepare_configs(flags.config_path, flags.predefined_config)

    data_configs = configs['data_configs']
    model_configs = configs['model_configs']
    optimizer_configs = configs['optimizer_configs']
    training_configs = configs['training_configs']

    INFO(pretty_configs(configs))

    # use odc
    if training_configs['use_odc'] is True:
        ave_best_k = check_odc_config(training_configs)
    else:
        ave_best_k = 0

    Constants.SEED = training_configs['seed']
    set_seed(Constants.SEED)

    timer = Timer()

    # ================================================================================== #
    # Load Data
    INFO('Loading data...')
    timer.tic()

    # Generate target dictionary
    vocab_src = Vocabulary.build_from_file(**data_configs['vocabularies'][0])
    vocab_tgt = Vocabulary.build_from_file(**data_configs['vocabularies'][1])

    Constants.EOS = vocab_src.eos
    Constants.PAD = vocab_src.pad
    Constants.BOS = vocab_src.bos

    train_bitext_dataset = ZipDataset(
        TextLineDataset(data_path=data_configs['train_data'][0],
                        vocabulary=vocab_src,
                        max_len=data_configs['max_len'][0],
                        is_train_dataset=True),
        TextLineDataset(data_path=data_configs['train_data'][1],
                        vocabulary=vocab_tgt,
                        max_len=data_configs['max_len'][1],
                        is_train_dataset=True))

    valid_bitext_dataset = ZipDataset(
        TextLineDataset(
            data_path=data_configs['valid_data'][0],
            vocabulary=vocab_src,
            is_train_dataset=False,
        ),
        TextLineDataset(data_path=data_configs['valid_data'][1],
                        vocabulary=vocab_tgt,
                        is_train_dataset=False))

    training_iterator = DataIterator(
        dataset=train_bitext_dataset,
        batch_size=training_configs["batch_size"],
        use_bucket=training_configs['use_bucket'],
        buffer_size=training_configs['buffer_size'],
        batching_func=training_configs['batching_key'],
        world_size=world_size,
        rank=rank)

    valid_iterator = DataIterator(
        dataset=valid_bitext_dataset,
        batch_size=training_configs['valid_batch_size'],
        use_bucket=True,
        buffer_size=100000,
        numbering=True,
        world_size=world_size,
        rank=rank)

    bleu_scorer = SacreBLEUScorer(
        reference_path=data_configs["bleu_valid_reference"],
        num_refs=data_configs["num_refs"],
        lang_pair=data_configs["lang_pair"],
        sacrebleu_args=training_configs["bleu_valid_configs"]['sacrebleu_args'],
        postprocess=training_configs["bleu_valid_configs"]['postprocess'])

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # ================================ Begin ======================================== #
    # Build Model & Optimizer
    # We would do steps below one after another
    #     1. build models & criterion
    #     2. move models & criterion to gpu if needed
    #     3. load pre-trained model if needed
    #     4. build optimizer
    #     5. build learning rate scheduler if needed
    #     6. load checkpoints if needed

    # 0. Initial
    lrate = optimizer_configs['learning_rate']
    model_collections = Collections()

    checkpoint_saver = Saver(
        save_prefix="{0}.ckpt".format(
            os.path.join(flags.saveto, flags.model_name)),
        num_max_keeping=training_configs['num_kept_checkpoints'])
    best_model_prefix = os.path.join(
        flags.saveto, flags.model_name + Constants.MY_BEST_MODEL_SUFFIX)

    best_k_saver = BestKSaver(
        save_prefix="{0}.best_k_ckpt".format(
            os.path.join(flags.saveto, flags.model_name)),
        num_max_keeping=training_configs['num_kept_best_k_checkpoints'])

    # 1. Build Model & Criterion
    INFO('Building model...')
    timer.tic()
    nmt_model = build_model(n_src_vocab=vocab_src.max_n_words,
                            n_tgt_vocab=vocab_tgt.max_n_words,
                            padding_idx=vocab_src.pad,
                            vocab_src=vocab_src,
                            **model_configs)
    INFO(nmt_model)

    # build teacher model
    teacher_model, teacher_model_path = get_teacher_model(
        training_configs, model_configs, vocab_src, vocab_tgt, flags)

    # build critic
    critic = CombinationCriterion(model_configs['loss_configs'],
                                  padding_idx=vocab_tgt.pad,
                                  teacher=teacher_model)
    # INFO(critic)
    critic.INFO()

    # 2. Move to GPU
    if Constants.USE_GPU:
        nmt_model = nmt_model.cuda()
        critic = critic.cuda()

    # 3. Load pretrained model if needed
    load_pretrained_model(nmt_model,
                          flags.pretrain_path,
                          exclude_prefix=None,
                          device=Constants.CURRENT_DEVICE)

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # 4. Build optimizer
    INFO('Building Optimizer...')

    if not flags.multi_gpu:
        optim = Optimizer(name=optimizer_configs['optimizer'],
                          model=nmt_model,
                          lr=lrate,
                          grad_clip=optimizer_configs['grad_clip'],
                          optim_args=optimizer_configs['optimizer_params'],
                          update_cycle=training_configs['update_cycle'])
    else:
        optim = dist.DistributedOptimizer(
            name=optimizer_configs['optimizer'],
            model=nmt_model,
            lr=lrate,
            grad_clip=optimizer_configs['grad_clip'],
            optim_args=optimizer_configs['optimizer_params'],
            device_id=local_rank)

    # 5. Build scheduler for optimizer if needed
    scheduler = build_scheduler(
        schedule_method=optimizer_configs['schedule_method'],
        optimizer=optim,
        scheduler_configs=optimizer_configs['scheduler_configs'])

    # 6. build moving average
    ma = build_ma(training_configs, nmt_model.named_parameters())

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # Reload from latest checkpoint
    if flags.reload:
        checkpoint_saver.load_latest(model=nmt_model,
                                     optim=optim,
                                     lr_scheduler=scheduler,
                                     collections=model_collections,
                                     ma=ma,
                                     device=Constants.CURRENT_DEVICE)

    # broadcast parameters and optimizer states
    if world_size > 1:
        INFO("Broadcasting model parameters...")
        dist.broadcast_parameters(params=nmt_model.state_dict())
        INFO("Broadcasting optimizer states...")
        dist.broadcast_optimizer_state(optimizer=optim.optim)
        INFO('Done.')

    # ================================================================================== #
    # Prepare training
    # Counters come from the (possibly reloaded) collections; the second
    # argument is the default used when nothing was stored.
    eidx = model_collections.get_collection("eidx", [0])[-1]
    uidx = model_collections.get_collection("uidx", [1])[-1]
    bad_count = model_collections.get_collection("bad_count", [0])[-1]
    oom_count = model_collections.get_collection("oom_count", [0])[-1]
    is_early_stop = model_collections.get_collection("is_early_stop", [
        False,
    ])[-1]
    teacher_patience = model_collections.get_collection(
        "teacher_patience", [training_configs['teacher_patience']])[-1]

    train_loss_meter = AverageMeter()
    train_loss_dict_meter = AverageMeterDict(critic.get_critic_name())
    sent_per_sec_meter = TimeMeter()
    tok_per_sec_meter = TimeMeter()

    update_cycle = training_configs['update_cycle']
    grad_denom = 0
    train_loss = 0.0
    cum_n_words = 0
    train_loss_dict = dict()
    valid_loss = best_valid_loss = float('inf')
    # Bound up front: the BLEU-validation report reads it, and BLEU validation
    # can fire before the first loss validation has produced a value.
    valid_loss_dict = dict()

    if rank == 0:
        summary_writer = SummaryWriter(log_dir=flags.log_path)
    else:
        summary_writer = None

    sent_per_sec_meter.start()
    tok_per_sec_meter.start()

    INFO('Begin training...')

    while True:

        if summary_writer is not None:
            summary_writer.add_scalar("Epoch", (eidx + 1), uidx)

        # Build iterator and progress bar
        training_iter = training_iterator.build_generator()

        if rank == 0:
            training_progress_bar = tqdm(desc=' - (Epc {}, Upd {}) '.format(eidx, uidx),
                                         total=len(training_iterator),
                                         unit="sents")
        else:
            training_progress_bar = None

        for batch in training_iter:

            seqs_x, seqs_y = batch

            batch_size = len(seqs_x)
            cum_n_words += sum(len(s) for s in seqs_y)

            try:
                # Prepare data
                x, y = prepare_data(seqs_x, seqs_y, cuda=Constants.USE_GPU)

                loss, loss_dict = compute_forward(
                    model=nmt_model,
                    critic=critic,
                    seqs_x=x,
                    seqs_y=y,
                    eval=False,
                    normalization=1.0,
                    norm_by_words=training_configs["norm_by_words"])

                update_cycle -= 1
                grad_denom += batch_size
                train_loss += loss
                train_loss_dict = add_dict_value(train_loss_dict, loss_dict)

            except RuntimeError as e:
                if 'out of memory' in str(e):
                    print('| WARNING: ran out of memory, skipping batch')
                    oom_count += 1
                else:
                    raise e

            # When update_cycle becomes 0, it means end of one batch. Several things will be done:
            # - update parameters
            # - reset update_cycle and grad_denom, update uidx
            # - learning rate scheduling
            # - update moving average
            if update_cycle == 0:

                # 0. reduce variables
                if world_size > 1:
                    grad_denom = dist.all_reduce_py(grad_denom)
                    train_loss = dist.all_reduce_py(train_loss)
                    train_loss_dict = dist.all_reduce_py(train_loss_dict)
                    cum_n_words = dist.all_reduce_py(cum_n_words)

                # 1. update parameters
                optim.step(denom=grad_denom)
                optim.zero_grad()

                if training_progress_bar is not None:
                    training_progress_bar.update(grad_denom)
                    training_progress_bar.set_description(
                        ' - (Epc {}, Upd {}) '.format(eidx, uidx))

                    postfix_str = 'TrainLoss: {:.2f}, ValidLoss(best): {:.2f} ({:.2f}), '.format(
                        train_loss, valid_loss, best_valid_loss)
                    for critic_name, loss_value in train_loss_dict.items():
                        postfix_str += (critic_name + ': {:.2f}, ').format(loss_value)
                    training_progress_bar.set_postfix_str(postfix_str)

                # 2. learning rate scheduling
                if scheduler is not None and optimizer_configs["schedule_method"] != "loss":
                    scheduler.step(global_step=uidx)

                # 3. update moving average
                if ma is not None and eidx >= training_configs['moving_average_start_epoch']:
                    ma.step()

                # 4. update meters
                train_loss_meter.update(train_loss, grad_denom)
                train_loss_dict_meter.update(train_loss_dict, grad_denom)
                sent_per_sec_meter.update(grad_denom)
                tok_per_sec_meter.update(cum_n_words)

                # 5. reset accumulated variables, update uidx
                update_cycle = training_configs['update_cycle']
                grad_denom = 0
                uidx += 1
                cum_n_words = 0.0
                train_loss = 0.0
                train_loss_dict = dict()

            else:
                # Still accumulating gradients: nothing below applies yet.
                continue

            # ================================================================================== #
            # Display some information
            if should_trigger_by_steps(uidx, eidx, every_n_step=training_configs['disp_freq']):

                lrate = list(optim.get_lrate())[0]

                if summary_writer is not None:
                    summary_writer.add_scalar("Speed(sents/sec)",
                                              scalar_value=sent_per_sec_meter.ave,
                                              global_step=uidx)
                    summary_writer.add_scalar("Speed(words/sec)",
                                              scalar_value=tok_per_sec_meter.ave,
                                              global_step=uidx)
                    summary_writer.add_scalar("train_loss",
                                              scalar_value=train_loss_meter.ave,
                                              global_step=uidx)
                    # add loss for every critic
                    if flags.display_loss_detail:
                        combination_loss = train_loss_dict_meter.value
                        for key, value in combination_loss.items():
                            summary_writer.add_scalar(key,
                                                      scalar_value=value,
                                                      global_step=uidx)
                    summary_writer.add_scalar("lrate",
                                              scalar_value=lrate,
                                              global_step=uidx)
                    summary_writer.add_scalar("oom_count",
                                              scalar_value=oom_count,
                                              global_step=uidx)

                # Reset Meters
                sent_per_sec_meter.reset()
                tok_per_sec_meter.reset()
                train_loss_meter.reset()
                train_loss_dict_meter.reset()

            # ================================================================================== #
            # Loss Validation & Learning rate annealing
            if should_trigger_by_steps(global_step=uidx,
                                       n_epoch=eidx,
                                       every_n_step=training_configs['loss_valid_freq'],
                                       debug=flags.debug):
                with cache_parameters(nmt_model):
                    valid_loss, valid_loss_dict = loss_evaluation(
                        model=nmt_model,
                        critic=critic,
                        valid_iterator=valid_iterator,
                        rank=rank,
                        world_size=world_size)

                if scheduler is not None and optimizer_configs["schedule_method"] == "loss":
                    scheduler.step(metric=valid_loss)

                model_collections.add_to_collection("history_losses", valid_loss)

                min_history_loss = np.array(
                    model_collections.get_collection("history_losses")).min()
                best_valid_loss = min_history_loss

                if summary_writer is not None:
                    summary_writer.add_scalar("loss", valid_loss, global_step=uidx)
                    summary_writer.add_scalar("best_loss", min_history_loss, global_step=uidx)

            # ================================================================================== #
            # BLEU Validation & Early Stop
            if should_trigger_by_steps(global_step=uidx,
                                       n_epoch=eidx,
                                       every_n_step=training_configs['bleu_valid_freq'],
                                       min_step=training_configs['bleu_valid_warmup'],
                                       debug=flags.debug):

                with cache_parameters(nmt_model):
                    valid_bleu = bleu_evaluation(
                        uidx=uidx,
                        valid_iterator=valid_iterator,
                        batch_size=training_configs["bleu_valid_batch_size"],
                        model=nmt_model,
                        bleu_scorer=bleu_scorer,
                        vocab_src=vocab_src,
                        vocab_tgt=vocab_tgt,
                        valid_dir=flags.valid_path,
                        max_steps=training_configs["bleu_valid_configs"]["max_steps"],
                        beam_size=training_configs["bleu_valid_configs"]["beam_size"],
                        alpha=training_configs["bleu_valid_configs"]["alpha"],
                        world_size=world_size,
                        rank=rank,
                    )

                model_collections.add_to_collection(key="history_bleus", value=valid_bleu)

                best_valid_bleu = float(
                    np.array(model_collections.get_collection("history_bleus")).max())

                if summary_writer is not None:
                    summary_writer.add_scalar("bleu", valid_bleu, uidx)
                    summary_writer.add_scalar("best_bleu", best_valid_bleu, uidx)

                # If model get new best valid bleu score
                if valid_bleu >= best_valid_bleu:
                    bad_count = 0

                    if is_early_stop is False:
                        if rank == 0:
                            # 1. save the best model
                            torch.save(nmt_model.state_dict(), best_model_prefix + ".final")
                else:
                    bad_count += 1

                    # At least one epoch should be traversed
                    if bad_count >= training_configs['early_stop_patience'] and eidx > 0:
                        is_early_stop = True
                        WARN("Early Stop!")
                        # Same exception the `exit` helper raises, without
                        # depending on the `site` module having installed it.
                        raise SystemExit(0)

                if rank == 0:
                    best_k_saver.save(global_step=uidx,
                                      metric=valid_bleu,
                                      model=nmt_model,
                                      optim=optim,
                                      lr_scheduler=scheduler,
                                      collections=model_collections,
                                      ma=ma)

                # ODC
                if training_configs['use_odc'] is True:
                    if valid_bleu >= best_valid_bleu:
                        # New best score: refresh the stored teacher parameters.
                        # choose method to generate teachers from checkpoints
                        # - best
                        # - ave_k_best
                        # - ma
                        if training_configs['teacher_choice'] == 'ma':
                            teacher_params = ma.export_ma_params()
                        elif training_configs['teacher_choice'] == 'best':
                            teacher_params = nmt_model.state_dict()
                        elif "ave_best" in training_configs['teacher_choice']:
                            if best_k_saver.num_saved >= ave_best_k:
                                teacher_params = average_checkpoints(
                                    best_k_saver.get_all_ckpt_path()[-ave_best_k:])
                            else:
                                teacher_params = nmt_model.state_dict()
                        else:
                            raise ValueError(
                                "can not support teacher choice %s" %
                                training_configs['teacher_choice'])
                        torch.save(teacher_params, teacher_model_path)
                        del teacher_params
                        teacher_patience = 0
                        critic.set_use_KD(False)
                    else:
                        teacher_patience += 1
                        if teacher_patience >= training_configs['teacher_refresh_warmup']:
                            teacher_params = torch.load(
                                teacher_model_path,
                                map_location=Constants.CURRENT_DEVICE)
                            teacher_model.load_state_dict(teacher_params, strict=False)
                            del teacher_params
                            critic.reset_teacher(teacher_model)
                            critic.set_use_KD(True)

                if summary_writer is not None:
                    summary_writer.add_scalar("bad_count", bad_count, uidx)

                info_str = "{0} Loss: {1:.2f} BLEU: {2:.2f} lrate: {3:6f} patience: {4} ".format(
                    uidx, valid_loss, valid_bleu, lrate, bad_count)
                for key, value in valid_loss_dict.items():
                    info_str += (key + ': {0:.2f} '.format(value))
                INFO(info_str)

            # ================================================================================== #
            # Saving checkpoints
            if should_trigger_by_steps(uidx,
                                       eidx,
                                       every_n_step=training_configs['save_freq'],
                                       debug=flags.debug):
                model_collections.add_to_collection("uidx", uidx)
                model_collections.add_to_collection("eidx", eidx)
                model_collections.add_to_collection("bad_count", bad_count)
                model_collections.add_to_collection("teacher_patience", teacher_patience)

                if not is_early_stop:
                    if rank == 0:
                        checkpoint_saver.save(global_step=uidx,
                                              model=nmt_model,
                                              optim=optim,
                                              lr_scheduler=scheduler,
                                              collections=model_collections,
                                              ma=ma)

        if training_progress_bar is not None:
            training_progress_bar.close()

        eidx += 1
        if eidx > training_configs["max_epochs"]:
            break
def main():
    """Decode one data split with a CTC model and report WER statistics.

    Steps performed, in order:

    1. Parse options, set up logging and pick the device.
    2. Build the ``PhoenixVideo`` dataset for ``opts.task``, the model, the
       CTC criterion, the trainer and a ``ctcdecode`` beam-search decoder.
    3. Load ``opts.check_point`` if that path exists.
    4. Decode every batch, write hypotheses/references to
       ``Data/output/hypo_ctc.txt`` / ``Data/output/ref_ctc.txt`` and
       accumulate sentence accuracy and WER components.
    5. Dump the hypotheses in CTM-like form under ``evaluation_relaxation``
       and score them with ``get_phoenix_wer``.

    :return: whatever ``get_phoenix_wer`` returns; its first four entries are
        logged as WER, SUB, INS and DEL.
    """
    opts = parse_args()
    init_logging(
        os.path.join(opts.log_dir,
                     '{:s}_win0_win4_log_test.txt'.format(opts.task)))

    if torch.cuda.is_available():
        torch.cuda.set_device(opts.gpu)
        logging.info("Using GPU!")
        device = "cuda"
    else:
        logging.info("Using CPU!")
        device = "cpu"

    logging.info(opts)

    test_datasets = PhoenixVideo(opts.vocab_file,
                                 opts.corpus_dir,
                                 opts.video_path,
                                 phase=opts.task,
                                 DEBUG=opts.DEBUG)
    vocab_size = test_datasets.vocab.num_words
    blank_id = test_datasets.vocab.word2index['<BLANK>']
    vocabulary = Vocabulary(opts.vocab_file)

    # model = DilatedSLRNet(opts, device, vocab_size, vocabulary,
    #                       dilated_channels=512, num_blocks=5, dilations=[1, 2, 4], dropout=0.0)
    model = MainStream(vocab_size)
    criterion = CtcLoss(opts, blank_id, device, reduction="none")
    trainer = Trainer(opts, model, criterion, vocabulary, vocab_size,
                      blank_id)

    # ctcdecode wants one single-character label per class, so every class id
    # is mapped onto a distinct code point starting at 20000.
    ctc_decoder_vocab = [chr(x) for x in range(20000, 20000 + vocab_size)]
    ctc_decoder = ctcdecode.CTCBeamDecoder(ctc_decoder_vocab,
                                           beam_width=opts.beam_width,
                                           blank_id=blank_id,
                                           num_processes=10)

    if os.path.exists(opts.check_point):
        logging.info("Loading checkpoint file from {}".format(
            opts.check_point))
        epoch, num_updates, loss = trainer.load_checkpoint(opts.check_point)
    else:
        logging.info("No checkpoint file in found in {}".format(
            opts.check_point))
        epoch, num_updates, loss = 0, 0, 0.0

    test_iter = trainer.get_batch_iterator(test_datasets,
                                           batch_size=opts.batch_size,
                                           shuffle=False)
    decoded_dict = {}
    val_err, val_correct, val_count = np.zeros([4]), 0, 0

    with open("Data/output/hypo_ctc.txt", "w") as f, \
            open("Data/output/ref_ctc.txt", "w") as f2:
        with torch.no_grad():
            model.eval()
            criterion.eval()
            for samples in tqdm(test_iter):
                samples = trainer._prepare_sample(samples)
                video = samples["data"]
                len_video = samples["len_data"]
                label = samples["label"]
                len_label = samples["len_label"]
                video_id = samples['id']

                logits, _ = model(video, len_video)
                # The decoder is given lengths divided by 4 (presumably the
                # model's temporal down-sampling factor -- TODO confirm).
                # Floor division keeps the tensor integral; the former
                # in-place ``/=`` is a true division and fails on integer
                # tensors in current PyTorch.
                len_video = len_video // 4
                logits = F.softmax(logits, dim=-1)
                pred_seq, _, _, out_seq_len = ctc_decoder.decode(
                    logits, len_video)

                # ``label`` is the concatenation of all references in the
                # batch; ``len_label`` gives the length of each slice.
                start = 0
                for i, length in enumerate(len_label):
                    end = start + length
                    ref = label[start:end].tolist()
                    # Best beam, cut to its decoded length, with consecutive
                    # repeats collapsed.
                    hyp = [
                        x[0] for x in groupby(pred_seq[i][0]
                                              [:out_seq_len[i][0]].tolist())
                    ]
                    ref_sent = " ".join(
                        [vocabulary.index2word[r] for r in ref])
                    hyp_sent = " ".join(
                        [vocabulary.index2word[r] for r in hyp])
                    f.write(hyp_sent + "\n")
                    f2.write(ref_sent + "\n")

                    decoded_dict[video_id[i]] = hyp
                    val_correct += int(ref == hyp)
                    err = get_wer_delsubins(ref, hyp)
                    val_err += np.array(err)
                    val_count += 1
                    start = end
                # Every label of the batch must have been consumed.
                assert end == label.size(0)

    logging.info('-' * 50)
    logging.info('Epoch: {:d}, DEV ACC: {:.5f}, {:d}/{:d}'.format(
        epoch, val_correct / val_count, val_correct, val_count))
    logging.info(
        'Epoch: {:d}, DEV WER: {:.5f}, SUB: {:.5f}, INS: {:.5f}, DEL: {:.5f}'
        .format(epoch, val_err[0] / val_count, val_err[1] / val_count,
                val_err[2] / val_count, val_err[3] / val_count))

    # Build the lines handed to the external scorer. The word durations are
    # random placeholders (each below 0.1), not real alignments.
    list_str_for_test = []
    for k, v in decoded_dict.items():
        start_time = 0
        for wi in v:
            tl = np.random.random() * 0.1
            list_str_for_test.append('{} 1 {:.3f} {:.3f} {}\n'.format(
                k, start_time, start_time + tl,
                test_datasets.vocab.index2word[wi]))
            start_time += tl

    # A UUID-based file name keeps concurrent evaluations from colliding.
    tmp_prefix = str(uuid.uuid1())
    txt_file = '{:s}.txt'.format(tmp_prefix)
    result_file = os.path.join('evaluation_relaxation', txt_file)
    with open(result_file, 'w') as fid:
        fid.writelines(list_str_for_test)
    phoenix_eval_err = get_phoenix_wer(txt_file, opts.task, tmp_prefix)
    logging.info(
        '[Relaxation Evaluation] Epoch: {:d}, DEV WER: {:.5f}, SUB: {:.5f}, INS: {:.5f}, DEL: {:.5f}'
        .format(epoch, phoenix_eval_err[0], phoenix_eval_err[1],
                phoenix_eval_err[2], phoenix_eval_err[3]))
    return phoenix_eval_err
early_exit=self.early_exit[0], layers=self.layers_del, **unused) decoder_out = F.linear(features, self.embed_word_del.weight) if normalize: return F.log_softmax(decoder_out, -1), extra['attn'] return decoder_out, extra['attn'] def Embedding(num_embeddings, embedding_dim, padding_idx): m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx) nn.init.normal_(m.weight, mean=0, std=embedding_dim**-0.5) nn.init.constant_(m.weight[padding_idx], 0) return m if __name__ == "__main__": from config import options from src.data.vocabulary import Vocabulary opts = options.parse_args() vocabulary = Vocabulary(opts.vocab_file) decoder = LevenshteinTransformerDecoder(opts, vocabulary) encoder_out = torch.randn(2, 10, 512) out = decoder() print(decoder)
def initial_random_perturb(config_path,
                           inputs,
                           w2p,
                           w2vocab,
                           mode="len_based",
                           key_type="token",
                           show_bleu=False):
    """
    batched random perturb, perturb is based on random probability from the
    collected candidates. Meant to test the initial attack rate.

    Each sentence is selected for perturbation with probability 0.5. Inside a
    selected sentence every word ``w`` is replaced with probability ``w2p[w]``
    by one of the candidates in ``w2vocab[w]``.

    Note: the global NumPy RNG is re-seeded from the wall clock on every call.

    :param config_path: victim configs (YAML); only
        ``data_configs.vocabularies[0]`` is read
    :param inputs: raw batched input (list) sequences in [batch_size, seq_len]
    :param w2p: indicates how likely a word is perturbed
    :param w2vocab: near candidates
    :param mode: how to distribute likelihood among candidates: "uniform", or
        "len_based" (weight ``1 / (1 + |len(word) - len(candidate)|)``)
    :param key_type: inputs are given by raw sequences of tokens ("token") or
        tokenized labels ("label")
    :param show_bleu: accepted for interface compatibility; not used by the
        current implementation
    :return: list of perturbed inputs and list of perturbed flags (1 when the
        sentence went through perturbation, 0 when it was returned untouched)
    """
    np.random.seed(int(time.time()))
    assert mode in ["uniform", "len_based"], "Mode must be uniform or len_based."
    assert key_type in ["token", "label"], "inputs key type must be token or label."

    # load configs
    # An explicit Loader is mandatory in PyYAML >= 6; FullLoader matches the
    # other config readers in this file.
    with open(config_path.strip()) as f:
        configs = yaml.load(f, Loader=yaml.FullLoader)
    data_configs = configs["data_configs"]

    # load vocabulary file and tokenize
    src_vocab = Vocabulary(**data_configs["vocabularies"][0])

    def _sample_candidate(word):
        # Draw one replacement for `word` according to `mode`.
        candidates = w2vocab[word]
        if mode == "uniform":
            picked = np.random.choice(len(candidates), 1)[0]
        else:
            # len_based: candidates closer in length to `word` are likelier.
            weights = [1. / (1 + abs(len(word) - len(c))) for c in candidates]
            total = sum(weights)
            norm_weights = [w / total for w in weights]
            picked = np.random.choice(len(candidates), 1, p=norm_weights)[0]
        return candidates[picked]

    perturbed_results = []
    flags = []
    for sent in inputs:
        if np.random.uniform() < 0.5:  # perturb the sentence
            perturbed_sent = []
            if key_type == "token":
                tokenized_sent = src_vocab.tokenizer.tokenize(sent)
                for word in tokenized_sent:
                    if np.random.uniform() < w2p[word]:
                        # need to perturb on lexical level
                        perturbed_sent += [_sample_candidate(word)]
                    else:
                        perturbed_sent += [word]
                # yield same form of sequences of tokens
                perturbed_sent = src_vocab.tokenizer.detokenize(perturbed_sent)
            elif key_type == "label":  # tokenized labels
                for word_index in sent:
                    word = src_vocab.id2token(word_index)
                    if np.random.uniform() < w2p[word]:
                        perturbed_label = src_vocab.token2id(
                            _sample_candidate(word))
                        perturbed_sent += [perturbed_label]
                    else:
                        perturbed_sent += [word_index]
            perturbed_results += [perturbed_sent]
            flags += [1]
        else:
            perturbed_results += [sent]
            flags += [0]
    return perturbed_results, flags
def ensemble_translate(FLAGS):
    """Translate ``FLAGS.source_path`` with an ensemble of NMT models.

    One model is built per entry of ``FLAGS.model_path``; all of them share
    the configuration in ``FLAGS.config_path``. The n-best lists found by
    ``ensemble_beam_search`` are written to ``<FLAGS.saveto>.<i>``, one file
    per kept beam rank, in the order of the source file.

    :param FLAGS: parsed command-line options. Read here: ``use_gpu``,
        ``config_path``, ``source_path``, ``batch_size``, ``model_path``,
        ``beam_size``, ``max_steps``, ``alpha``, ``keep_n`` and ``saveto``.
    """
    GlobalNames.USE_GPU = FLAGS.use_gpu

    config_path = os.path.abspath(FLAGS.config_path)
    # An explicit Loader is mandatory in PyYAML >= 6.
    with open(config_path.strip()) as f:
        configs = yaml.load(f, Loader=yaml.FullLoader)

    data_configs = configs['data_configs']
    model_configs = configs['model_configs']

    timer = Timer()
    # ================================================================================== #
    # Load Data
    INFO('Loading data...')
    timer.tic()

    # Generate target dictionary
    vocab_src = Vocabulary(**data_configs["vocabularies"][0])
    vocab_tgt = Vocabulary(**data_configs["vocabularies"][1])

    valid_dataset = TextLineDataset(data_path=FLAGS.source_path,
                                    vocabulary=vocab_src)
    # numbering=True makes every batch carry the original line numbers, which
    # are needed to undo the bucketing order afterwards.
    valid_iterator = DataIterator(dataset=valid_dataset,
                                  batch_size=FLAGS.batch_size,
                                  use_bucket=True,
                                  buffer_size=100000,
                                  numbering=True)
    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # ================================================================================== #
    # Build Model & Sampler & Validation
    INFO('Building model...')
    timer.tic()

    nmt_models = []
    model_path = FLAGS.model_path
    for path in model_path:
        nmt_model = build_model(n_src_vocab=vocab_src.max_n_words,
                                n_tgt_vocab=vocab_tgt.max_n_words,
                                **model_configs)
        nmt_model.eval()
        INFO('Done. Elapsed time {0}'.format(timer.toc()))

        INFO('Reloading model parameters...')
        timer.tic()
        params = load_model_parameters(path, map_location="cpu")
        nmt_model.load_state_dict(params)
        if GlobalNames.USE_GPU:
            nmt_model.cuda()
        nmt_models.append(nmt_model)
    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    INFO('Begin...')
    result_numbers = []
    result = []
    n_words = 0
    timer.tic()

    infer_progress_bar = tqdm(total=len(valid_iterator),
                              desc=' - (Infer) ',
                              unit="sents")
    valid_iter = valid_iterator.build_generator()
    for batch in valid_iter:
        numbers, seqs_x = batch
        batch_size_t = len(seqs_x)

        x = prepare_data(seqs_x=seqs_x, cuda=GlobalNames.USE_GPU)
        with torch.no_grad():
            word_ids = ensemble_beam_search(nmt_models=nmt_models,
                                            beam_size=FLAGS.beam_size,
                                            max_steps=FLAGS.max_steps,
                                            src_seqs=x,
                                            alpha=FLAGS.alpha)
        word_ids = word_ids.cpu().numpy().tolist()

        # Append result
        for sent_t in word_ids:
            sent_t = [[wid for wid in line if wid != PAD] for line in sent_t]
            result.append(sent_t)
            n_words += len(sent_t[0])

        # Remember the source line number of each sentence. Without this the
        # re-ordering below works on an empty index list and silently drops
        # every translation.
        result_numbers += numbers

        infer_progress_bar.update(batch_size_t)
    infer_progress_bar.close()

    INFO('Done. Speed: {0:.2f} words/sec'.format(
        n_words / (timer.toc(return_seconds=True))))

    # Convert ids to detokenized text, cutting each hypothesis at EOS.
    translation = []
    for sent in result:
        samples = []
        for trans in sent:
            sample = []
            for w in trans:
                if w == vocab_tgt.EOS:
                    break
                sample.append(vocab_tgt.id2token(w))
            samples.append(vocab_tgt.tokenizer.detokenize(sample))
        translation.append(samples)

    # resume the ordering
    origin_order = np.argsort(result_numbers).tolist()
    translation = [translation[ii] for ii in origin_order]

    keep_n = FLAGS.beam_size if FLAGS.keep_n <= 0 else min(
        FLAGS.beam_size, FLAGS.keep_n)
    outputs = ['%s.%d' % (FLAGS.saveto, i) for i in range(keep_n)]

    with batch_open(outputs, 'w') as handles:
        for trans in translation:
            for i in range(keep_n):
                if i < len(trans):
                    handles[i].write('%s\n' % trans[i])
                else:
                    # Fewer hypotheses than requested: pad with a placeholder
                    # so all output files keep the same number of lines.
                    handles[i].write('%s\n' % 'eos')
def tune(flags):
    """Run the training loop on the bitext in the config, without validation.

    The function sets up devices/logging, loads configs and vocabularies,
    builds the model, criterion, optimizer, scheduler and optional moving
    average, optionally reloads the latest checkpoint, and then trains with
    gradient accumulation over ``training_configs['update_cycle']`` batches.
    Model weights are written to ``<best_model_prefix>.final`` with
    ``torch.save``; periodic checkpointing is commented out.

    flags:
        saveto: str
        reload: store_true
        config_path: str
        pretrain_path: str, default=""
        model_name: str
        log_path: str

    Also read from ``flags`` in this function: ``use_gpu``, ``multi_gpu``,
    ``shared_dir``, ``predefined_config``, ``pretrain_exclude_prefix`` and
    ``froze_config``.

    NOTE(review): the nesting of the statements below was reconstructed from
    a view of the file that had lost its indentation -- confirm against the
    original layout, in particular where the final ``torch.save`` sits.
    """
    # ================================================================================== #
    # Initialization for training on different devices
    # - CPU/GPU
    # - Single/Distributed
    Constants.USE_GPU = flags.use_gpu

    if flags.multi_gpu:
        dist.distributed_init(flags.shared_dir)
        world_size = dist.get_world_size()
        rank = dist.get_rank()
        local_rank = dist.get_local_rank()
    else:
        world_size = 1
        rank = 0
        local_rank = 0

    if Constants.USE_GPU:
        torch.cuda.set_device(local_rank)
        Constants.CURRENT_DEVICE = "cuda:{0}".format(local_rank)
    else:
        Constants.CURRENT_DEVICE = "cpu"

    # If not root_rank, close logging
    # else write log of training to file.
    if rank == 0:
        write_log_to_file(
            os.path.join(flags.log_path,
                         "%s.log" % time.strftime("%Y%m%d-%H%M%S")))
    else:
        close_logging()

    # ================================================================================== #
    # Parsing configuration files
    # - Load default settings
    # - Load pre-defined settings
    # - Load user-defined settings
    configs = prepare_configs(flags.config_path, flags.predefined_config)

    data_configs = configs['data_configs']
    model_configs = configs['model_configs']
    optimizer_configs = configs['optimizer_configs']
    training_configs = configs['training_configs']

    INFO(pretty_configs(configs))

    Constants.SEED = training_configs['seed']
    set_seed(Constants.SEED)

    timer = Timer()

    # ================================================================================== #
    # Load Data
    INFO('Loading data...')
    timer.tic()

    # Generate target dictionary
    vocab_src = Vocabulary.build_from_file(**data_configs['vocabularies'][0])
    vocab_tgt = Vocabulary.build_from_file(**data_configs['vocabularies'][1])

    # The global special-token ids are taken from the SOURCE vocabulary.
    Constants.EOS = vocab_src.eos
    Constants.PAD = vocab_src.pad
    Constants.BOS = vocab_src.bos

    # bt tag dataset
    train_bitext_dataset = ZipDataset(
        TextLineDataset(data_path=data_configs['train_data'][0],
                        vocabulary=vocab_src,
                        max_len=data_configs['max_len'][0],
                        is_train_dataset=True),
        TextLineDataset(data_path=data_configs['train_data'][1],
                        vocabulary=vocab_tgt,
                        max_len=data_configs['max_len'][1],
                        is_train_dataset=True))

    training_iterator = DataIterator(
        dataset=train_bitext_dataset,
        batch_size=training_configs["batch_size"],
        use_bucket=training_configs['use_bucket'],
        buffer_size=training_configs['buffer_size'],
        batching_func=training_configs['batching_key'],
        world_size=world_size,
        rank=rank)

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # ================================ Begin ======================================== #
    # Build Model & Optimizer
    # We would do steps below on after another
    #     1. build models & criterion
    #     2. move models & criterion to gpu if needed
    #     3. load pre-trained model if needed
    #     4. build optimizer
    #     5. build learning rate scheduler if needed
    #     6. load checkpoints if needed

    # 0. Initial
    lrate = optimizer_configs['learning_rate']
    model_collections = Collections()

    checkpoint_saver = Saver(
        save_prefix="{0}.ckpt".format(
            os.path.join(flags.saveto, flags.model_name)),
        num_max_keeping=training_configs['num_kept_checkpoints'])
    best_model_prefix = os.path.join(
        flags.saveto, flags.model_name + Constants.MY_BEST_MODEL_SUFFIX)
    # NOTE(review): best_model_saver is created but never used below.
    best_model_saver = Saver(
        save_prefix=best_model_prefix,
        num_max_keeping=training_configs['num_kept_best_model'])

    # 1. Build Model & Criterion
    INFO('Building model...')
    timer.tic()
    nmt_model = build_model(n_src_vocab=vocab_src.max_n_words,
                            n_tgt_vocab=vocab_tgt.max_n_words,
                            padding_idx=vocab_src.pad,
                            vocab_src=vocab_src,
                            vocab_tgt=vocab_tgt,
                            **model_configs)
    INFO(nmt_model)

    critic = NMTCriterion(label_smoothing=model_configs['label_smoothing'],
                          padding_idx=vocab_tgt.pad)
    INFO(critic)

    # 2. Move to GPU
    if Constants.USE_GPU:
        nmt_model = nmt_model.cuda()
        critic = critic.cuda()

    # 3. Load pretrained model if needed
    load_pretrained_model(nmt_model,
                          flags.pretrain_path,
                          exclude_prefix=flags.pretrain_exclude_prefix,
                          device=Constants.CURRENT_DEVICE)

    # froze_parameters
    froze_params(nmt_model, flags.froze_config)

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # 4. Build optimizer
    INFO('Building Optimizer...')

    if not flags.multi_gpu:
        optim = Optimizer(name=optimizer_configs['optimizer'],
                          model=nmt_model,
                          lr=lrate,
                          grad_clip=optimizer_configs['grad_clip'],
                          optim_args=optimizer_configs['optimizer_params'],
                          update_cycle=training_configs['update_cycle'])
    else:
        optim = dist.DistributedOptimizer(
            name=optimizer_configs['optimizer'],
            model=nmt_model,
            lr=lrate,
            grad_clip=optimizer_configs['grad_clip'],
            optim_args=optimizer_configs['optimizer_params'],
            device_id=local_rank)

    # 5. Build scheduler for optimizer if needed
    scheduler = build_scheduler(
        schedule_method=optimizer_configs['schedule_method'],
        optimizer=optim,
        scheduler_configs=optimizer_configs['scheduler_configs'])

    # 6. build moving average
    if training_configs['moving_average_method'] is not None:
        ma = MovingAverage(
            moving_average_method=training_configs['moving_average_method'],
            named_params=nmt_model.named_parameters(),
            alpha=training_configs['moving_average_alpha'])
    else:
        ma = None

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # Reload from latest checkpoint
    if flags.reload:
        checkpoint_saver.load_latest(model=nmt_model,
                                     optim=optim,
                                     lr_scheduler=scheduler,
                                     collections=model_collections,
                                     ma=ma,
                                     device=Constants.CURRENT_DEVICE)

    # broadcast parameters and optimizer states
    if world_size > 1:
        INFO("Broadcasting model parameters...")
        dist.broadcast_parameters(params=nmt_model.state_dict())
        INFO("Broadcasting optimizer states...")
        dist.broadcast_optimizer_state(optimizer=optim.optim)
        INFO('Done.')

    # ================================================================================== #
    # Prepare training
    # Counters come from the collections object (filled by load_latest when
    # reloading); the list argument is the fallback, of which the last
    # element is taken.
    eidx = model_collections.get_collection("eidx", [0])[-1]
    uidx = model_collections.get_collection("uidx", [1])[-1]
    bad_count = model_collections.get_collection("bad_count", [0])[-1]
    oom_count = model_collections.get_collection("oom_count", [0])[-1]
    is_early_stop = model_collections.get_collection("is_early_stop", [
        False,
    ])[-1]

    train_loss_meter = AverageMeter()
    sent_per_sec_meter = TimeMeter()
    tok_per_sec_meter = TimeMeter()

    # update_cycle counts down the batches left in the current accumulation
    # window; grad_denom accumulates the number of sentences in that window.
    update_cycle = training_configs['update_cycle']
    grad_denom = 0
    train_loss = 0.0
    cum_n_words = 0

    # No validation happens in this function, so both stay at +inf; they are
    # only displayed in the progress bar.
    valid_loss = best_valid_loss = float('inf')

    if rank == 0:
        summary_writer = SummaryWriter(log_dir=flags.log_path)
    else:
        summary_writer = None

    sent_per_sec_meter.start()
    tok_per_sec_meter.start()

    INFO('Begin training...')

    while True:

        if summary_writer is not None:
            summary_writer.add_scalar("Epoch", (eidx + 1), uidx)

        # Build iterator and progress bar
        training_iter = training_iterator.build_generator()

        if rank == 0:
            training_progress_bar = tqdm(desc=' - (Epc {}, Upd {}) '.format(
                eidx, uidx),
                                         total=len(training_iterator),
                                         unit="sents")
        else:
            training_progress_bar = None

        # INFO(Constants.USE_BT)
        for batch in training_iter:
            # bt attrib data
            seqs_x, seqs_y = batch

            batch_size = len(seqs_x)
            cum_n_words += sum(len(s) for s in seqs_y)

            try:
                # Prepare data
                x, y = prepare_data(seqs_x, seqs_y, cuda=Constants.USE_GPU)

                loss = compute_forward(
                    model=nmt_model,
                    critic=critic,
                    seqs_x=x,
                    seqs_y=y,
                    eval=False,
                    normalization=1.0,
                    norm_by_words=training_configs["norm_by_words"])

                update_cycle -= 1
                grad_denom += batch_size
                train_loss += loss

            except RuntimeError as e:
                # An out-of-memory batch is skipped: the counters above are
                # not advanced for it, so it does not count towards the
                # accumulation window.
                if 'out of memory' in str(e):
                    print('| WARNING: ran out of memory, skipping batch')
                    oom_count += 1
                else:
                    raise e

            # When update_cycle becomes 0, it means end of one batch. Several things will be done:
            # - update parameters
            # - reset update_cycle and grad_denom, update uidx
            # - learning rate scheduling
            # - update moving average
            if update_cycle == 0:

                # 0. reduce variables
                if world_size > 1:
                    grad_denom = dist.all_reduce_py(grad_denom)
                    train_loss = dist.all_reduce_py(train_loss)
                    cum_n_words = dist.all_reduce_py(cum_n_words)

                # 1. update parameters
                optim.step(denom=grad_denom)
                optim.zero_grad()

                if training_progress_bar is not None:
                    training_progress_bar.update(grad_denom)
                    training_progress_bar.set_description(
                        ' - (Epc {}, Upd {}) '.format(eidx, uidx))

                    postfix_str = 'TrainLoss: {:.2f}, ValidLoss(best): {:.2f} ({:.2f}), '.format(
                        train_loss, valid_loss, best_valid_loss)
                    training_progress_bar.set_postfix_str(postfix_str)

                # 2. learning rate scheduling
                # Schedulers of method "loss" are not stepped here.
                if scheduler is not None and optimizer_configs[
                        "schedule_method"] != "loss":
                    scheduler.step(global_step=uidx)

                # 3. update moving average
                if ma is not None and eidx >= training_configs[
                        'moving_average_start_epoch']:
                    ma.step()

                # 4. update meters
                train_loss_meter.update(train_loss, grad_denom)
                sent_per_sec_meter.update(grad_denom)
                tok_per_sec_meter.update(cum_n_words)

                # 5. reset accumulated variables, update uidx
                update_cycle = training_configs['update_cycle']
                grad_denom = 0
                uidx += 1
                cum_n_words = 0.0
                train_loss = 0.0

            else:
                # Still inside an accumulation window: nothing to report yet.
                continue

            # ================================================================================== #
            # Display some information
            if should_trigger_by_steps(
                    uidx, eidx, every_n_step=training_configs['disp_freq']):

                lrate = list(optim.get_lrate())[0]

                if summary_writer is not None:
                    summary_writer.add_scalar(
                        "Speed(sents/sec)",
                        scalar_value=sent_per_sec_meter.ave,
                        global_step=uidx)
                    summary_writer.add_scalar(
                        "Speed(words/sec)",
                        scalar_value=tok_per_sec_meter.ave,
                        global_step=uidx)
                    summary_writer.add_scalar(
                        "train_loss",
                        scalar_value=train_loss_meter.ave,
                        global_step=uidx)
                    summary_writer.add_scalar("lrate",
                                              scalar_value=lrate,
                                              global_step=uidx)
                    summary_writer.add_scalar("oom_count",
                                              scalar_value=oom_count,
                                              global_step=uidx)

                # Reset Meters
                sent_per_sec_meter.reset()
                tok_per_sec_meter.reset()
                train_loss_meter.reset()

            # ================================================================================== #
            # Saving checkpoints (disabled)
            # if should_trigger_by_steps(uidx, eidx, every_n_step=training_configs['save_freq'], debug=flags.debug):
            #     model_collections.add_to_collection("uidx", uidx)
            #     model_collections.add_to_collection("eidx", eidx)
            #     model_collections.add_to_collection("bad_count", bad_count)
            #
            #     if not is_early_stop:
            #         if rank == 0:
            #             checkpoint_saver.save(global_step=uidx,
            #                                   model=nmt_model,
            #                                   optim=optim,
            #                                   lr_scheduler=scheduler,
            #                                   collections=model_collections,
            #                                   ma=ma)

        # NOTE(review): assumed to run once per epoch (after the batch loop);
        # it is not guarded by rank, so every process writes this file.
        torch.save(nmt_model.state_dict(), best_model_prefix + ".final")

        if training_progress_bar is not None:
            training_progress_bar.close()

        eidx += 1
        if eidx > training_configs["max_epochs"]:
            break
class PhoenixVideo(Dataset):
    """Dataset of frame folders plus gloss annotations for one corpus split.

    An item is a dict holding the sample id, the list of frame file paths,
    the annotation as vocabulary indices and the number of frames. Frames are
    only decoded in ``collate_fn_video``.
    """

    def __init__(self, vocab_file, corpus_dir, video_path, phase, DEBUG=False):
        """
        :param vocab_file: file passed to ``Vocabulary``
        :param corpus_dir: directory containing ``<phase>.corpus.csv``
        :param video_path: root directory with one sub-directory per split
        :param phase: 'train', 'dev', 'test'
        :param DEBUG: if ``True``, keep only the first 101 samples
        """
        self.vocab_file = vocab_file
        self.corpus_dir = corpus_dir
        self.video_path = video_path
        self.phase = phase

        self.image_type = 'png'
        self.max_video_len = 300
        self.sample = True
        self.input_shape = 112
        self.alignment = {}
        self.vocab = Vocabulary(self.vocab_file)

        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])
        # Training pipeline: random crop out of the resized frame.
        train_steps = [
            transforms.Resize((128, 128)),
            transforms.RandomCrop(self.input_shape),
            transforms.ToTensor(),
            normalize,
        ]
        # Evaluation pipeline: deterministic center crop.
        eval_steps = [
            transforms.Resize((128, 128)),
            transforms.CenterCrop(self.input_shape),
            transforms.ToTensor(),
            normalize,
        ]
        self.transform = transforms.Compose(train_steps)
        self.test_transform = transforms.Compose(eval_steps)

        self.phoenix_dataset = self.load_video_list()
        self.data_dict = self.phoenix_dataset[phase]
        if DEBUG == True:  # noqa: E712 -- exact comparison kept on purpose
            self.data_dict = self.data_dict[:101]
        n_samples = len(self.data_dict)
        logging.info('[DATASET: {:s}]: total {:d} samples.'.format(
            phase, n_samples))

    def __len__(self):
        """Number of samples in the selected split."""
        return len(self.data_dict)

    def __getitem__(self, idx):
        """Return id, frame paths, label indices and frame count of a sample."""
        info = self.data_dict[idx]
        vid_id = info['id']
        frame_files = self.get_images(info['path'])
        return {
            'id': vid_id,
            'data': frame_files,
            'label': info['label'],
            "data_len": len(frame_files),  # frame number
        }

    def load_video_list(self):
        """Read the corpus csv of ``self.phase`` into a ``{split: [info]}`` dict.

        Only the split equal to ``self.phase`` is loaded; ids listed as
        outliers are left out.
        """
        # '05July_2010_Monday_heute_default-8' used to be excluded as well
        excluded = ['13April_2011_Wednesday_tagesschau_default-14']
        splits = {}
        for task in ('train', 'dev', 'test'):
            if task == self.phase:
                csv_path = os.path.join(self.corpus_dir,
                                        '{:s}.corpus.csv'.format(task))
                table = pd.read_csv(csv_path, sep='|')
                folders = table['folder'].values
                glosses = table['annotation'].values
                sample_ids = table['id'].values
                splits[task] = [
                    {
                        'id': sid,
                        # the 'folder' column ends in a '*.png' glob pattern
                        'path': os.path.join(self.video_path, task,
                                             folder.replace('*.png', '')),
                        'label_text': gloss,
                        'label': self.sentence2index(gloss),
                    }
                    for sid, folder, gloss in zip(sample_ids, folders, glosses)
                    if sid not in excluded
                ]
        return splits

    def sentence2index(self, sent):
        """Map a space-separated sentence to indices; unknown words -> '<UNK>'."""
        indices = []
        for token in sent.split(' '):
            known = token in self.vocab.word2index
            key = token if known else '<UNK>'
            indices.append(self.vocab.word2index[key])
        return indices

    def load_video(self, video_name):
        """Load the 'pool5' C3D feature of a video as a tensor."""
        return torch.tensor(
            caffeFeatureLoader.loadVideoC3DFeature(video_name, 'pool5'))

    def get_images(self, video_name):
        """List the sorted frame files of a video folder.

        For training, videos longer than ``max_video_len`` are randomly
        re-sampled in time.
        """
        pattern = os.path.join(video_name, '*.{:s}'.format(self.image_type))
        frames_list = sorted(glob.glob(pattern))
        num_frame = len(frames_list)
        needs_resampling = (self.phase == 'train' and self.sample
                            and num_frame > self.max_video_len)
        if not needs_resampling:
            return frames_list

        # first, Randomly repeat 20%. Second, Randomly delete 20%
        ids = list(range(num_frame))
        repeated = random.sample(ids, int(0.2 * len(ids)))
        ids = sorted(ids + repeated)
        ids = sorted(random.sample(ids, int(0.8 * len(ids))))
        if len(ids) > self.max_video_len:
            # still too long: cap at the maximum length
            ids = sorted(random.sample(ids, self.max_video_len))
        return [frames_list[i] for i in ids]

    def load_video_from_images(self, frames_list):
        """Load every frame and stack them along a new leading dimension."""
        frames = []
        for frame_file in frames_list:
            frames.append(self.load_image(frame_file, self.phase))
        return torch.stack(frames, dim=0)

    def load_image(self, img_name, phase, reduce_mean=True):
        """Open an image and apply the transform matching ``phase``.

        ``reduce_mean`` is accepted but not used. For a phase other than
        'train', 'test' or 'dev' the opened image is returned as is.
        """
        image = Image.open(img_name)
        if phase == "train":
            return self.transform(image)
        if phase in ("test", "dev"):
            return self.test_transform(image)
        return image

    def collate_fn_video(self, batch, padding=6):
        """Collate samples into zero-padded batch tensors.

        ``padding`` is accepted but not used.

        :return: dict with the padded videos ('data'), all labels
            concatenated ('label'), labels wrapped in bos/eos and padded
            ('decoder_label'), the three length tensors and the sample ids.
        """
        # batch.sort(key=lambda x: x['data'].shape[0], reverse=True)
        len_video = [x["data_len"] for x in batch]
        len_label = [len(x['label']) for x in batch]
        n_items = len(len_video)
        side = self.input_shape

        # padding with zeros
        batch_video = torch.zeros(n_items, max(len_video), 3, side, side)
        # [batch, max_len_label]; +2 leaves room for bos and eos
        batch_decoder_label = torch.zeros(n_items, max(len_label) + 2).long()

        flat_labels, sample_ids, len_decoder_label = [], [], []
        for row, item in enumerate(batch):
            frames = self.load_video_from_images(item['data'])
            label = item['label']
            eos_pos = 1 + len(label)
            len_decoder_label.append(len_label[row] + 2)
            flat_labels.extend(label)
            batch_decoder_label[row, 1:eos_pos] = torch.LongTensor(label)
            batch_decoder_label[row, 0] = self.vocab.bos()  # bos
            batch_decoder_label[row, eos_pos] = self.vocab.eos()  # eos
            batch_video[row, :len_video[row], :] = torch.FloatTensor(frames)
            sample_ids.append(item['id'])

        # batch_video = batch_video.permute(0, 2, 1)
        return {
            'data': batch_video,
            'label': torch.LongTensor(flat_labels),
            'decoder_label': torch.LongTensor(batch_decoder_label),
            'len_data': torch.LongTensor(len_video),
            'len_label': torch.LongTensor(len_label),
            'len_decoder_label': torch.LongTensor(len_decoder_label),
            'id': sample_ids,
        }
def ensemble_inference(valid_iterator,
                       models,
                       vocab_tgt: Vocabulary,
                       batch_size,
                       max_steps,
                       beam_size=5,
                       alpha=-1.0,
                       rank=0,
                       world_size=1,
                       using_numbering_iterator=True):
    """Translate every batch of ``valid_iterator`` with an ensemble of models.

    :param valid_iterator: iterator whose batches hold the sequence numbers
        at index 0 and the source sequences at index 1
    :param models: models passed together to ``ensemble_beam_search``; all
        are switched to eval mode
    :param vocab_tgt: target vocabulary used to turn ids into sentences
    :param batch_size: batch size passed to ``build_generator``
    :param max_steps: passed to the beam search
    :param beam_size: beam width; also the number of returned lists
    :param alpha: passed to the beam search
    :param rank: rank of this process; only rank 0 shows a progress bar
    :param world_size: number of processes; results are gathered across
        shards when it is larger than 1
    :param using_numbering_iterator: when true, the outputs are re-ordered by
        the collected sequence numbers
    :return: ``beam_size`` lists; list ``k`` holds the ``k``-th hypothesis of
        every sentence
    """
    for nmt_model in models:
        nmt_model.eval()

    trans_in_all_beams = [[] for _ in range(beam_size)]
    # assert keep_n_beams <= beam_size

    if using_numbering_iterator:
        numbers = []

    infer_progress_bar = None
    if rank == 0:
        infer_progress_bar = tqdm(total=len(valid_iterator),
                                  desc=' - (Infer) ',
                                  unit="sents")

    for batch in valid_iterator.build_generator(batch_size=batch_size):
        seq_numbers = batch[0]
        if using_numbering_iterator:
            numbers += seq_numbers
        seqs_x = batch[1]

        if infer_progress_bar is not None:
            # each process handles one shard, so advance by the whole step
            infer_progress_bar.update(len(seqs_x) * world_size)

        x = prepare_data(seqs_x, seqs_y=None, cuda=Constants.USE_GPU)

        with torch.no_grad():
            word_ids = ensemble_beam_search(nmt_models=models,
                                            beam_size=beam_size,
                                            max_steps=max_steps,
                                            src_seqs=x,
                                            alpha=alpha)

        # Append result
        for beams in word_ids.cpu().numpy().tolist():
            for beam_idx, token_ids in enumerate(beams):
                sentence = vocab_tgt.ids2sent(token_ids)
                if sentence == "":
                    # an empty hypothesis is replaced by the eos token text
                    sentence = '%s' % vocab_tgt.id2token(vocab_tgt.eos)
                trans_in_all_beams[beam_idx].append(sentence)

    if infer_progress_bar is not None:
        infer_progress_bar.close()

    if world_size > 1:
        if using_numbering_iterator:
            numbers = dist.all_gather_py_with_shared_fs(numbers)
        gathered = []
        for beam_trans in trans_in_all_beams:
            gathered.append(combine_from_all_shards(beam_trans))
        trans_in_all_beams = gathered

    if using_numbering_iterator:
        # restore the original sentence order
        origin_order = np.argsort(numbers).tolist()
        reordered = []
        for beam_trans in trans_in_all_beams:
            reordered.append([beam_trans[pos] for pos in origin_order])
        trans_in_all_beams = reordered

    return trans_in_all_beams
def translate(FLAGS):
    """Translate ``FLAGS.source_path`` with a single NMT model.

    Supports single-process and Horovod multi-process decoding. Rank 0
    collects the results of all shards, restores the source order and writes
    the n-best lists to ``<FLAGS.saveto>.<i>``, one file per kept beam rank.

    :param FLAGS: parsed command-line options. Read here: ``use_gpu``,
        ``multi_gpu``, ``config_path``, ``source_path``, ``batch_size``,
        ``model_path``, ``beam_size``, ``max_steps``, ``alpha``, ``keep_n``
        and ``saveto``.
    """
    GlobalNames.USE_GPU = FLAGS.use_gpu

    if FLAGS.multi_gpu:
        # NOTE(review): unless ERROR aborts, a missing Horovod installation
        # still reaches hvd.init() below -- confirm what ERROR does.
        if hvd is None or distributed is None:
            ERROR("Distributed training is disable. Please check the installation of Horovod.")

        hvd.init()
        world_size = hvd.size()
        rank = hvd.rank()

        if GlobalNames.USE_GPU:
            torch.cuda.set_device(hvd.local_rank())
    else:
        world_size = 1
        rank = 0

    if rank != 0:
        close_logging()

    config_path = os.path.abspath(FLAGS.config_path)
    # An explicit Loader is mandatory in PyYAML >= 6.
    with open(config_path.strip()) as f:
        configs = yaml.load(f, Loader=yaml.FullLoader)

    data_configs = configs['data_configs']
    model_configs = configs['model_configs']

    timer = Timer()
    # ================================================================================== #
    # Load Data
    INFO('Loading data...')
    timer.tic()

    # Generate target dictionary
    vocab_src = Vocabulary(**data_configs["vocabularies"][0])
    vocab_tgt = Vocabulary(**data_configs["vocabularies"][1])

    valid_dataset = TextLineDataset(data_path=FLAGS.source_path,
                                    vocabulary=vocab_src)
    # numbering=True makes every batch carry the original line numbers, which
    # are needed to undo bucketing and sharding afterwards.
    valid_iterator = DataIterator(dataset=valid_dataset,
                                  batch_size=FLAGS.batch_size,
                                  use_bucket=True,
                                  buffer_size=100000,
                                  numbering=True,
                                  world_size=world_size,
                                  rank=rank)
    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # ================================================================================== #
    # Build Model & Sampler & Validation
    INFO('Building model...')
    timer.tic()
    nmt_model = build_model(n_src_vocab=vocab_src.max_n_words,
                            n_tgt_vocab=vocab_tgt.max_n_words,
                            **model_configs)
    nmt_model.eval()
    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    INFO('Reloading model parameters...')
    timer.tic()
    params = load_model_parameters(FLAGS.model_path, map_location="cpu")
    nmt_model.load_state_dict(params, strict=False)
    if GlobalNames.USE_GPU:
        nmt_model.cuda()
    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    INFO('Begin...')
    result_numbers = []
    result = []
    n_words = 0
    timer.tic()

    if rank == 0:
        infer_progress_bar = tqdm(total=len(valid_iterator),
                                  desc=' - (Infer) ',
                                  unit="sents")
    else:
        infer_progress_bar = None

    valid_iter = valid_iterator.build_generator()
    for batch in valid_iter:
        numbers, seqs_x = batch
        batch_size_t = len(seqs_x)

        x = prepare_data(seqs_x=seqs_x, cuda=GlobalNames.USE_GPU)
        with torch.no_grad():
            word_ids = beam_search(nmt_model=nmt_model,
                                   beam_size=FLAGS.beam_size,
                                   max_steps=FLAGS.max_steps,
                                   src_seqs=x,
                                   alpha=FLAGS.alpha)
        word_ids = word_ids.cpu().numpy().tolist()

        # Append result
        for sent_t in word_ids:
            sent_t = [[wid for wid in line if wid != PAD] for line in sent_t]
            result.append(sent_t)
            n_words += len(sent_t[0])

        result_numbers += numbers

        if infer_progress_bar is not None:
            # each process handles one shard, so advance by the whole step
            infer_progress_bar.update(batch_size_t * world_size)

    if infer_progress_bar is not None:
        infer_progress_bar.close()

    if FLAGS.multi_gpu:
        n_words = sum(distributed.all_gather(n_words))

    INFO('Done. Speed: {0:.2f} words/sec'.format(
        n_words / (timer.toc(return_seconds=True))))

    if FLAGS.multi_gpu:
        # Interleave the per-process lists; shards can differ in length, so
        # the None fill values of the shorter ones are dropped.
        result_gathered = distributed.all_gather_with_shared_fs(result)
        result = [
            line
            for lines in itertools.zip_longest(*result_gathered,
                                               fillvalue=None)
            for line in lines if line is not None
        ]

        result_numbers_gathered = distributed.all_gather_with_shared_fs(
            result_numbers)
        result_numbers = [
            num
            for numbers in itertools.zip_longest(*result_numbers_gathered,
                                                 fillvalue=None)
            for num in numbers if num is not None
        ]

    if rank == 0:
        # Convert ids to detokenized text, cutting each hypothesis at EOS.
        translation = []
        for sent in result:
            samples = []
            for trans in sent:
                sample = []
                for w in trans:
                    if w == vocab_tgt.EOS:
                        break
                    sample.append(vocab_tgt.id2token(w))
                samples.append(vocab_tgt.tokenizer.detokenize(sample))
            translation.append(samples)

        # resume the ordering
        origin_order = np.argsort(result_numbers).tolist()
        translation = [translation[ii] for ii in origin_order]

        keep_n = FLAGS.beam_size if FLAGS.keep_n <= 0 else min(
            FLAGS.beam_size, FLAGS.keep_n)
        outputs = ['%s.%d' % (FLAGS.saveto, i) for i in range(keep_n)]

        with batch_open(outputs, 'w') as handles:
            for trans in translation:
                for i in range(keep_n):
                    if i < len(trans):
                        handles[i].write('%s\n' % trans[i])
                    else:
                        # Fewer hypotheses than requested: pad with a
                        # placeholder so all files keep the same line count.
                        handles[i].write('%s\n' % 'eos')
def train(FLAGS):
    """Train an NMT model on CPU, one GPU, or several GPUs through Horovod.

    The function reads the YAML configuration, builds datasets, model,
    criterion, optimizer, optional learning-rate scheduler and optional
    moving average, optionally resumes from the latest checkpoint, and then
    iterates over epochs until ``training_configs["max_epochs"]`` is exceeded.
    Gradients are accumulated over ``training_configs["update_cycle"]``
    batches before every optimizer step.

    FLAGS:
        use_gpu: run the model and the criterion on GPU
        multi_gpu: initialise Horovod and train with several workers
        saveto: str, directory prefix for checkpoints and best models
        reload: store_true, resume from the latest checkpoint
        config_path: str, path of the YAML configuration file
        pretrain_path: str, default="", pre-trained parameters to load
        model_name: str, used to name checkpoints and best models
        log_path: str, directory for the log file and the summary writer
        valid_path: str, passed to BLEU validation as ``valid_dir``
        debug: forwarded to ``should_trigger_by_steps``
    """
    # ================================================================================== #
    # Initialization for training on different devices
    # - CPU/GPU
    # - Single/Distributed
    GlobalNames.USE_GPU = FLAGS.use_gpu

    if FLAGS.multi_gpu:
        if hvd is None or distributed is None:
            ERROR("Distributed training is disable. Please check the installation of Horovod.")

        hvd.init()
        world_size = hvd.size()
        rank = hvd.rank()
        local_rank = hvd.local_rank()
    else:
        world_size = 1
        rank = 0
        local_rank = 0

    if GlobalNames.USE_GPU:
        torch.cuda.set_device(local_rank)
        CURRENT_DEVICE = "cuda:{0}".format(local_rank)
    else:
        CURRENT_DEVICE = "cpu"

    if rank != 0:
        # If not root_rank, close logging
        close_logging()
    else:
        # write log of training to file.
        write_log_to_file(os.path.join(FLAGS.log_path, "%s.log" % time.strftime("%Y%m%d-%H%M%S")))

    # ================================================================================== #
    # Parsing configuration files
    config_path = os.path.abspath(FLAGS.config_path)

    with open(config_path.strip()) as f:
        # An explicit Loader is mandatory on recent PyYAML releases; this is
        # the same loader the other config readers in this file use.
        configs = yaml.load(f, Loader=yaml.FullLoader)

    INFO(pretty_configs(configs))

    # Add default configs
    configs = default_baseline_configs(configs)
    data_configs = configs['data_configs']
    model_configs = configs['model_configs']
    optimizer_configs = configs['optimizer_configs']
    training_configs = configs['training_configs']

    GlobalNames.SEED = training_configs['seed']

    set_seed(GlobalNames.SEED)

    timer = Timer()

    # ================================================================================== #
    # Load Data

    INFO('Loading data...')
    timer.tic()

    # Generate target dictionary
    vocab_src = Vocabulary(**data_configs["vocabularies"][0])
    vocab_tgt = Vocabulary(**data_configs["vocabularies"][1])

    actual_buffer_size = training_configs["buffer_size"] * max(1, training_configs["update_cycle"])

    train_bitext_dataset = ZipDataset(
        TextLineDataset(data_path=data_configs['train_data'][0],
                        vocabulary=vocab_src,
                        max_len=data_configs['max_len'][0],
                        ),
        TextLineDataset(data_path=data_configs['train_data'][1],
                        vocabulary=vocab_tgt,
                        max_len=data_configs['max_len'][1],
                        )
    )

    valid_bitext_dataset = ZipDataset(
        TextLineDataset(data_path=data_configs['valid_data'][0],
                        vocabulary=vocab_src,
                        ),
        TextLineDataset(data_path=data_configs['valid_data'][1],
                        vocabulary=vocab_tgt,
                        )
    )

    training_iterator = DataIterator(dataset=train_bitext_dataset,
                                     batch_size=training_configs["batch_size"],
                                     use_bucket=training_configs['use_bucket'],
                                     buffer_size=actual_buffer_size,
                                     batching_func=training_configs['batching_key'],
                                     world_size=world_size,
                                     rank=rank)

    valid_iterator = DataIterator(dataset=valid_bitext_dataset,
                                  batch_size=training_configs['valid_batch_size'],
                                  use_bucket=True,
                                  buffer_size=100000,
                                  numbering=True,
                                  world_size=world_size,
                                  rank=rank)

    bleu_scorer = SacreBLEUScorer(reference_path=data_configs["bleu_valid_reference"],
                                  num_refs=data_configs["num_refs"],
                                  lang_pair=data_configs["lang_pair"],
                                  sacrebleu_args=training_configs["bleu_valid_configs"]['sacrebleu_args'],
                                  postprocess=training_configs["bleu_valid_configs"]['postprocess']
                                  )

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    lrate = optimizer_configs['learning_rate']
    is_early_stop = False

    # ================================ Begin ======================================== #
    # Build Model & Optimizer
    # We would do steps below on after another
    #     1. build models & criterion
    #     2. move models & criterion to gpu if needed
    #     3. load pre-trained model if needed
    #     4. build optimizer
    #     5. build learning rate scheduler if needed
    #     6. load checkpoints if needed

    # 0. Initial
    model_collections = Collections()
    best_model_prefix = os.path.join(FLAGS.saveto, FLAGS.model_name + GlobalNames.MY_BEST_MODEL_SUFFIX)

    checkpoint_saver = Saver(save_prefix="{0}.ckpt".format(os.path.join(FLAGS.saveto, FLAGS.model_name)),
                             num_max_keeping=training_configs['num_kept_checkpoints']
                             )
    best_model_saver = Saver(save_prefix=best_model_prefix,
                             num_max_keeping=training_configs['num_kept_best_model'])

    # 1. Build Model & Criterion
    INFO('Building model...')
    timer.tic()
    nmt_model = build_model(n_src_vocab=vocab_src.max_n_words,
                            n_tgt_vocab=vocab_tgt.max_n_words, **model_configs)
    INFO(nmt_model)

    critic = NMTCriterion(label_smoothing=model_configs['label_smoothing'])

    INFO(critic)
    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # 2. Move to GPU
    if GlobalNames.USE_GPU:
        nmt_model = nmt_model.cuda()
        critic = critic.cuda()

    # 3. Load pretrained model if needed
    load_pretrained_model(nmt_model, FLAGS.pretrain_path, exclude_prefix=None, device=CURRENT_DEVICE)

    # 4. Build optimizer
    INFO('Building Optimizer...')
    optim = Optimizer(name=optimizer_configs['optimizer'],
                      model=nmt_model,
                      lr=lrate,
                      grad_clip=optimizer_configs['grad_clip'],
                      optim_args=optimizer_configs['optimizer_params'],
                      distributed=world_size > 1,
                      update_cycle=training_configs['update_cycle']
                      )

    # 5. Build scheduler for optimizer if needed
    schedule_method = optimizer_configs['schedule_method']
    if schedule_method is None:
        scheduler = None
    elif schedule_method == "loss":
        scheduler = ReduceOnPlateauScheduler(optimizer=optim,
                                             **optimizer_configs["scheduler_configs"]
                                             )
    elif schedule_method == "noam":
        scheduler = NoamScheduler(optimizer=optim, **optimizer_configs['scheduler_configs'])
    else:
        WARN("Unknown scheduler name {0}. Do not use lr_scheduling.".format(optimizer_configs['schedule_method']))
        scheduler = None

    # 6. build moving average
    if training_configs['moving_average_method'] is not None:
        ma = MovingAverage(moving_average_method=training_configs['moving_average_method'],
                           named_params=nmt_model.named_parameters(),
                           alpha=training_configs['moving_average_alpha'])
    else:
        ma = None

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # Reload from latest checkpoint
    if FLAGS.reload:
        checkpoint_saver.load_latest(model=nmt_model, optim=optim, lr_scheduler=scheduler,
                                     collections=model_collections, ma=ma)

    # broadcast parameters and optimizer states
    if world_size > 1:
        hvd.broadcast_parameters(params=nmt_model.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(optimizer=optim.optim, root_rank=0)

    # ================================================================================== #
    # Prepare training

    eidx = model_collections.get_collection("eidx", [0])[-1]
    uidx = model_collections.get_collection("uidx", [1])[-1]
    bad_count = model_collections.get_collection("bad_count", [0])[-1]
    oom_count = model_collections.get_collection("oom_count", [0])[-1]

    cum_n_samples = 0
    cum_n_words = 0
    best_valid_loss = 1.0 * 1e10  # Max Float
    # BLEU validation logs the latest validation loss; keep a printable
    # placeholder in case it fires before the first loss validation.
    valid_loss = float("nan")
    update_cycle = training_configs['update_cycle']
    grad_denom = 0

    if rank == 0:
        summary_writer = SummaryWriter(log_dir=FLAGS.log_path)
    else:
        summary_writer = None

    # Timer for computing speed
    timer_for_speed = Timer()
    timer_for_speed.tic()

    INFO('Begin training...')

    while True:

        if summary_writer is not None:
            summary_writer.add_scalar("Epoch", (eidx + 1), uidx)

        # Build iterator and progress bar
        training_iter = training_iterator.build_generator()

        if rank == 0:
            training_progress_bar = tqdm(desc=' - (Epoch %d) ' % eidx,
                                         total=len(training_iterator),
                                         unit="sents"
                                         )
        else:
            training_progress_bar = None

        for batch in training_iter:

            seqs_x, seqs_y = batch

            batch_size = len(seqs_x)

            cum_n_samples += batch_size
            cum_n_words += sum(len(s) for s in seqs_y)

            try:
                # Prepare data
                x, y = prepare_data(seqs_x, seqs_y, cuda=GlobalNames.USE_GPU)

                loss = compute_forward(model=nmt_model,
                                       critic=critic,
                                       seqs_x=x,
                                       seqs_y=y,
                                       eval=False,
                                       normalization=1.0,
                                       norm_by_words=training_configs["norm_by_words"])

                update_cycle -= 1
                grad_denom += batch_size

            except RuntimeError as e:
                if 'out of memory' in str(e):
                    print('| WARNING: ran out of memory, skipping batch')
                    oom_count += 1
                else:
                    raise e

            # When update_cycle becomes 0, it means end of one batch. Several things will be done:
            # - update parameters
            # - reset update_cycle and grad_denom
            # - update uidx
            # - update moving average
            if update_cycle != 0:
                continue

            if world_size > 1:
                grad_denom = distributed.all_reduce(grad_denom)

            optim.step(denom=grad_denom)
            optim.zero_grad()

            if training_progress_bar is not None:
                training_progress_bar.update(grad_denom)

            update_cycle = training_configs['update_cycle']
            grad_denom = 0

            uidx += 1

            if scheduler is None:
                pass
            elif optimizer_configs["schedule_method"] == "loss":
                scheduler.step(metric=best_valid_loss)
            else:
                scheduler.step(global_step=uidx)

            if ma is not None and eidx >= training_configs['moving_average_start_epoch']:
                ma.step()

            # ================================================================================== #
            # Display some information
            if should_trigger_by_steps(uidx, eidx, every_n_step=training_configs['disp_freq']):

                if world_size > 1:
                    cum_n_words = sum(distributed.all_gather(cum_n_words))
                    cum_n_samples = sum(distributed.all_gather(cum_n_samples))

                # words per second and sents per second, measured with the
                # dedicated speed timer so that model/optimizer building time
                # does not leak into the first reading.
                elapsed_seconds = timer_for_speed.toc(return_seconds=True)
                words_per_sec = cum_n_words / elapsed_seconds
                sents_per_sec = cum_n_samples / elapsed_seconds
                lrate = list(optim.get_lrate())[0]

                if summary_writer is not None:
                    summary_writer.add_scalar("Speed(words/sec)", scalar_value=words_per_sec, global_step=uidx)
                    summary_writer.add_scalar("Speed(sents/sen)", scalar_value=sents_per_sec, global_step=uidx)
                    summary_writer.add_scalar("lrate", scalar_value=lrate, global_step=uidx)
                    summary_writer.add_scalar("oom_count", scalar_value=oom_count, global_step=uidx)

                # Reset timer
                timer_for_speed.tic()
                cum_n_words = 0
                cum_n_samples = 0

            # ================================================================================== #
            # Loss Validation & Learning rate annealing
            if should_trigger_by_steps(global_step=uidx, n_epoch=eidx,
                                       every_n_step=training_configs['loss_valid_freq'],
                                       debug=FLAGS.debug):

                valid_loss = loss_validation(model=nmt_model,
                                             critic=critic,
                                             valid_iterator=valid_iterator,
                                             rank=rank,
                                             world_size=world_size
                                             )

                model_collections.add_to_collection("history_losses", valid_loss)

                min_history_loss = np.array(model_collections.get_collection("history_losses")).min()
                best_valid_loss = min_history_loss

                if summary_writer is not None:
                    summary_writer.add_scalar("loss", valid_loss, global_step=uidx)
                    summary_writer.add_scalar("best_loss", min_history_loss, global_step=uidx)

            # ================================================================================== #
            # BLEU Validation & Early Stop
            if should_trigger_by_steps(global_step=uidx, n_epoch=eidx,
                                       every_n_step=training_configs['bleu_valid_freq'],
                                       min_step=training_configs['bleu_valid_warmup'],
                                       debug=FLAGS.debug):

                valid_bleu = bleu_validation(uidx=uidx,
                                             valid_iterator=valid_iterator,
                                             batch_size=training_configs["bleu_valid_batch_size"],
                                             model=nmt_model,
                                             bleu_scorer=bleu_scorer,
                                             vocab_tgt=vocab_tgt,
                                             valid_dir=FLAGS.valid_path,
                                             max_steps=training_configs["bleu_valid_configs"]["max_steps"],
                                             beam_size=training_configs["bleu_valid_configs"]["beam_size"],
                                             alpha=training_configs["bleu_valid_configs"]["alpha"],
                                             world_size=world_size,
                                             rank=rank,
                                             )

                model_collections.add_to_collection(key="history_bleus", value=valid_bleu)

                best_valid_bleu = float(np.array(model_collections.get_collection("history_bleus")).max())

                if summary_writer is not None:
                    summary_writer.add_scalar("bleu", valid_bleu, uidx)
                    summary_writer.add_scalar("best_bleu", best_valid_bleu, uidx)

                # If model get new best valid bleu score
                if valid_bleu >= best_valid_bleu:
                    bad_count = 0

                    if is_early_stop is False and rank == 0:
                        # 1. save the best model
                        torch.save(nmt_model.state_dict(), best_model_prefix + ".final")

                        # 2. record all several best models
                        best_model_saver.save(global_step=uidx, model=nmt_model, ma=ma)
                else:
                    bad_count += 1

                    # At least one epoch should be traversed
                    if bad_count >= training_configs['early_stop_patience'] and eidx > 0:
                        is_early_stop = True
                        WARN("Early Stop!")

                if summary_writer is not None:
                    summary_writer.add_scalar("bad_count", bad_count, uidx)

                INFO("{0} Loss: {1:.2f} BLEU: {2:.2f} lrate: {3:6f} patience: {4}".format(
                    uidx, valid_loss, valid_bleu, lrate, bad_count
                ))

            # ================================================================================== #
            # Saving checkpoints
            if should_trigger_by_steps(uidx, eidx, every_n_step=training_configs['save_freq'], debug=FLAGS.debug):
                model_collections.add_to_collection("uidx", uidx)
                model_collections.add_to_collection("eidx", eidx)
                model_collections.add_to_collection("bad_count", bad_count)
                # oom_count is restored from the collections on start-up, so
                # it has to be recorded as well.
                model_collections.add_to_collection("oom_count", oom_count)

                if not is_early_stop and rank == 0:
                    checkpoint_saver.save(global_step=uidx,
                                          model=nmt_model,
                                          optim=optim,
                                          lr_scheduler=scheduler,
                                          collections=model_collections,
                                          ma=ma)

        if training_progress_bar is not None:
            training_progress_bar.close()

        eidx += 1
        if eidx > training_configs["max_epochs"]:
            break
def train(FLAGS):
    """Train an NMT model on a single device with an optional parameter EMA.

    The function reads the YAML configuration, builds datasets, model,
    criterion, optimizer, optional learning-rate scheduler and optional
    exponential moving average (EMA), optionally resumes from the latest
    checkpoint, and then iterates over epochs until
    ``training_configs["max_epochs"]`` is exceeded. Every batch drawn from the
    iterator is split into shards by ``split_shard`` and followed by one
    optimizer step. When an EMA is active, loss and BLEU validation run with
    the EMA parameters loaded into the model, and the training parameters are
    restored afterwards.

    FLAGS:
        use_gpu: run the model and the criterion on GPU
        saveto: str, directory prefix for checkpoints and best models
        reload: store_true, resume from the latest checkpoint
        config_path: str, path of the YAML configuration file
        pretrain_path: str, default="", pre-trained parameters to load
        model_name: str, used to name checkpoints and best models
        log_path: str, directory for the log file and the summary writer
        valid_path: str, passed to BLEU validation as ``valid_dir``
        debug: forwarded to ``should_trigger_by_steps``
    """
    # write log of training to file.
    write_log_to_file(os.path.join(FLAGS.log_path, "%s.log" % time.strftime("%Y%m%d-%H%M%S")))

    GlobalNames.USE_GPU = FLAGS.use_gpu

    # The device must follow the GPU switch (the two branches used to be
    # swapped, so pre-trained weights were mapped to the wrong device).
    if GlobalNames.USE_GPU:
        CURRENT_DEVICE = "cuda:0"
    else:
        CURRENT_DEVICE = "cpu"

    config_path = os.path.abspath(FLAGS.config_path)
    with open(config_path.strip()) as f:
        # An explicit Loader is mandatory on recent PyYAML releases; this is
        # the same loader the other config readers in this file use.
        configs = yaml.load(f, Loader=yaml.FullLoader)

    INFO(pretty_configs(configs))

    # Add default configs
    configs = default_configs(configs)
    data_configs = configs['data_configs']
    model_configs = configs['model_configs']
    optimizer_configs = configs['optimizer_configs']
    training_configs = configs['training_configs']

    GlobalNames.SEED = training_configs['seed']

    set_seed(GlobalNames.SEED)

    best_model_prefix = os.path.join(FLAGS.saveto, FLAGS.model_name + GlobalNames.MY_BEST_MODEL_SUFFIX)

    timer = Timer()

    # ================================================================================== #
    # Load Data

    INFO('Loading data...')
    timer.tic()

    # Generate target dictionary
    vocab_src = Vocabulary(**data_configs["vocabularies"][0])
    vocab_tgt = Vocabulary(**data_configs["vocabularies"][1])

    # One iterator batch holds update_cycle shards of the configured size.
    cycle_factor = max(1, training_configs["update_cycle"])
    train_batch_size = training_configs["batch_size"] * cycle_factor
    train_buffer_size = training_configs["buffer_size"] * cycle_factor

    train_bitext_dataset = ZipDataset(
        TextLineDataset(data_path=data_configs['train_data'][0],
                        vocabulary=vocab_src,
                        max_len=data_configs['max_len'][0],
                        ),
        TextLineDataset(data_path=data_configs['train_data'][1],
                        vocabulary=vocab_tgt,
                        max_len=data_configs['max_len'][1],
                        ),
        shuffle=training_configs['shuffle']
    )

    valid_bitext_dataset = ZipDataset(
        TextLineDataset(data_path=data_configs['valid_data'][0],
                        vocabulary=vocab_src,
                        ),
        TextLineDataset(data_path=data_configs['valid_data'][1],
                        vocabulary=vocab_tgt,
                        )
    )

    training_iterator = DataIterator(dataset=train_bitext_dataset,
                                     batch_size=train_batch_size,
                                     use_bucket=training_configs['use_bucket'],
                                     buffer_size=train_buffer_size,
                                     batching_func=training_configs['batching_key'])

    valid_iterator = DataIterator(dataset=valid_bitext_dataset,
                                  batch_size=training_configs['valid_batch_size'],
                                  use_bucket=True,
                                  buffer_size=100000,
                                  numbering=True)

    bleu_scorer = SacreBLEUScorer(reference_path=data_configs["bleu_valid_reference"],
                                  num_refs=data_configs["num_refs"],
                                  lang_pair=data_configs["lang_pair"],
                                  sacrebleu_args=training_configs["bleu_valid_configs"]['sacrebleu_args'],
                                  postprocess=training_configs["bleu_valid_configs"]['postprocess']
                                  )

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    lrate = optimizer_configs['learning_rate']
    is_early_stop = False

    # ================================ Begin ======================================== #
    # Build Model & Optimizer
    # We would do steps below on after another
    #     1. build models & criterion
    #     2. move models & criterion to gpu if needed
    #     3. load pre-trained model if needed
    #     4. build optimizer
    #     5. build learning rate scheduler if needed
    #     6. load checkpoints if needed

    # 0. Initial
    model_collections = Collections()
    checkpoint_saver = Saver(save_prefix="{0}.ckpt".format(os.path.join(FLAGS.saveto, FLAGS.model_name)),
                             num_max_keeping=training_configs['num_kept_checkpoints']
                             )
    best_model_saver = Saver(save_prefix=best_model_prefix,
                             num_max_keeping=training_configs['num_kept_best_model'])

    # 1. Build Model & Criterion
    INFO('Building model...')
    timer.tic()
    nmt_model = build_model(n_src_vocab=vocab_src.max_n_words,
                            n_tgt_vocab=vocab_tgt.max_n_words, **model_configs)
    INFO(nmt_model)

    critic = NMTCriterion(label_smoothing=model_configs['label_smoothing'])

    INFO(critic)
    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # 2. Move to GPU
    if GlobalNames.USE_GPU:
        nmt_model = nmt_model.cuda()
        critic = critic.cuda()

    # 3. Load pretrained model if needed
    load_pretrained_model(nmt_model, FLAGS.pretrain_path, exclude_prefix=None, device=CURRENT_DEVICE)

    # 4. Build optimizer
    INFO('Building Optimizer...')
    optim = Optimizer(name=optimizer_configs['optimizer'],
                      model=nmt_model,
                      lr=lrate,
                      grad_clip=optimizer_configs['grad_clip'],
                      optim_args=optimizer_configs['optimizer_params']
                      )

    # 5. Build scheduler for optimizer if needed
    schedule_method = optimizer_configs['schedule_method']
    if schedule_method is None:
        scheduler = None
    elif schedule_method == "loss":
        scheduler = ReduceOnPlateauScheduler(optimizer=optim,
                                             **optimizer_configs["scheduler_configs"]
                                             )
    elif schedule_method == "noam":
        scheduler = NoamScheduler(optimizer=optim, **optimizer_configs['scheduler_configs'])
    else:
        WARN("Unknown scheduler name {0}. Do not use lr_scheduling.".format(optimizer_configs['schedule_method']))
        scheduler = None

    # 6. build EMA
    if training_configs['ema_decay'] > 0.0:
        ema = ExponentialMovingAverage(named_params=nmt_model.named_parameters(),
                                       decay=training_configs['ema_decay'])
    else:
        ema = None

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # Reload from latest checkpoint
    # NOTE(review): checkpoints are written with ema=ema below, but the EMA
    # object is not handed to load_latest here - confirm whether its state
    # should be restored on reload.
    if FLAGS.reload:
        checkpoint_saver.load_latest(model=nmt_model, optim=optim, lr_scheduler=scheduler,
                                     collections=model_collections)

    # ================================================================================== #
    # Prepare training

    eidx = model_collections.get_collection("eidx", [0])[-1]
    uidx = model_collections.get_collection("uidx", [0])[-1]
    bad_count = model_collections.get_collection("bad_count", [0])[-1]

    summary_writer = SummaryWriter(log_dir=FLAGS.log_path)

    cum_samples = 0
    cum_words = 0
    best_valid_loss = 1.0 * 1e10  # Max Float
    # BLEU validation logs the latest validation loss; keep a printable
    # placeholder in case it fires before the first loss validation.
    valid_loss = float("nan")

    # Timer for computing speed
    timer_for_speed = Timer()
    timer_for_speed.tic()

    INFO('Begin training...')

    while True:

        summary_writer.add_scalar("Epoch", (eidx + 1), uidx)

        # Build iterator and progress bar
        training_iter = training_iterator.build_generator()
        training_progress_bar = tqdm(desc=' - (Epoch %d) ' % eidx,
                                     total=len(training_iterator),
                                     unit="sents"
                                     )

        for batch in training_iter:

            uidx += 1

            if scheduler is None:
                pass
            elif optimizer_configs["schedule_method"] == "loss":
                scheduler.step(metric=best_valid_loss)
            else:
                scheduler.step(global_step=uidx)

            seqs_x, seqs_y = batch

            n_samples_t = len(seqs_x)
            n_words_t = sum(len(s) for s in seqs_y)

            cum_samples += n_samples_t
            cum_words += n_words_t

            training_progress_bar.update(n_samples_t)

            optim.zero_grad()

            # Prepare data
            for seqs_x_t, seqs_y_t in split_shard(seqs_x, seqs_y, split_size=training_configs['update_cycle']):
                x, y = prepare_data(seqs_x_t, seqs_y_t, cuda=GlobalNames.USE_GPU)

                loss = compute_forward(model=nmt_model,
                                       critic=critic,
                                       seqs_x=x,
                                       seqs_y=y,
                                       eval=False,
                                       normalization=n_samples_t,
                                       norm_by_words=training_configs["norm_by_words"])
            optim.step()

            if ema is not None:
                ema.step()

            # ================================================================================== #
            # Display some information
            if should_trigger_by_steps(uidx, eidx, every_n_step=training_configs['disp_freq']):
                # words per second and sents per second, measured with the
                # dedicated speed timer so that model/optimizer building time
                # does not leak into the first reading.
                elapsed_seconds = timer_for_speed.toc(return_seconds=True)
                words_per_sec = cum_words / elapsed_seconds
                sents_per_sec = cum_samples / elapsed_seconds
                lrate = list(optim.get_lrate())[0]

                summary_writer.add_scalar("Speed(words/sec)", scalar_value=words_per_sec, global_step=uidx)
                summary_writer.add_scalar("Speed(sents/sen)", scalar_value=sents_per_sec, global_step=uidx)
                summary_writer.add_scalar("lrate", scalar_value=lrate, global_step=uidx)

                # Reset timer
                timer_for_speed.tic()
                cum_words = 0
                cum_samples = 0

            # ================================================================================== #
            # Saving checkpoints
            if should_trigger_by_steps(uidx, eidx, every_n_step=training_configs['save_freq'], debug=FLAGS.debug):
                model_collections.add_to_collection("uidx", uidx)
                model_collections.add_to_collection("eidx", eidx)
                model_collections.add_to_collection("bad_count", bad_count)

                if not is_early_stop:
                    checkpoint_saver.save(global_step=uidx, model=nmt_model, optim=optim, lr_scheduler=scheduler,
                                          collections=model_collections, ema=ema)

            # ================================================================================== #
            # Loss Validation & Learning rate annealing
            if should_trigger_by_steps(global_step=uidx, n_epoch=eidx,
                                       every_n_step=training_configs['loss_valid_freq'],
                                       debug=FLAGS.debug):

                if ema is not None:
                    # Validate with the averaged parameters; keep a copy of
                    # the training parameters to restore them afterwards.
                    origin_state_dict = deepcopy(nmt_model.state_dict())
                    nmt_model.load_state_dict(ema.state_dict(), strict=False)

                valid_loss = loss_validation(model=nmt_model,
                                             critic=critic,
                                             valid_iterator=valid_iterator,
                                             )

                model_collections.add_to_collection("history_losses", valid_loss)

                min_history_loss = np.array(model_collections.get_collection("history_losses")).min()

                summary_writer.add_scalar("loss", valid_loss, global_step=uidx)
                summary_writer.add_scalar("best_loss", min_history_loss, global_step=uidx)

                best_valid_loss = min_history_loss

                if ema is not None:
                    nmt_model.load_state_dict(origin_state_dict)
                    del origin_state_dict

            # ================================================================================== #
            # BLEU Validation & Early Stop
            if should_trigger_by_steps(global_step=uidx, n_epoch=eidx,
                                       every_n_step=training_configs['bleu_valid_freq'],
                                       min_step=training_configs['bleu_valid_warmup'],
                                       debug=FLAGS.debug):

                if ema is not None:
                    origin_state_dict = deepcopy(nmt_model.state_dict())
                    nmt_model.load_state_dict(ema.state_dict(), strict=False)

                valid_bleu = bleu_validation(uidx=uidx,
                                             valid_iterator=valid_iterator,
                                             batch_size=training_configs["bleu_valid_batch_size"],
                                             model=nmt_model,
                                             bleu_scorer=bleu_scorer,
                                             vocab_tgt=vocab_tgt,
                                             valid_dir=FLAGS.valid_path,
                                             max_steps=training_configs["bleu_valid_configs"]["max_steps"],
                                             beam_size=training_configs["bleu_valid_configs"]["beam_size"],
                                             alpha=training_configs["bleu_valid_configs"]["alpha"]
                                             )

                model_collections.add_to_collection(key="history_bleus", value=valid_bleu)

                best_valid_bleu = float(np.array(model_collections.get_collection("history_bleus")).max())

                summary_writer.add_scalar("bleu", valid_bleu, uidx)
                summary_writer.add_scalar("best_bleu", best_valid_bleu, uidx)

                # If model get new best valid bleu score
                if valid_bleu >= best_valid_bleu:
                    bad_count = 0

                    if is_early_stop is False:
                        # 1. save the best model
                        torch.save(nmt_model.state_dict(), best_model_prefix + ".final")

                        # 2. record all several best models
                        best_model_saver.save(global_step=uidx, model=nmt_model)
                else:
                    bad_count += 1

                    # At least one epoch should be traversed
                    if bad_count >= training_configs['early_stop_patience'] and eidx > 0:
                        is_early_stop = True
                        WARN("Early Stop!")

                summary_writer.add_scalar("bad_count", bad_count, uidx)

                if ema is not None:
                    nmt_model.load_state_dict(origin_state_dict)
                    del origin_state_dict

                INFO("{0} Loss: {1:.2f} BLEU: {2:.2f} lrate: {3:6f} patience: {4}".format(
                    uidx, valid_loss, valid_bleu, lrate, bad_count
                ))

        training_progress_bar.close()

        eidx += 1
        if eidx > training_configs["max_epochs"]:
            break
def main():
    """Train the sign-language recognition model and evaluate it every epoch.

    Parses the command-line options, builds the train/dev datasets, the
    ``MainStream`` model, the CTC criterion and the ``Trainer``, resumes from
    ``opts.check_point`` when that path exists, and then alternates training
    and evaluation until ``opts.max_epoch`` or ``opts.max_updates`` is
    reached. A checkpoint named after the epoch and the first evaluation
    error is written after every epoch and registered with a ``ModelManager``.
    """
    opts = parse_args()
    log_file = os.path.join(opts.log_dir, '{:s}_log.txt'.format(opts.task))
    init_logging(log_file)

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        torch.cuda.set_device(opts.gpu)
        logging.info("Using GPU!")
    else:
        logging.info("Using CPU!")
    device = "cuda" if use_cuda else "cpu"

    logging.info(opts)

    train_datasets = PhoenixVideo(opts.vocab_file, opts.corpus_dir, opts.video_path,
                                  phase="train", DEBUG=opts.DEBUG)
    valid_datasets = PhoenixVideo(opts.vocab_file, opts.corpus_dir, opts.video_path,
                                  phase="dev", DEBUG=opts.DEBUG)

    dev_vocab = valid_datasets.vocab
    vocab_size = dev_vocab.num_words
    blank_id = dev_vocab.word2index['<BLANK>']
    vocabulary = Vocabulary(opts.vocab_file)

    # Alternative backbone that was tried previously:
    #   DilatedSLRNet(opts, device, vocab_size, vocabulary,
    #                 dilated_channels=512, num_blocks=5, dilations=[1, 2, 4], dropout=0.0)
    model = MainStream(vocab_size)
    criterion = CtcLoss(opts, blank_id, device, reduction="none")

    # Build trainer
    trainer = Trainer(opts, model, criterion, vocabulary, vocab_size, blank_id)

    if not os.path.exists(opts.check_point):
        logging.info("No checkpoint file in found in {}".format(opts.check_point))
        epoch, num_updates, loss = 0, 0, 0.0
    else:
        logging.info("Loading checkpoint file from {}".format(opts.check_point))
        epoch, num_updates, loss = trainer.load_checkpoint(opts.check_point)

    trainer.set_num_updates(num_updates)
    model_manager = ModelManager(max_num_models=5)

    # NOTE(review): num_updates is never refreshed from
    # trainer.get_num_updates() inside the loop, so train() and
    # save_checkpoint() always receive the value obtained at start-up -
    # confirm this is intended.
    while True:
        if epoch >= opts.max_epoch:
            break
        if trainer.get_num_updates() >= opts.max_updates:
            break

        epoch += 1
        trainer.adjust_learning_rate(epoch)
        # trainer.dynamic_freeze_layers(epoch) is currently disabled.
        loss = train(opts, train_datasets, valid_datasets, trainer, epoch, num_updates, loss)

        # The plain evaluation always runs first; only the result of the
        # stage-dependent second evaluation is kept.
        in_first_stages = epoch <= opts.stage_epoch * 2
        eval(opts, valid_datasets, trainer, epoch)
        if in_first_stages:
            phoenix_eval_err = eval_tf(opts, valid_datasets, trainer, epoch)
        else:
            phoenix_eval_err = eval_dec(opts, valid_datasets, trainer, epoch)

        ckpt_name = 'ep{:d}_{:.4f}.pkl'.format(epoch, phoenix_eval_err[0])
        save_ckpt = os.path.join(opts.log_dir, ckpt_name)
        trainer.save_checkpoint(save_ckpt, epoch, num_updates, loss)
        model_manager.update(save_ckpt, phoenix_eval_err, epoch)
def main():
    """Train the sign-language recognition model with a parameter EMA.

    Parses the command-line options, seeds the run, builds the train/dev
    datasets, the ``MainStream`` model, the CTC criterion, an ``EMA`` helper
    and the ``Trainer``. Training resumes from ``opts.check_point`` when that
    path exists; otherwise ``trainer.pretrain`` is invoked when
    ``opts.pretrain`` exists. The loop then alternates training and
    evaluation until ``opts.max_epoch`` or ``opts.max_updates`` is reached,
    writing one checkpoint per epoch and registering it with a
    ``ModelManager``.
    """
    opts = parse_args()
    setup_seed(opts.seed)
    log_file = os.path.join(opts.log_dir,
                            '{:s}_seed{}_log.txt'.format(opts.task, opts.seed))
    init_logging(log_file)

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        torch.cuda.set_device(opts.gpu)
        logging.info("Using GPU!")
    else:
        logging.info("Using CPU!")
    device = "cuda" if use_cuda else "cpu"

    logging.info(opts)

    train_datasets = PhoenixVideo(opts.vocab_file, opts.corpus_dir, opts.video_path,
                                  phase="train", DEBUG=opts.DEBUG)
    valid_datasets = PhoenixVideo(opts.vocab_file, opts.corpus_dir, opts.video_path,
                                  phase="dev", DEBUG=opts.DEBUG)

    dev_vocab = valid_datasets.vocab
    vocab_size = dev_vocab.num_words
    blank_id = dev_vocab.word2index['<BLANK>']
    vocabulary = Vocabulary(opts.vocab_file)

    model = MainStream(vocab_size, opts.bn_momentum)
    criterion = CtcLoss(opts, blank_id, device, reduction="none")

    # Initialise the exponential moving average of the model parameters.
    ema = EMA(model, decay=0.999)
    ema.register()

    # Build trainer
    trainer = Trainer(opts, model, criterion, vocabulary, vocab_size, blank_id)

    epoch, num_updates, loss = 0, 0, 0.0
    if os.path.exists(opts.check_point):
        logging.info("Loading checkpoint file from {}".format(opts.check_point))
        epoch, num_updates, loss = trainer.load_checkpoint(opts.check_point)
    elif os.path.exists(opts.pretrain):
        logging.info("Loading checkpoint file from {}".format(opts.pretrain))
        trainer.pretrain(opts)
    else:
        logging.info("No checkpoint file in found in {}".format(opts.check_point))

    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    logging.info('| num. module params: {} (num. trained: {})'.format(
        total_params, trainable_params))

    trainer.set_num_updates(num_updates)
    model_manager = ModelManager(max_num_models=25)

    # NOTE(review): num_updates is never refreshed from
    # trainer.get_num_updates() inside the loop, so train() and
    # save_checkpoint() always receive the value obtained at start-up -
    # confirm this is intended.
    while True:
        if epoch >= opts.max_epoch:
            break
        if trainer.get_num_updates() >= opts.max_updates:
            break

        epoch += 1
        trainer.adjust_learning_rate(epoch)
        loss = train(opts, train_datasets, valid_datasets, trainer, epoch,
                     num_updates, loss, ema)

        # During the first stage the training set is evaluated as well; the
        # dev-set evaluation below runs in every epoch.
        if epoch <= opts.stage_epoch:
            eval_train(opts, train_datasets, trainer, epoch)
        phoenix_eval_err = eval(opts, valid_datasets, trainer, epoch, ema)

        ckpt_name = 'ep{:d}_{:.4f}.pkl'.format(epoch, phoenix_eval_err[0])
        save_ckpt = os.path.join(opts.log_dir, ckpt_name)
        trainer.save_checkpoint(save_ckpt, epoch, num_updates, loss)
        model_manager.update(save_ckpt, phoenix_eval_err, epoch)
def main():
    """Dump sparsified frame-similarity matrices for the first batches.

    Builds the dataset, the ``MainStream`` model, the CTC criterion and the
    ``Trainer``, loads ``opts.check_point`` when that path exists, and runs
    the model without gradients over at most 51 batches. For every video the
    second score tensor returned by the model is reduced to its 16 largest
    entries per row (all others are set to 1e-9), soft-maxed over the last
    axis, thresholded at 0.02, and stored under the video id. The resulting
    dictionary is pickled to ``Data/output/sim_matrix.pkl``.
    """
    opts = parse_args()
    init_logging(os.path.join(opts.log_dir, '{:s}_log.txt'.format(opts.task)))

    if torch.cuda.is_available():
        torch.cuda.set_device(opts.gpu)
        logging.info("Using GPU!")
        device = "cuda"
    else:
        logging.info("Using CPU!")
        device = "cpu"

    logging.info(opts)

    # NOTE(review): the "test" dataset is built from the "train" phase -
    # presumably on purpose, to dump matrices for training videos; confirm.
    test_datasets = PhoenixVideo(opts.vocab_file, opts.corpus_dir, opts.video_path,
                                 phase="train", DEBUG=opts.DEBUG)
    vocab_size = test_datasets.vocab.num_words
    blank_id = test_datasets.vocab.word2index['<BLANK>']
    vocabulary = Vocabulary(opts.vocab_file)

    model = MainStream(vocab_size)
    criterion = CtcLoss(opts, blank_id, device, reduction="none")
    trainer = Trainer(opts, model, criterion, vocabulary, vocab_size, blank_id)

    # CTC beam decoder; it is constructed here but not used further below.
    ctc_decoder_vocab = [chr(x) for x in range(20000, 20000 + vocab_size)]
    ctc_decoder = ctcdecode.CTCBeamDecoder(ctc_decoder_vocab,
                                           beam_width=opts.beam_width,
                                           blank_id=blank_id,
                                           num_processes=10)

    if os.path.exists(opts.check_point):
        logging.info("Loading checkpoint file from {}".format(opts.check_point))
        trainer.load_checkpoint(opts.check_point)
    else:
        logging.info("No checkpoint file in found in {}".format(opts.check_point))

    test_iter = trainer.get_batch_iterator(test_datasets,
                                           batch_size=opts.batch_size,
                                           shuffle=False)

    video_sim = {}

    with torch.no_grad():
        model.eval()
        criterion.eval()
        for batch_idx, samples in tqdm(enumerate(test_iter)):
            if batch_idx > 50:
                break

            samples = trainer._prepare_sample(samples)
            video = samples["data"]
            len_video = samples["len_data"]
            video_id = samples['id']

            _, _, scores1, _ = model(video, len_video)
            print(scores1)

            # scores1 is [bs, t, t] according to the original annotation.
            # Keep the 16 largest scores of every row and overwrite the rest
            # with a tiny constant; k is clamped so short sequences do not
            # make topk fail. The mask is built with scatter_ instead of
            # per-element Python loops.
            top_k = min(16, scores1.size(-1))
            top_idx = scores1.topk(k=top_k, dim=-1)[1]
            keep = torch.zeros_like(scores1).scatter_(-1, top_idx, 1.0)
            scores1.masked_fill_(keep == 0, 1e-9)
            print("scores1: ", scores1)

            scores1 = scores1.softmax(-1)
            mask = scores1 > 0.02
            print(scores1, mask)
            # Zero out every probability that does not exceed the threshold.
            scores1 *= mask.float()

            for vid, sim in zip(video_id, scores1):
                video_sim[vid] = sim.cpu().numpy()

    with open("Data/output/sim_matrix.pkl", "wb") as f:
        pickle.dump(video_sim, f)
def _near_vocab_batch(emb, len_mat, start, end, top_reserve):
    """Return ``(topk_indices, e_dist)`` for the token ids in ``[start, end)``.

    ``topk_indices`` holds, per token, the ids of the ``top_reserve`` tokens
    with the highest cosine similarity; ``e_dist`` holds the Euclidean distance
    between the token's embedding and each of those candidates.
    """
    index = torch.arange(start, end)
    slice_emb = emb(index)
    # outer product of embedding norms -> denominator of the cosine similarity
    collect_len = torch.mm(len_mat[start:end].unsqueeze(1),
                           len_mat.unsqueeze(0))
    similarity = torch.mm(slice_emb, emb.weight.t()).div(collect_len)
    topk_indices = similarity.topk(top_reserve, dim=1)[1]
    expanded = slice_emb.unsqueeze(dim=1).repeat(1, top_reserve, 1)
    e_dist = ((emb(topk_indices) - expanded) ** 2).sum(dim=-1) ** 0.5
    return topk_indices, e_dist


def load_or_extract_near_vocab(config_path, model_path, save_to, save_to_full,
                               init_perturb_rate=0, batch_size=50,
                               top_reserve=12, all_with_UNK=False,
                               reload=True, emit_as_id=False):
    """Extract (or reload) the near-vocabulary of every source token.

    Candidates are chosen from the encoder embedding: the ``top_reserve``
    tokens with the highest cosine similarity are collected, and those whose
    Euclidean distance is below the average distance over the whole
    vocabulary are kept.

    :param config_path: (string) victim configs (for training data and vocabulary)
    :param model_path: (string) victim model path for trained embeddings
    :param save_to: (string) file to store the distilled near-vocab
    :param save_to_full: (string) file to store the full near-vocab
    :param init_perturb_rate: (float) the weight-adjustment for perturb;
        ``0`` leaves the probability unscaled
    :param batch_size: (integer) number of tokens processed per similarity batch
    :param top_reserve: (integer) at most reserve top-k near candidates
    :param all_with_UNK: during generation, add UNK to all tokens as a candidate
    :param reload: reload from ``save_to`` if a previous record exists
    :param emit_as_id: (boolean) keys/candidates in the result are token ids
        instead of tokens
    :return: whatever ``load_perturb_weight`` returns when reloading, otherwise
        ``(word2p, word2near_vocab)`` as ``OrderedDict`` objects
    """
    # load configs; an explicit Loader is required by PyYAML >= 6 and matches
    # the other config readers of this file.
    with open(config_path.strip()) as f:
        configs = yaml.load(f, Loader=yaml.FullLoader)

    data_configs = configs["data_configs"]
    model_configs = configs["model_configs"]

    # load vocabulary file
    src_vocab = Vocabulary(**data_configs["vocabularies"][0])

    # load embedding from model
    emb = nn.Embedding(num_embeddings=src_vocab.max_n_words,
                       embedding_dim=model_configs["d_word_vec"],
                       padding_idx=PAD)
    model_params = torch.load(model_path, map_location="cpu")
    emb.load_state_dict(
        {"weight": model_params["model"]["encoder.embeddings.embeddings.weight"]},
        strict=True)
    len_mat = torch.sum(emb.weight ** 2, dim=1) ** 0.5  # length of the embeddings

    if os.path.exists(save_to) and reload:
        print("load from %s:" % save_to)
        return load_perturb_weight(save_to, src_vocab, emit_as_id)

    print("collect near candidates for vocabulary")
    n_words = src_vocab.max_n_words
    avg_dist = 0
    counter = 0
    word2p = OrderedDict()
    word2near_vocab = OrderedDict()

    # No gradients are needed; this also avoids keeping one autograd graph per
    # batch alive while the average distance is accumulated.
    with open(save_to, "w") as similar_vocab, \
            open(save_to_full, "w") as full_similar_vocab, \
            torch.no_grad():
        # pass 1: average Euclidean distance over all batches
        for start in range(0, n_words, batch_size):
            end = min(n_words, start + batch_size)
            _, E_dist = _near_vocab_batch(emb, len_mat, start, end, top_reserve)
            avg_dist += E_dist.mean()
            counter += 1
        avg_dist = avg_dist.item() / counter

        # pass 2: output near candidates to file and fill the dictionaries
        for start in range(0, n_words, batch_size):
            end = min(n_words, start + batch_size)
            topk_indices, E_dist = _near_vocab_batch(
                emb, len_mat, start, end, top_reserve)
            topk_val = E_dist.cpu().detach().numpy()
            topk_indices = topk_indices.cpu().detach().numpy()

            for j in range(topk_val.shape[0]):
                bingo = 0.
                src_word_id = j + start
                src_word = src_vocab.id2token(src_word_id)
                near_vocab = []

                similar_vocab.write(src_word + "\t")
                full_similar_vocab.write(src_word + "\t")

                if src_word_id in [PAD, EOS, BOS, UNK]:
                    # reserved tokens have no candidates but themselves
                    near_cand_id = src_word_id
                    near_cand = src_vocab.id2token(near_cand_id)
                    full_similar_vocab.write(near_cand + "\t")
                    similar_vocab.write(near_cand + "\t")
                    bingo = 1
                    near_vocab += [near_cand_id if emit_as_id else near_cand]
                else:
                    # keep top-k candidates closer than the average E-dist;
                    # column 0 is skipped (the loop starts at 1)
                    for k in range(1, topk_val.shape[1]):
                        near_cand_id = topk_indices[j][k]
                        near_cand = src_vocab.id2token(near_cand_id)
                        full_similar_vocab.write(near_cand + "\t")
                        if topk_val[j][k] < avg_dist and \
                                (near_cand_id not in [PAD, EOS, BOS]):
                            bingo += 1
                            similar_vocab.write(near_cand + "\t")
                            near_vocab += [
                                near_cand_id if emit_as_id else near_cand]
                    # additionally add UNK as candidate
                    # NOTE(review): nesting reconstructed from a flattened
                    # source; UNK is only appended for non-reserved tokens.
                    if bingo == 0 or all_with_UNK:
                        last_cand = src_vocab.id2token(UNK)
                        similar_vocab.write(last_cand + "\t")
                        near_vocab += [UNK if emit_as_id else last_cand]

                probability = bingo / (len(src_word) * top_reserve)
                if init_perturb_rate != 0:
                    probability *= init_perturb_rate
                similar_vocab.write("\t" + str(probability) + "\n")
                full_similar_vocab.write("\t" + str(probability) + "\n")

                key = src_word_id if emit_as_id else src_word
                word2near_vocab[key] = near_vocab
                word2p[key] = probability

    return word2p, word2near_vocab
def main_2():
    """Decode the ``opts.task`` split with the iterative generator and score it.

    Builds ``DilatedSLRNet`` and its trainer, optionally restores a
    checkpoint, decodes every batch with
    ``IterativeGenerate.generate_ctcdecode``, logs sentence accuracy and the
    error statistics from ``get_wer_delsubins``, then writes the hypotheses as
    a timed text file under ``evaluation_relaxation`` and returns the result
    of ``get_phoenix_wer`` for that file.
    """
    opts = parse_args()
    log_file = os.path.join(opts.log_dir, '{:s}_log.txt'.format(opts.task))
    init_logging(log_file)

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        torch.cuda.set_device(opts.gpu)
        logging.info("Using GPU!")
    else:
        logging.info("Using CPU!")
    device = "cuda" if use_cuda else "cpu"

    logging.info(opts)

    dataset = PhoenixVideo(opts.vocab_file,
                           opts.corpus_dir,
                           opts.video_path,
                           phase=opts.task,
                           DEBUG=opts.DEBUG)
    vocab_size = dataset.vocab.num_words
    blank_id = dataset.vocab.word2index['<BLANK>']
    vocabulary = Vocabulary(opts.vocab_file)

    model = DilatedSLRNet(opts,
                          device,
                          vocab_size,
                          vocabulary,
                          dilated_channels=512,
                          num_blocks=5,
                          dilations=[1, 2, 4],
                          dropout=0.0)
    criterion = CtcLoss(opts, blank_id, device, reduction="none")
    trainer = Trainer(opts, model, criterion, vocabulary, vocab_size, blank_id)

    # iterative decoder
    generator = IterativeGenerate(vocabulary, model)

    if not os.path.exists(opts.check_point):
        logging.info("No checkpoint file in found in {}".format(
            opts.check_point))
        epoch, num_updates, loss = 0, 0, 0.0
    else:
        logging.info("Loading checkpoint file from {}".format(
            opts.check_point))
        epoch, num_updates, loss = trainer.load_checkpoint(opts.check_point)

    batches = trainer.get_batch_iterator(dataset,
                                         batch_size=opts.batch_size,
                                         shuffle=False)

    hypotheses = {}
    with torch.no_grad():
        model.eval()
        criterion.eval()
        val_err = np.zeros([4])
        val_correct = 0
        val_count = 0

        for samples in tqdm(batches):
            samples = trainer._prepare_sample(samples)
            video, len_video = samples["data"], samples["len_data"]
            label, len_label = samples["label"], samples["len_label"]
            video_id = samples['id']

            hypos = generator.generate_ctcdecode(video, len_video)

            # labels of the batch are concatenated; walk them by length
            start = 0
            for idx, length in enumerate(len_label):
                end = start + length
                ref = label[start:end].tolist()
                hyp = trainer.post_process_prediction(hypos[idx][0]["tokens"])

                hypotheses[video_id[idx]] = hyp
                val_correct += int(ref == hyp)
                val_err += np.array(get_wer_delsubins(ref, hyp))
                val_count += 1
                start = end
            assert end == label.size(0)

        logging.info('-' * 50)
        logging.info('Epoch: {:d}, DEV ACC: {:.5f}, {:d}/{:d}'.format(
            epoch, val_correct / val_count, val_correct, val_count))
        logging.info(
            'Epoch: {:d}, DEV WER: {:.5f}, SUB: {:.5f}, INS: {:.5f}, DEL: {:.5f}'
            .format(epoch, val_err[0] / val_count, val_err[1] / val_count,
                    val_err[2] / val_count, val_err[3] / val_count))

    # One line per decoded word; every word gets a random duration below 0.1
    # appended to the running start time of its video.
    lines = []
    index2word = dataset.vocab.index2word
    for vid, word_ids in hypotheses.items():
        clock = 0
        for word_id in word_ids:
            duration = np.random.random() * 0.1
            lines.append('{} 1 {:.3f} {:.3f} {}\n'.format(
                vid, clock, clock + duration, index2word[word_id]))
            clock += duration

    tmp_prefix = str(uuid.uuid1())
    txt_file = '{:s}.txt'.format(tmp_prefix)
    with open(os.path.join('evaluation_relaxation', txt_file), 'w') as fid:
        fid.writelines(lines)

    phoenix_eval_err = get_phoenix_wer(txt_file, opts.task, tmp_prefix)
    logging.info(
        '[Relaxation Evaluation] Epoch: {:d}, DEV WER: {:.5f}, SUB: {:.5f}, INS: {:.5f}, DEL: {:.5f}'
        .format(epoch, phoenix_eval_err[0], phoenix_eval_err[1],
                phoenix_eval_err[2], phoenix_eval_err[3]))
    return phoenix_eval_err
def train(flags):
    """Train a single-vocabulary next-token model with Adam.

    The model is fed ``x[:-1]`` and trained with cross entropy against
    ``x[1:]``.

    flags:
        use_gpu: bool, train on ``cuda:0`` when set
        saveto: str, directory for checkpoints / best models
        config_path: str
        predefined_config: passed through to ``prepare_configs``
        pretrain_path: str, default=""
        pretrain_exclude_prefix: passed through to ``load_pretrained_model``
        model_name: str
        log_path: str
        debug: passed through to ``should_trigger_by_steps``
    """
    import torch.nn.functional as F

    # ================================================================================== #
    # Initialization for training on different devices (single process only)
    Constants.USE_GPU = flags.use_gpu

    world_size = 1
    rank = 0
    local_rank = 0

    if Constants.USE_GPU:
        torch.cuda.set_device(local_rank)
        Constants.CURRENT_DEVICE = "cuda:{0}".format(local_rank)
    else:
        Constants.CURRENT_DEVICE = "cpu"

    # If not root_rank, close logging
    # else write log of training to file.
    if rank == 0:
        write_log_to_file(
            os.path.join(flags.log_path,
                         "%s.log" % time.strftime("%Y%m%d-%H%M%S")))
    else:
        close_logging()

    # ================================================================================== #
    # Parsing configuration files
    configs = prepare_configs(flags.config_path, flags.predefined_config)

    data_configs = configs['data_configs']
    model_configs = configs['model_configs']
    optimizer_configs = configs['optimizer_configs']
    training_configs = configs['training_configs']

    INFO(pretty_configs(configs))

    Constants.SEED = training_configs['seed']
    set_seed(Constants.SEED)

    timer = Timer()

    # ================================================================================== #
    # Load Data
    INFO('Loading data...')
    timer.tic()

    vocab_src = Vocabulary.build_from_file(**data_configs['vocabularies'][0])

    Constants.EOS = vocab_src.eos
    Constants.PAD = vocab_src.pad
    Constants.BOS = vocab_src.bos

    train_bitext_dataset = TextLineDataset(
        data_path=data_configs['train_data'][0],
        vocabulary=vocab_src,
        max_len=data_configs['max_len'][0],
        is_train_dataset=True)

    valid_bitext_dataset = TextLineDataset(
        data_path=data_configs['valid_data'][0],
        vocabulary=vocab_src,
        is_train_dataset=False)

    training_iterator = DataIterator(
        dataset=train_bitext_dataset,
        batch_size=training_configs["batch_size"],
        use_bucket=training_configs['use_bucket'],
        buffer_size=training_configs['buffer_size'],
        batching_func=training_configs['batching_key'],
        world_size=world_size,
        rank=rank)

    valid_iterator = DataIterator(
        dataset=valid_bitext_dataset,
        batch_size=training_configs['valid_batch_size'],
        use_bucket=True,
        buffer_size=100000,
        numbering=True,
        shuffle=False,
        world_size=world_size,
        rank=rank)

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # ================================ Begin ======================================== #
    # Build Model & Optimizer

    # 0. Initial
    model_collections = Collections()

    # NOTE(review): periodic checkpoint saving is disabled in this function;
    # the saver is still constructed as in the original.
    checkpoint_saver = Saver(
        save_prefix="{0}.ckpt".format(
            os.path.join(flags.saveto, flags.model_name)),
        num_max_keeping=training_configs['num_kept_checkpoints'])

    best_model_prefix = os.path.join(
        flags.saveto, flags.model_name + Constants.MY_BEST_MODEL_SUFFIX)

    best_model_saver = Saver(
        save_prefix=best_model_prefix,
        num_max_keeping=training_configs['num_kept_best_model'])

    # 1. Build Model & Criterion
    INFO('Building model...')
    timer.tic()
    nmt_model = build_model(vocab_size=vocab_src.max_n_words,
                            padding_idx=vocab_src.pad,
                            vocab_src=vocab_src,
                            **model_configs)
    INFO(nmt_model)

    # loss function
    critic = torch.nn.CrossEntropyLoss(ignore_index=Constants.PAD)
    INFO(critic)

    # 2. Move to GPU
    if Constants.USE_GPU:
        nmt_model = nmt_model.cuda()
        critic = critic.cuda()

    # 3. Load pretrained model if needed
    load_pretrained_model(nmt_model,
                          flags.pretrain_path,
                          exclude_prefix=flags.pretrain_exclude_prefix,
                          device=Constants.CURRENT_DEVICE)
    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # 4. Build optimizer
    INFO('Building Optimizer...')
    optimizer = torch.optim.Adam(nmt_model.parameters(),
                                 lr=optimizer_configs['learning_rate'])
    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # ================================================================================== #
    # Prepare training
    eidx = model_collections.get_collection("eidx", [0])[-1]
    uidx = model_collections.get_collection("uidx", [1])[-1]
    bad_count = model_collections.get_collection("bad_count", [0])[-1]
    oom_count = model_collections.get_collection("oom_count", [0])[-1]
    is_early_stop = model_collections.get_collection("is_early_stop", [
        False,
    ])[-1]

    train_loss_meter = AverageMeter()
    sent_per_sec_meter = TimeMeter()
    tok_per_sec_meter = TimeMeter()

    grad_denom = 0
    train_loss = 0.0
    cum_n_words = 0
    valid_loss = best_valid_loss = float('inf')

    if rank == 0:
        summary_writer = SummaryWriter(log_dir=flags.log_path)
    else:
        summary_writer = None

    sent_per_sec_meter.start()
    tok_per_sec_meter.start()

    INFO('Begin training...')

    while True:
        if summary_writer is not None:
            summary_writer.add_scalar("Epoch", (eidx + 1), uidx)

        # Build iterator and progress bar
        training_iter = training_iterator.build_generator()

        if rank == 0:
            training_progress_bar = tqdm(desc=' - (Epc {}, Upd {}) '.format(
                eidx, uidx),
                                         total=len(training_iterator),
                                         unit="sents")
        else:
            training_progress_bar = None

        for batch in training_iter:
            seqs_x = batch
            batch_size = len(seqs_x)
            cum_n_words = 0.0
            train_loss = 0.0

            try:
                # Prepare data
                grad_denom += batch_size
                x = prepare_data(seqs_x, seqs_y=None, cuda=Constants.USE_GPU)

                nmt_model.train()
                critic.train()
                # The criterion owns no parameters, so zeroing it left the
                # model gradients accumulating across steps; clear them
                # through the optimizer instead.
                optimizer.zero_grad()

                with torch.enable_grad():
                    logits = nmt_model(x[:-1])
                    logits = logits.view(-1, vocab_src.max_n_words)
                    trg = x[1:]
                    trg = trg.view(-1)
                    loss = critic(logits, trg)
                    loss.backward()
                optimizer.step()

                valid_token = (trg != Constants.PAD).long().sum().item()
                cum_n_words += valid_token
                train_loss += loss.item() * valid_token

            except RuntimeError as e:
                if 'out of memory' in str(e):
                    print('| WARNING: ran out of memory, skipping batch')
                    oom_count += 1
                else:
                    raise e

            if training_progress_bar is not None:
                training_progress_bar.update(grad_denom)
                training_progress_bar.set_description(
                    ' - (Epc {}, Upd {}) '.format(eidx, uidx))

                # cum_n_words stays 0 for a skipped (OOM) batch; avoid a
                # ZeroDivisionError that would defeat the skip.
                if cum_n_words:
                    avg_train_loss = train_loss / cum_n_words
                else:
                    avg_train_loss = float('nan')
                postfix_str = 'TrainLoss: {:.2f}, ValidLoss(best): {:.2f} ({:.2f}), '.format(
                    avg_train_loss, valid_loss, best_valid_loss)
                training_progress_bar.set_postfix_str(postfix_str)

            # update meters
            train_loss_meter.update(train_loss, cum_n_words)
            sent_per_sec_meter.update(grad_denom)
            tok_per_sec_meter.update(cum_n_words)

            # reset accumulated variables, update uidx
            grad_denom = 0
            uidx += 1
            cum_n_words = 0.0
            train_loss = 0.0

            # ================================================================================== #
            # Display some information
            if should_trigger_by_steps(
                    uidx, eidx, every_n_step=training_configs['disp_freq']):

                if summary_writer is not None:
                    summary_writer.add_scalar(
                        "Speed(sents/sec)",
                        scalar_value=sent_per_sec_meter.ave,
                        global_step=uidx)
                    summary_writer.add_scalar(
                        "Speed(words/sec)",
                        scalar_value=tok_per_sec_meter.ave,
                        global_step=uidx)
                    summary_writer.add_scalar(
                        "train_loss",
                        scalar_value=train_loss_meter.ave,
                        global_step=uidx)
                    summary_writer.add_scalar("oom_count",
                                              scalar_value=oom_count,
                                              global_step=uidx)

                # Reset Meters
                sent_per_sec_meter.reset()
                tok_per_sec_meter.reset()
                train_loss_meter.reset()

            # ================================================================================== #
            # Loss Validation & Learning rate annealing
            if should_trigger_by_steps(
                    global_step=uidx,
                    n_epoch=eidx,
                    every_n_step=training_configs['loss_valid_freq'],
                    min_step=training_configs['bleu_valid_warmup'],
                    debug=flags.debug):

                valid_iter = valid_iterator.build_generator()
                valid_loss = 0
                total_tokens = 0

                for batch in valid_iter:
                    seq_number, seqs_x = batch
                    x = prepare_data(seqs_x,
                                     seqs_y=None,
                                     cuda=Constants.USE_GPU)

                    nmt_model.eval()
                    critic.eval()

                    with torch.no_grad():
                        logits = nmt_model(x[:-1])
                        logits = logits.view(-1, vocab_src.max_n_words)
                        trg = x[1:]
                        valid_token = (trg != Constants.PAD).sum(-1)
                        batch_size, seq_len = trg.shape
                        trg = trg.view(-1)

                        # per-token loss; reduction='none' replaces the
                        # deprecated reduce=False
                        loss = F.cross_entropy(logits,
                                               trg,
                                               reduction='none',
                                               ignore_index=vocab_src.pad)
                        loss = loss.view(batch_size, seq_len)
                        loss = loss.sum(-1)

                        print(seq_number)
                        print(loss.double().div(valid_token.double()))
                        # NOTE(review): debugging exit kept from the original;
                        # the process terminates at the first validation
                        # batch, so the bookkeeping below only runs when the
                        # validation iterator is empty (and then divides by
                        # total_tokens == 0). The exact nesting of this call
                        # could not be recovered from the flattened source.
                        exit(0)

                valid_loss = valid_loss / total_tokens

                model_collections.add_to_collection("history_losses",
                                                    valid_loss)
                min_history_loss = np.array(
                    model_collections.get_collection("history_losses")).min()
                best_valid_loss = min_history_loss

                if summary_writer is not None:
                    summary_writer.add_scalar("loss",
                                              valid_loss,
                                              global_step=uidx)
                    summary_writer.add_scalar("best_loss",
                                              min_history_loss,
                                              global_step=uidx)

                # If model get new best valid loss
                if valid_loss <= best_valid_loss:
                    bad_count = 0

                    if is_early_stop is False:
                        if rank == 0:
                            # 1. save the best model
                            torch.save(nmt_model.state_dict(),
                                       best_model_prefix + ".final")

                            # 2. record all several best models
                            best_model_saver.save(
                                global_step=uidx,
                                model=nmt_model,
                                optimizer=optimizer,
                                collections=model_collections)
                else:
                    bad_count += 1

                    # At least one epoch should be traversed
                    if bad_count >= training_configs[
                            'early_stop_patience'] and eidx > 0:
                        is_early_stop = True
                        WARN("Early Stop!")
                        exit(0)

                if summary_writer is not None:
                    summary_writer.add_scalar("bad_count", bad_count, uidx)

                INFO("{0} Loss: {1:.2f} patience: {2}".format(
                    uidx, valid_loss, bad_count))

        if training_progress_bar is not None:
            training_progress_bar.close()

        eidx += 1
        if eidx > training_configs["max_epochs"]:
            break
def main():
    """Collect per-frame alignment probabilities for training videos.

    For at most the first 501 batches, the first sample of each batch is
    aligned against its decoder label with ``get_alignment``; the softmax
    probability of the aligned symbol is recorded for every position, with
    blank positions set to ``0``.

    :return: dict mapping ``video_id`` to ``(align_probs, alignment)``
    """
    opts = parse_args()
    init_logging(os.path.join(opts.log_dir, '{:s}_log.txt'.format(opts.task)))

    if torch.cuda.is_available():
        torch.cuda.set_device(opts.gpu)
        logging.info("Using GPU!")
        device = "cuda"
    else:
        logging.info("Using CPU!")
        device = "cpu"

    logging.info(opts)

    test_datasets = PhoenixVideo(opts.vocab_file,
                                 opts.corpus_dir,
                                 opts.video_path,
                                 phase="train",
                                 DEBUG=opts.DEBUG,
                                 sample=False)
    vocab_size = test_datasets.vocab.num_words
    blank_id = test_datasets.vocab.word2index['<BLANK>']
    pad_id = test_datasets.vocab.pad()
    vocabulary = Vocabulary(opts.vocab_file)
    model = MainStream(vocab_size)
    criterion = CtcLoss(opts, blank_id, device, reduction="none")
    trainer = Trainer(opts, model, criterion, vocabulary, vocab_size, blank_id)

    # The beam decoder is built as in the sibling entry points but is not
    # used here.
    ctc_decoder_vocab = [chr(x) for x in range(20000, 20000 + vocab_size)]
    ctc_decoder = ctcdecode.CTCBeamDecoder(ctc_decoder_vocab,
                                           beam_width=opts.beam_width,
                                           blank_id=blank_id,
                                           num_processes=10)

    if os.path.exists(opts.check_point):
        logging.info("Loading checkpoint file from {}".format(
            opts.check_point))
        epoch, num_updates, loss = trainer.load_checkpoint(opts.check_point)
    else:
        logging.info("No checkpoint file in found in {}".format(
            opts.check_point))
        epoch, num_updates, loss = 0, 0, 0.0

    test_iter = trainer.get_batch_iterator(test_datasets,
                                           batch_size=opts.batch_size,
                                           shuffle=False)

    with torch.no_grad():
        model.eval()
        criterion.eval()
        prob_results = {}
        for batch_idx, samples in enumerate(test_iter):
            # Only the first 501 batches are processed.
            if batch_idx > 500:
                break
            samples = trainer._prepare_sample(samples)
            video = samples["data"]
            len_video = samples["len_data"]
            video_id = samples['id']
            dec_label = samples["decoder_label"]
            len_dec_label = samples["len_decoder_label"]

            logits, _ = model(video, len_video)

            # Lengths are used as slice bounds below, so they must stay
            # integral: in-place true division (`/= 4`) raises on integer
            # tensors in current PyTorch. Floor division keeps the old result.
            # NOTE(review): presumably the model shortens the sequence by a
            # factor of 4 - confirm against MainStream.
            len_video = len_video // 4

            # Only the first sample of the batch is aligned.
            params = logits[0, :len_video[0], :].transpose(
                1, 0).detach().cpu().numpy()
            seq = dec_label[0, :len_dec_label[0]].cpu().numpy()
            alignment = get_alignment(params, seq, blank=blank_id,
                                      is_prob=False)  # [length]

            probs = logits.softmax(-1)[0]  # [length, vocab_size]

            # Probability of the aligned symbol per position; blank positions
            # contribute 0 and are counted.
            align_probs = []
            count = 0
            total_cnt = alignment.shape[0]
            for pos in range(total_cnt):
                if alignment[pos] == blank_id:
                    align_probs.append(0)
                    count += 1
                else:
                    align_probs.append(probs[pos, alignment[pos]].item())

            print(
                "video_id: {}, and blank count / total count: {}/{} = {:.4f}".
                format(video_id[0], count, total_cnt, count / total_cnt))
            prob_results[video_id[0]] = (align_probs, alignment)

    return prob_results
def train(FLAGS):
    """Train a language model on the target-side vocabulary.

    FLAGS:
        saveto: str
        reload: store_true, resume from the latest checkpoint
        config_path: str
        pretrain_path: str, default=""
        model_name: str
        log_path: str
        use_gpu: bool
        debug: passed through to ``should_trigger_by_steps``
    """
    # write log of training to file.
    write_log_to_file(
        os.path.join(FLAGS.log_path,
                     "%s.log" % time.strftime("%Y%m%d-%H%M%S")))

    GlobalNames.USE_GPU = FLAGS.use_gpu

    # The original had the two branches swapped, so pretrained parameters were
    # mapped to "cpu" on GPU runs and to "cuda:0" on CPU-only runs.
    if GlobalNames.USE_GPU:
        CURRENT_DEVICE = "cuda:0"
    else:
        CURRENT_DEVICE = "cpu"

    config_path = os.path.abspath(FLAGS.config_path)
    with open(config_path.strip()) as f:
        # An explicit Loader is required by PyYAML >= 6.
        configs = yaml.load(f, Loader=yaml.FullLoader)

    INFO(pretty_configs(configs))

    # Add default configs
    configs = default_configs(configs)
    data_configs = configs['data_configs']
    model_configs = configs['model_configs']
    optimizer_configs = configs['optimizer_configs']
    training_configs = configs['training_configs']

    GlobalNames.SEED = training_configs['seed']
    set_seed(GlobalNames.SEED)

    best_model_prefix = os.path.join(
        FLAGS.saveto, FLAGS.model_name + GlobalNames.MY_BEST_MODEL_SUFFIX)

    timer = Timer()

    # ================================================================================== #
    # Load Data
    INFO('Loading data...')
    timer.tic()

    # Generate target dictionary
    vocab_tgt = Vocabulary(**data_configs["vocabularies"][0])

    # One iterator batch holds `update_cycle` shards that are accumulated
    # before a single optimizer step.
    train_batch_size = training_configs["batch_size"] * max(
        1, training_configs["update_cycle"])
    train_buffer_size = training_configs["buffer_size"] * max(
        1, training_configs["update_cycle"])

    train_bitext_dataset = ZipDataset(TextLineDataset(
        data_path=data_configs['train_data'][0],
        vocabulary=vocab_tgt,
        max_len=data_configs['max_len'][0],
    ),
                                      shuffle=training_configs['shuffle'])

    valid_bitext_dataset = ZipDataset(
        TextLineDataset(
            data_path=data_configs['valid_data'][0],
            vocabulary=vocab_tgt,
        ))

    training_iterator = DataIterator(
        dataset=train_bitext_dataset,
        batch_size=train_batch_size,
        use_bucket=training_configs['use_bucket'],
        buffer_size=train_buffer_size,
        batching_func=training_configs['batching_key'])

    valid_iterator = DataIterator(
        dataset=valid_bitext_dataset,
        batch_size=training_configs['valid_batch_size'],
        use_bucket=True,
        buffer_size=100000,
        numbering=True)

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    lrate = optimizer_configs['learning_rate']
    is_early_stop = False

    # ================================ Begin ======================================== #
    # Build Model & Optimizer

    # 0. Initial
    model_collections = Collections()
    checkpoint_saver = Saver(
        save_prefix="{0}.ckpt".format(
            os.path.join(FLAGS.saveto, FLAGS.model_name)),
        num_max_keeping=training_configs['num_kept_checkpoints'])
    best_model_saver = Saver(
        save_prefix=best_model_prefix,
        num_max_keeping=training_configs['num_kept_best_model'])

    # 1. Build Model & Criterion
    INFO('Building model...')
    timer.tic()
    lm_model = build_model(n_tgt_vocab=vocab_tgt.max_n_words, **model_configs)
    INFO(lm_model)

    params_total = sum([p.numel() for n, p in lm_model.named_parameters()])
    params_without_embedding = sum([
        p.numel() for n, p in lm_model.named_parameters()
        if n.find('embedding') == -1
    ])
    INFO('Total parameters: {}'.format(params_total))
    INFO('Total parameters (excluding word embeddings): {}'.format(
        params_without_embedding))

    critic = NMTCriterion(label_smoothing=model_configs['label_smoothing'])

    INFO(critic)
    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # 2. Move to GPU
    if GlobalNames.USE_GPU:
        lm_model = lm_model.cuda()
        critic = critic.cuda()

    # 3. Load pretrained model if needed
    lm_model.init_parameters(FLAGS.pretrain_path, device=CURRENT_DEVICE)

    # 4. Build optimizer
    INFO('Building Optimizer...')
    optim = Optimizer(name=optimizer_configs['optimizer'],
                      model=lm_model,
                      lr=lrate,
                      grad_clip=optimizer_configs['grad_clip'],
                      optim_args=optimizer_configs['optimizer_params'])

    # 5. Build scheduler for optimizer if needed
    if optimizer_configs['schedule_method'] is not None:
        if optimizer_configs['schedule_method'] == "loss":
            scheduler = ReduceOnPlateauScheduler(
                optimizer=optim, **optimizer_configs["scheduler_configs"])
        elif optimizer_configs['schedule_method'] == "noam":
            scheduler = NoamScheduler(optimizer=optim,
                                      **optimizer_configs['scheduler_configs'])
        else:
            WARN(
                "Unknown scheduler name {0}. Do not use lr_scheduling.".format(
                    optimizer_configs['schedule_method']))
            scheduler = None
    else:
        scheduler = None

    # 6. build moving average
    if training_configs['moving_average_method'] is not None:
        ma = MovingAverage(
            moving_average_method=training_configs['moving_average_method'],
            named_params=lm_model.named_parameters(),
            alpha=training_configs['moving_average_alpha'])
    else:
        ma = None

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # Reload from latest checkpoint
    if FLAGS.reload:
        checkpoint_saver.load_latest(model=lm_model,
                                     optim=optim,
                                     lr_scheduler=scheduler,
                                     collections=model_collections,
                                     ma=ma)

    # ================================================================================== #
    # Prepare training
    eidx = model_collections.get_collection("eidx", [0])[-1]
    uidx = model_collections.get_collection("uidx", [0])[-1]
    bad_count = model_collections.get_collection("bad_count", [0])[-1]
    oom_count = model_collections.get_collection("oom_count", [0])[-1]

    summary_writer = SummaryWriter(log_dir=FLAGS.log_path)

    cum_samples = 0
    cum_words = 0
    valid_loss = best_valid_loss = float('inf')  # Max Float
    saving_files = []

    # Timer for computing speed
    timer_for_speed = Timer()
    timer_for_speed.tic()

    INFO('Begin training...')

    while True:
        summary_writer.add_scalar("Epoch", (eidx + 1), uidx)

        # Build iterator and progress bar
        training_iter = training_iterator.build_generator()
        training_progress_bar = tqdm(desc=' - (Epc {}, Upd {}) '.format(
            eidx, uidx),
                                     total=len(training_iterator),
                                     unit="sents")

        for batch in training_iter:
            uidx += 1

            # Step-based schedulers advance every update. `scheduler` is None
            # for an unknown schedule name (only a warning is issued above),
            # so guard on the object rather than on the config value.
            if scheduler is not None and optimizer_configs[
                    "schedule_method"] != "loss":
                scheduler.step(global_step=uidx)

            seqs_y = batch
            n_samples_t = len(seqs_y)
            n_words_t = sum(len(s) for s in seqs_y)

            cum_samples += n_samples_t
            cum_words += n_words_t

            train_loss = 0.
            optim.zero_grad()
            try:
                # Accumulate over the shards, then take one optimizer step.
                for (seqs_y_t, ) in split_shard(
                        seqs_y, split_size=training_configs['update_cycle']):
                    y = prepare_data(seqs_y_t, cuda=GlobalNames.USE_GPU)

                    loss = compute_forward(
                        model=lm_model,
                        critic=critic,
                        seqs_y=y,
                        eval=False,
                        normalization=n_samples_t,
                        norm_by_words=training_configs["norm_by_words"])
                    train_loss += loss / y.size(
                        1) if not training_configs["norm_by_words"] else loss
                optim.step()

            except RuntimeError as e:
                if 'out of memory' in str(e):
                    print('| WARNING: ran out of memory, skipping batch')
                    oom_count += 1
                    optim.zero_grad()
                else:
                    raise e

            if ma is not None and eidx >= training_configs[
                    'moving_average_start_epoch']:
                ma.step()

            training_progress_bar.update(n_samples_t)
            training_progress_bar.set_description(
                ' - (Epc {}, Upd {}) '.format(eidx, uidx))
            training_progress_bar.set_postfix_str(
                'TrainLoss: {:.2f}, ValidLoss(best): {:.2f} ({:.2f})'.format(
                    train_loss, valid_loss, best_valid_loss))
            summary_writer.add_scalar("train_loss",
                                      scalar_value=train_loss,
                                      global_step=uidx)

            # ================================================================================== #
            # Display some information
            if should_trigger_by_steps(
                    uidx, eidx, every_n_step=training_configs['disp_freq']):
                # Speed is measured with the dedicated timer; the general
                # purpose `timer` was last started before model building, which
                # inflated the first measurement interval.
                elapsed = timer_for_speed.toc(return_seconds=True)
                words_per_sec = cum_words / elapsed
                sents_per_sec = cum_samples / elapsed
                lrate = list(optim.get_lrate())[0]

                summary_writer.add_scalar("Speed(words/sec)",
                                          scalar_value=words_per_sec,
                                          global_step=uidx)
                summary_writer.add_scalar("Speed(sents/sen)",
                                          scalar_value=sents_per_sec,
                                          global_step=uidx)
                summary_writer.add_scalar("lrate",
                                          scalar_value=lrate,
                                          global_step=uidx)
                summary_writer.add_scalar("oom_count",
                                          scalar_value=oom_count,
                                          global_step=uidx)

                # Reset timer
                timer_for_speed.tic()
                cum_words = 0
                cum_samples = 0

            # ================================================================================== #
            # Saving checkpoints
            if should_trigger_by_steps(
                    uidx,
                    eidx,
                    every_n_step=training_configs['save_freq'],
                    debug=FLAGS.debug):
                model_collections.add_to_collection("uidx", uidx)
                model_collections.add_to_collection("eidx", eidx)
                model_collections.add_to_collection("bad_count", bad_count)

                if not is_early_stop:
                    checkpoint_saver.save(global_step=uidx,
                                          model=lm_model,
                                          optim=optim,
                                          lr_scheduler=scheduler,
                                          collections=model_collections,
                                          ma=ma)

            # ================================================================================== #
            # Loss Validation & Learning rate annealing
            if should_trigger_by_steps(
                    global_step=uidx,
                    n_epoch=eidx,
                    every_n_step=training_configs['loss_valid_freq'],
                    debug=FLAGS.debug):

                # Validate with the averaged parameters, then restore the
                # live ones.
                if ma is not None:
                    origin_state_dict = deepcopy(lm_model.state_dict())
                    lm_model.load_state_dict(ma.export_ma_params(),
                                             strict=False)

                valid_loss = loss_validation(
                    model=lm_model,
                    critic=critic,
                    valid_iterator=valid_iterator,
                    norm_by_words=training_configs["norm_by_words"])

                model_collections.add_to_collection("history_losses",
                                                    valid_loss)

                min_history_loss = np.array(
                    model_collections.get_collection("history_losses")).min()

                summary_writer.add_scalar("loss",
                                          valid_loss,
                                          global_step=uidx)
                summary_writer.add_scalar("best_loss",
                                          min_history_loss,
                                          global_step=uidx)

                if ma is not None:
                    lm_model.load_state_dict(origin_state_dict)
                    del origin_state_dict

                # NOTE(review): the metric is the best loss *before* this
                # validation (updated further below) - kept as in the original.
                if scheduler is not None and optimizer_configs[
                        "schedule_method"] == "loss":
                    scheduler.step(metric=best_valid_loss)

                # If model get new best valid loss
                if valid_loss < best_valid_loss:
                    bad_count = 0

                    if is_early_stop is False:
                        # 1. save the best model
                        torch.save(lm_model.state_dict(),
                                   best_model_prefix + ".final")

                        # 2. record all several best models
                        best_model_saver.save(global_step=uidx,
                                              model=lm_model)
                else:
                    bad_count += 1

                    # At least one epoch should be traversed
                    if bad_count >= training_configs[
                            'early_stop_patience'] and eidx > 0:
                        is_early_stop = True
                        WARN("Early Stop!")

                best_valid_loss = min_history_loss

                summary_writer.add_scalar("bad_count", bad_count, uidx)

                INFO("{0} Loss: {1:.2f} lrate: {2:6f} patience: {3}".format(
                    uidx, valid_loss, lrate, bad_count))

        training_progress_bar.close()

        eidx += 1
        if eidx > training_configs["max_epochs"]:
            break
def interactive_FBS(FLAGS):
    """Run interactive (constraint-driven) decoding with a forward and a backward model.

    For every batch of source sentences the forward model is first decoded
    with plain beam search. Then, for ``FLAGS.imt_step`` rounds, constraints
    are sampled from the current best hypotheses and the references via
    ``sample_constrains`` and the batch is re-decoded with
    ``fixwords_beam_search``. After each round the candidate with the highest
    ``bleuScore`` against the reference becomes the new "last sentence".

    Files written (all derived from ``FLAGS.saveto``):

    * ``<saveto>.<i>``      -- i-th hypothesis of the unconstrained beam search
    * ``<saveto>.cons<k>``  -- constraints sampled at interactive step ``k``
    * ``<saveto>.imt<i>``   -- i-th BLEU-ranked hypothesis after the last step

    :param FLAGS: namespace-like object. Attributes read here: ``try_times``,
        ``use_gpu``, ``config_path``, ``source_path``, ``ref_path``,
        ``batch_size``, ``fw_model_path``, ``bw_model_path``, ``imt_step``,
        ``beam_size``, ``max_steps``, ``alpha``, ``bidirection``,
        ``online_learning``, ``keep_n`` and ``saveto``.
    :return: None
    """
    patience = FLAGS.try_times

    GlobalNames.USE_GPU = FLAGS.use_gpu

    config_path = os.path.abspath(FLAGS.config_path)

    with open(config_path.strip()) as f:
        # An explicit Loader is mandatory on PyYAML >= 6; FullLoader matches
        # the other config readers in this file. Configs are assumed trusted.
        configs = yaml.load(f, Loader=yaml.FullLoader)

    data_configs = configs['data_configs']
    model_configs = configs['model_configs']

    timer = Timer()

    def _cut_at_eos(word_ids_seq):
        """Return the ids of ``word_ids_seq`` that precede the first EOS."""
        kept = []
        for w in word_ids_seq:
            if w == vocab_tgt.EOS:
                break
            kept.append(w)
        return kept

    def _rank_by_bleu(candidates, reference):
        """Return ``candidates`` ordered from highest to lowest BLEU."""
        scores = []
        for cand in candidates:
            bleu, _ = bleuScore(cand, reference)
            scores.append(bleu)
        order = np.argsort(scores).tolist()
        order = order[::-1]
        return [candidates[k] for k in order]

    def _write_nbest(paths, nbest_lists):
        """Write the k-th hypothesis of every sentence to ``paths[k]``."""
        n_files = len(paths)
        with batch_open(paths, 'w') as handles:
            for hyps in nbest_lists:
                for k in range(n_files):
                    if k < len(hyps):
                        handles[k].write('%s\n' % hyps[k])
                    else:
                        # Placeholder keeps all output files line-aligned.
                        handles[k].write('%s\n' % 'eos')

    # ================================================================================== #
    # Load data
    INFO('loading data...')
    timer.tic()

    vocab_src = Vocabulary(**data_configs["vocabularies"][0])
    vocab_tgt = Vocabulary(**data_configs["vocabularies"][1])

    valid_dataset = TextLineDataset(data_path=FLAGS.source_path,
                                    vocabulary=vocab_src)

    valid_iterator = DataIterator(dataset=valid_dataset,
                                  batch_size=FLAGS.batch_size,
                                  use_bucket=True,
                                  buffer_size=100000,
                                  numbering=True)

    with open(FLAGS.ref_path) as f:
        valid_ref = [vocab_tgt.sent2ids(sent) for sent in f]

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # ================================================================================== #
    # Build Model & Sampler & Validation
    INFO('Building model...')

    critic = NMTCriterion(label_smoothing=model_configs['label_smoothing'])

    INFO(critic)

    # 2. Move to GPU
    if GlobalNames.USE_GPU:
        critic = critic.cuda()

    timer.tic()
    fw_nmt_model = build_model(n_src_vocab=vocab_src.max_n_words,
                               n_tgt_vocab=vocab_tgt.max_n_words,
                               **model_configs)
    bw_nmt_model = build_model(n_src_vocab=vocab_src.max_n_words,
                               n_tgt_vocab=vocab_tgt.max_n_words,
                               **model_configs)

    fw_nmt_model.eval()
    bw_nmt_model.eval()

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    INFO('Reloading model parameters...')
    timer.tic()

    fw_params = load_model_parameters(FLAGS.fw_model_path, map_location="cpu")
    bw_params = load_model_parameters(FLAGS.bw_model_path, map_location="cpu")

    fw_nmt_model.load_state_dict(fw_params)
    bw_nmt_model.load_state_dict(bw_params)

    if GlobalNames.USE_GPU:
        fw_nmt_model.cuda()
        bw_nmt_model.cuda()

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    INFO('begin...')
    timer.tic()

    result_numbers = []
    result = []
    n_words = 0

    imt_numbers = []
    imt_result = []
    # One list of sampled constraints per interactive step.
    imt_constrains = [[] for _ in range(FLAGS.imt_step)]

    # The flag does not change during decoding, so resolve it once.
    bidirection = bool(FLAGS.bidirection)
    last_step = FLAGS.imt_step - 1

    infer_progress_bar = tqdm(total=len(valid_iterator),
                              desc=' - (Infer)',
                              unit='sents')

    valid_iter = valid_iterator.build_generator()
    for batch in valid_iter:
        batch_result = []
        batch_numbers = []

        numbers, seqs_x = batch
        batch_size_t = len(seqs_x)

        x = prepare_data(seqs_x=seqs_x, cuda=GlobalNames.USE_GPU)

        # Unconstrained first pass with the forward model.
        with torch.no_grad():
            word_ids = beam_search(nmt_model=fw_nmt_model,
                                   beam_size=FLAGS.beam_size,
                                   max_steps=FLAGS.max_steps,
                                   src_seqs=x,
                                   alpha=FLAGS.alpha)

        word_ids = word_ids.cpu().numpy().tolist()

        for sent_t in word_ids:
            sent_t = [[wid for wid in line if wid != PAD] for line in sent_t]
            result.append(sent_t)
            batch_result.append(sent_t[0])
            n_words += len(sent_t[0])

        result_numbers += numbers
        imt_numbers += numbers
        batch_numbers += numbers

        batch_ref = [valid_ref[ii] for ii in batch_numbers]
        last_sents = copy.deepcopy(batch_result)

        # constrains[ii][jj] / positions[ii][jj] accumulate, over the
        # interactive steps, what was sampled for sentence ii and try jj.
        constrains = [[[] for _ in range(patience)]
                      for _ in range(batch_size_t)]
        positions = [[[] for _ in range(patience)]
                     for _ in range(batch_size_t)]

        for idx in range(FLAGS.imt_step):
            cons, pos = sample_constrains(last_sents, batch_ref, patience)

            for ii in range(batch_size_t):
                for jj in range(patience):
                    constrains[ii][jj].append(cons[ii][jj])
                    positions[ii][jj].append(pos[ii][jj])

            # NOTE(review): one entry is appended per batch here, but the
            # entries are later indexed per sentence when the ``.cons`` files
            # are written; that only lines up when a batch holds a single
            # sentence -- confirm against the intended batch size.
            imt_constrains[idx].append([vocab_tgt.ids2sent(c) for c in cons])

            with torch.no_grad():
                # ``positions`` is replaced by the value returned from the search.
                constrained_word_ids, positions = fixwords_beam_search(
                    fw_nmt_model=fw_nmt_model,
                    bw_nmt_model=bw_nmt_model,
                    beam_size=FLAGS.beam_size,
                    max_steps=FLAGS.max_steps,
                    src_seqs=x,
                    alpha=FLAGS.alpha,
                    constrains=constrains,
                    positions=positions,
                    last_sentences=last_sents,
                    imt_step=idx + 1,
                    bidirection=bidirection)

            constrained_word_ids = constrained_word_ids.cpu().numpy().tolist()

            last_sents = []
            for i, sent_t in enumerate(constrained_word_ids):
                sent_t = [[wid for wid in line if wid != PAD]
                          for line in sent_t]

                if idx == last_step:
                    imt_result.append(copy.deepcopy(sent_t))

                samples = [_cut_at_eos(trans) for trans in sent_t]

                # Keep every beam_size-th hypothesis (indices 0, beam_size, ...).
                candidates = [samples[ii] for ii in range(len(samples))
                              if ii % FLAGS.beam_size == 0]

                ranked = _rank_by_bleu(candidates, batch_ref[i])
                last_sents.append(ranked[0])

            if FLAGS.online_learning and idx == last_step:
                seqs_y = [[BOS] + sent for sent in last_sents]

                # NOTE(review): building a tensor from ``seqs_y`` requires all
                # sentences in the batch to have the same length -- confirm.
                y_fw = torch.Tensor(seqs_y).long()
                # Only move targets to the GPU when the models live there too.
                if GlobalNames.USE_GPU:
                    y_fw = y_fw.cuda()
                compute_forward(fw_nmt_model, critic, x, y_fw)

                # Reverse the targets for the backward model and overwrite the
                # two end positions with BOS / EOS.
                seqs_y = [sent[::-1] for sent in seqs_y]
                for sent in seqs_y:
                    sent[0] = BOS
                    sent[-1] = EOS

                y_bw = torch.Tensor(seqs_y).long()
                if GlobalNames.USE_GPU:
                    y_bw = y_bw.cuda()
                compute_forward(bw_nmt_model, critic, x, y_bw)

        infer_progress_bar.update(batch_size_t)

    infer_progress_bar.close()

    INFO('Done. Speed: {0:.2f} words/sec'.format(
        n_words / (timer.toc(return_seconds=True))))

    # ================================================================================== #
    # Write the unconstrained beam-search translations
    translation = []
    for sent in result:
        samples = []
        for trans in sent:
            tokens = [vocab_tgt.id2token(w) for w in _cut_at_eos(trans)]
            samples.append(vocab_tgt.tokenizer.detokenize(tokens))
        translation.append(samples)

    # Restore the original corpus order.
    origin_order = np.argsort(result_numbers).tolist()
    translation = [translation[ii] for ii in origin_order]

    keep_n = FLAGS.beam_size if FLAGS.keep_n <= 0 else min(
        FLAGS.beam_size, FLAGS.keep_n)
    outputs = ['%s.%d' % (FLAGS.saveto, i) for i in range(keep_n)]
    _write_nbest(outputs, translation)

    # ================================================================================== #
    # Write the sampled constraints and the final interactive translations
    imt_translation = [[_cut_at_eos(trans) for trans in sent]
                       for sent in imt_result]

    origin_order = np.argsort(imt_numbers).tolist()
    imt_translation = [imt_translation[ii] for ii in origin_order]

    for idx in range(FLAGS.imt_step):
        imt_constrains[idx] = [
            ' '.join(imt_constrains[idx][ii]) + '\n' for ii in origin_order
        ]
        with open('%s.cons%d' % (FLAGS.saveto, idx), 'w') as f:
            f.writelines(imt_constrains[idx])

    bleu_translation = []
    for idx, sent in enumerate(imt_translation):
        candidates = [sent[ii] for ii in range(len(sent))
                      if ii % FLAGS.beam_size == 0]
        ranked = _rank_by_bleu(candidates, valid_ref[idx])
        bleu_translation.append([vocab_tgt.ids2sent(cand) for cand in ranked])

    keep_n = patience
    outputs = ['%s.imt%d' % (FLAGS.saveto, i) for i in range(keep_n)]
    _write_nbest(outputs, bleu_translation)