def train(params):
    assert params["mode"].lower() == "train", "change training mode to 'train'"

    print("Creating the vocab from :", params["vocab_path"])
    vocab = Vocab(params["vocab_path"], params["vocab_size"])

    print("Creating the embedding_matrix from:", params["vector_path"])
    embeddings_matrix = get_embedding(params["vocab_size"], params["embed_size"],
                                      vocab, params['vector_path'])

    tf.compat.v1.logging.info("Building the model ...")
    model = PGN(params, embeddings_matrix)

    print("Creating the batcher ...")
    b = batcher(params["data_dir"], vocab, params)

    print("Creating the checkpoint manager")
    checkpoint_dir = "{}".format(params["checkpoint_dir"])
    ckpt = tf.train.Checkpoint(step=tf.Variable(0), PGN=model)
    ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_dir, max_to_keep=11)

    ckpt.restore(ckpt_manager.latest_checkpoint)
    if ckpt_manager.latest_checkpoint:
        print("Restored from {}".format(ckpt_manager.latest_checkpoint))
    else:
        print("Initializing from scratch.")

    tf.compat.v1.logging.info("Starting the training ...")
    train_model(model, b, params, ckpt, ckpt_manager, "output.txt")
def train(params):
    assert params["mode"].lower() == "train", "change training mode to 'train'"

    tf.compat.v1.logging.info("Building the model ...")
    model = PGN(params)

    tf.compat.v1.logging.info("Creating the batcher ...")
    b = batcher(params["data_dir"], params["vocab_path"], params)

    tf.compat.v1.logging.info("Creating the checkpoint manager")
    logdir = "{}/logdir".format(params["model_dir"])
    checkpoint_dir = "{}/checkpoint".format(params["model_dir"])
    ckpt = tf.train.Checkpoint(step=tf.Variable(0), PGN=model)
    ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_dir, max_to_keep=11)

    ckpt.restore(ckpt_manager.latest_checkpoint)
    if ckpt_manager.latest_checkpoint:
        print("Restored from {}".format(ckpt_manager.latest_checkpoint))
    else:
        print("Initializing from scratch.")

    tf.compat.v1.logging.info("Starting the training ...")
    train_model(model, b, params, ckpt, ckpt_manager)
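# A minimal usage sketch for the train() entry point above. The exact set of keys the
# PGN model and batcher expect is project-specific; the paths and values below are
# illustrative assumptions, not the project's defaults.
if __name__ == "__main__":
    params = {
        "mode": "train",
        "data_dir": "data/train",          # assumed data location
        "vocab_path": "data/vocab",        # assumed vocab location
        "model_dir": "experiments/pgn",    # checkpoints go under <model_dir>/checkpoint
    }
    train(params)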
def test(params):
    assert params["mode"].lower() in ["test", "eval"], \
        "change training mode to 'test' or 'eval'"
    print(params["beam_size"], params["batch_size"])
    assert params["beam_size"] == params["batch_size"], \
        "Beam size must be equal to batch_size, change the params"

    print("Creating the vocab ...")
    vocab = Vocab(params["vocab_path"], params["vocab_size"])
    embeddings_matrix = get_embedding(params["vocab_size"], params["embed_size"],
                                      vocab, params['vector_path'])

    tf.compat.v1.logging.info("Building the model ...")
    model = PGN(params, embeddings_matrix)

    print("Creating the batcher ...")
    b = batcher(params["data_dir"], vocab, params)

    print("Creating the checkpoint manager")
    checkpoint_dir = "{}".format(params["checkpoint_dir"])
    ckpt = tf.train.Checkpoint(step=tf.Variable(0), PGN=model)
    ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_dir, max_to_keep=11)

    path = params["model_path"] if params["model_path"] else ckpt_manager.latest_checkpoint
    ckpt.restore(path)
    print("Model restored")

    for batch in b:
        yield beam_decode(model, batch, vocab, params)
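# test() above is a generator that yields one beam-decoded result per batch, so a caller
# has to iterate it. A minimal consumption sketch; the output path and the str()
# conversion are illustrative assumptions, not part of the original project.
def run_test(params):
    with open("test_output.txt", "w", encoding="utf-8") as f:
        for decoded in test(params):
            f.write(str(decoded) + "\n")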
def __init__(self, env):
    self.env = env
    self.num_obs = env.observation_space.shape[0]
    self.num_actions = env.action_space.n
    self.network = PGN(self.num_obs, self.num_actions)
    self.gamma = 0.99
    self.lr = 1e-3
    self.train_episodes = 4
    # Use the non-deprecated keyword argument name for the learning rate.
    self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.lr)
    self.print_every = 10
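# The agent above stores gamma for return discounting, but the snippet stops at __init__.
# A common companion is a discounted-return helper like the sketch below; it is an
# assumption about how episode rewards would be processed, not code from the original class.
import numpy as np

def discount_rewards(rewards, gamma=0.99):
    """Compute normalized discounted returns G_t = sum_k gamma^k * r_{t+k}."""
    returns = np.zeros(len(rewards), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    # Normalizing the returns stabilizes policy-gradient updates.
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)
    return returns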
def __init__(self):
    self.DEVICE = config.DEVICE

    dataset = PairDataset(config.data_path,
                          max_src_len=config.max_src_len,
                          max_tgt_len=config.max_tgt_len,
                          truncate_src=config.truncate_src,
                          truncate_tgt=config.truncate_tgt)

    self.vocab = dataset.build_vocab(embed_file=config.embed_file)
    self.model = PGN(self.vocab)
    self.stop_word = list(
        set([
            self.vocab[x.strip()] for x in
            open(config.stop_word_file, encoding='utf-8').readlines()
        ]))
    self.model.load_model()
    self.model.to(self.DEVICE)
def main(args):
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    train_set = TrainImageFolder(args.train_dir)
    data_loader = torch.utils.data.DataLoader(train_set,
                                              batch_size=args.batch_size,
                                              shuffle=True,
                                              num_workers=args.num_workers)
    model = nn.DataParallel(PGN()).cuda()
    # reduction='none' keeps the per-pixel losses (the deprecated reduce=False behavior),
    # which are averaged explicitly below.
    criterion = nn.CrossEntropyLoss(reduction='none').cuda()
    params = list(model.parameters())
    total_step = len(data_loader)

    # Training resumes from epoch 134; the polynomial learning-rate decay is recomputed per epoch.
    for epoch in range(134, args.num_epochs):
        lr_ = lr_poly(args.learning_rate, epoch * total_step,
                      args.num_epochs * total_step, 0.9)
        optimizer = torch.optim.SGD(params,
                                    lr=lr_,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)
        for i, (images, parse) in enumerate(data_loader):
            images = images.cuda()
            parse = parse.long().cuda()
            (parsing_out1, parsing_out2, edge_out1_final, edge_out_res5,
             edge_out_res4, edge_out_res3, edge_out2_final) = model(images)
            # parsing_out1 = model(images)
            loss = criterion(parsing_out1, parse).mean()
            model.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(
                    epoch, args.num_epochs, i, total_step, loss.item()))

            if (i + 1) % args.save_step == 0:
                torch.save(
                    model.state_dict(),
                    os.path.join(args.model_path,
                                 'model-{}-{}.ckpt'.format(epoch + 1, i + 1)))
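# lr_poly() is called in the loop above but not defined in this snippet. The standard
# polynomial ("poly") decay used in semantic-segmentation training looks like the sketch
# below; treat it as an assumption about the helper, not the project's exact code.
def lr_poly(base_lr, iter_, max_iter, power):
    # lr = base_lr * (1 - iter / max_iter) ** power
    return base_lr * ((1.0 - float(iter_) / max_iter) ** power)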
def predict():
    model = nn.DataParallel(PGN()).cuda()
    # Restore the trained weights before decoding; without this the network predicts
    # from randomly initialized parameters.
    model.load_state_dict(torch.load('models/model-134-2539.ckpt'))
    model.eval()  # inference mode: disable dropout, use running batch-norm statistics

    data_dir = 'LIP/testing_images'
    dirs = os.listdir(data_dir)
    for file in dirs:
        image = Image.open(data_dir + '/' + file).convert('RGB')
        a, b = image.size[0], image.size[1]
        image = torch.Tensor(
            np.array(image).astype(np.float32).transpose(
                (2, 0, 1))).unsqueeze(0).cuda()
        # pre_image = Image.fromarray(model(image).cpu().detach().numpy()[0])
        c = Image.fromarray(
            np.argmax(model(image).cpu().detach().numpy()[0],
                      axis=0).astype(np.uint8))
        c = c.resize((a, b), Image.NEAREST)
        # print(np.array(c))
        # save_image = pre_image.resize((a, b), Image.NEAREST)
        c.save('LIP/test_save/' + file[:-4] + '.png', quality=95, subsampling=0)
def __init__(self):
    self.DEVICE = config.device

    # self.dataset = SamplesDataset(config.train_data_path)
    # self.vocab = self.dataset.vocab
    self.vocab = None
    if os.path.exists(config.vocab):
        with open(config.vocab, 'rb') as f:
            self.vocab = pickle.load(f)

    self.dataset = SamplesDataset(config.train_data_path, vocab=self.vocab)
    self.vocab = self.dataset.vocab

    self.model = PGN(self.vocab)
    self.stop_word = list(
        set([
            self.vocab[x.strip()]
            for x in open(config.stop_word_file).readlines()
        ]))
    self.model.load_model()
    self.model.to(self.DEVICE)
def train(dataset, val_dataset, v, start_epoch=0):
    """Train the model, evaluate it and store it.

    Args:
        dataset (dataset.PairDataset): The training dataset.
        val_dataset (dataset.PairDataset): The evaluation dataset.
        v (vocab.Vocab): The vocabulary built from the training dataset.
        start_epoch (int, optional): The starting epoch number. Defaults to 0.
    """
    DEVICE = torch.device("cuda" if config.is_cuda else "cpu")

    model = PGN(v)
    model.load_model()
    model.to(DEVICE)
    if config.fine_tune:
        # In fine-tuning mode, we fix the weights of all parameters except attention.wc.
        print('Fine-tuning mode.')
        for name, params in model.named_parameters():
            if name != 'attention.wc.weight':
                params.requires_grad = False

    # forward
    print("loading data")
    train_data = SampleDataset(dataset.pairs, v)
    val_data = SampleDataset(val_dataset.pairs, v)

    print("initializing optimizer")
    # Define the optimizer.
    optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)
    train_dataloader = DataLoader(dataset=train_data,
                                  batch_size=config.batch_size,
                                  shuffle=True,
                                  collate_fn=collate_fn)

    val_losses = np.inf
    if os.path.exists(config.losses_path):
        with open(config.losses_path, 'rb') as f:
            val_losses = pickle.load(f)

    # torch.cuda.empty_cache()
    # SummaryWriter: Log writer used for TensorboardX visualization.
    writer = SummaryWriter(config.log_path)
    # tqdm: A tool for drawing progress bars during training.
    # scheduled_sampler: A tool for choosing teacher_forcing or not.
    num_epochs = len(range(start_epoch, config.epochs))
    scheduled_sampler = ScheduledSampler(num_epochs)
    if config.scheduled_sampling:
        print('scheduled_sampling mode.')
    # teacher_forcing = True

    with tqdm(total=config.epochs) as epoch_progress:
        for epoch in range(start_epoch, config.epochs):
            print(config_info(config))
            batch_losses = []  # Get loss of each batch.
            num_batches = len(train_dataloader)
            # Set a teacher_forcing signal.
            if config.scheduled_sampling:
                teacher_forcing = scheduled_sampler.teacher_forcing(
                    epoch - start_epoch)
            else:
                teacher_forcing = True
            print('teacher_forcing = {}'.format(teacher_forcing))

            with tqdm(total=num_batches) as batch_progress:
                for batch, data in enumerate(tqdm(train_dataloader)):
                    x, y, x_len, y_len, oov, len_oovs = data
                    assert not np.any(np.isnan(x.numpy()))
                    if config.is_cuda:  # Training with GPUs.
                        x = x.to(DEVICE)
                        y = y.to(DEVICE)
                        x_len = x_len.to(DEVICE)
                        len_oovs = len_oovs.to(DEVICE)

                    model.train()  # Sets the module in training mode.
                    optimizer.zero_grad()  # Clear gradients.
                    # Calculate loss: call model forward propagation.
                    loss = model(x, x_len, y, len_oovs,
                                 batch=batch,
                                 num_batches=num_batches,
                                 teacher_forcing=teacher_forcing)
                    batch_losses.append(loss.item())
                    loss.backward()  # Backpropagation.

                    # Do gradient clipping to prevent gradient explosion.
                    clip_grad_norm_(model.encoder.parameters(),
                                    config.max_grad_norm)
                    clip_grad_norm_(model.decoder.parameters(),
                                    config.max_grad_norm)
                    clip_grad_norm_(model.attention.parameters(),
                                    config.max_grad_norm)
                    optimizer.step()  # Update weights.

                    # Output and record the running loss every 32 batches.
                    if (batch % 32) == 0:
                        batch_progress.set_description(f'Epoch {epoch}')
                        batch_progress.set_postfix(Batch=batch,
                                                   Loss=loss.item())
                        batch_progress.update()
                        # Write loss for tensorboard.
                        writer.add_scalar(f'Average loss for epoch {epoch}',
                                          np.mean(batch_losses),
                                          global_step=batch)

            # Calculate average loss over all batches in an epoch.
            epoch_loss = np.mean(batch_losses)

            epoch_progress.set_description(f'Epoch {epoch}')
            epoch_progress.set_postfix(Loss=epoch_loss)
            epoch_progress.update()

            avg_val_loss = evaluate(model, val_data, epoch)

            print('training loss:{}'.format(epoch_loss),
                  'validation loss:{}'.format(avg_val_loss))

            # Update minimum evaluating loss.
            if avg_val_loss < val_losses:
                torch.save(model.encoder, config.encoder_save_name)
                torch.save(model.decoder, config.decoder_save_name)
                torch.save(model.attention, config.attention_save_name)
                torch.save(model.reduce_state, config.reduce_state_save_name)
                val_losses = avg_val_loss
                with open(config.losses_path, 'wb') as f:
                    pickle.dump(val_losses, f)

    writer.close()
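# ScheduledSampler above decides per epoch whether to use teacher forcing, but its
# definition is not part of this snippet. A minimal linear-decay sketch, offered as an
# assumption about its behavior: the probability of teacher forcing drops as training
# progresses through the given number of phases.
import random

class ScheduledSampler:
    def __init__(self, phases):
        self.phases = phases

    def teacher_forcing(self, phase):
        """Return True with a probability that decays linearly over the phases."""
        p_teacher_forcing = 1.0 - phase / max(self.phases, 1)
        return random.random() < p_teacher_forcing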
def train(dataset, val_dataset, v, start_epoch=0):
    """Train the model, evaluate it and store it.

    Args:
        dataset (dataset.PairDataset): The training dataset.
        val_dataset (dataset.PairDataset): The evaluation dataset.
        v (vocab.Vocab): The vocabulary built from the training dataset.
        start_epoch (int, optional): The starting epoch number. Defaults to 0.
    """
    torch.autograd.set_detect_anomaly(True)
    DEVICE = torch.device("cuda" if config.is_cuda else "cpu")

    model = PGN(v)
    model.load_model()
    model.to(DEVICE)
    if config.fine_tune:
        # In fine-tuning mode, we fix the weights of all parameters except attention.wc.
        logging.info('Fine-tuning mode.')
        for name, params in model.named_parameters():
            if name != 'attention.wc.weight':
                params.requires_grad = False

    # forward
    logging.info("loading data")
    train_data = dataset
    val_data = val_dataset

    logging.info("initializing optimizer")
    # Define the optimizer.
    # optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)
    optimizer = optim.Adagrad(
        model.parameters(),
        lr=config.learning_rate,
        initial_accumulator_value=config.initial_accumulator_value)
    scheduler = StepLR(optimizer, step_size=10, gamma=0.2)  # learning-rate schedule
    train_dataloader = DataLoader(dataset=train_data,
                                  batch_size=config.batch_size,
                                  shuffle=True,
                                  collate_fn=collate_fn)

    val_loss = np.inf
    if os.path.exists(config.losses_path):
        with open(config.losses_path, 'r') as f:
            val_loss = float(f.readlines()[-1].split("=")[-1])
            logging.info("the last best val loss is: " + str(val_loss))

    # torch.cuda.empty_cache()
    # SummaryWriter: Log writer used for TensorboardX visualization.
    writer = SummaryWriter(config.log_path)
    # tqdm: A tool for drawing progress bars during training.
    early_stopping_count = 0
    logging.info("start training model {}, ".format(config.model_name) +
                 "epoch : {}, ".format(config.epochs) +
                 "batch_size : {}, ".format(config.batch_size) +
                 "num batches: {}, ".format(len(train_dataloader)))
    for epoch in range(start_epoch, config.epochs):
        batch_losses = []  # Get loss of each batch.
        num_batches = len(train_dataloader)
        # with tqdm(total=num_batches//100) as batch_progress:
        for batch, data in enumerate(train_dataloader):
            x, y, x_len, y_len, oov, len_oovs, img_vec = data
            assert not np.any(np.isnan(x.numpy()))
            if config.is_cuda:  # Training with GPUs.
                x = x.to(DEVICE)
                y = y.to(DEVICE)
                x_len = x_len.to(DEVICE)
                len_oovs = len_oovs.to(DEVICE)
                img_vec = img_vec.to(DEVICE)
            if batch == 0:
                logging.info("x: %s, shape: %s" % (x, x.shape))
                logging.info("y: %s, shape: %s" % (y, y.shape))
                logging.info("oov: %s" % oov)
                logging.info("img_vec: %s, shape: %s" % (img_vec, img_vec.shape))

            model.train()  # Sets the module in training mode.
            optimizer.zero_grad()  # Clear gradients.
            loss = model(x, y, len_oovs, img_vec,
                         batch=batch,
                         num_batches=num_batches)
            batch_losses.append(loss.item())
            loss.backward()  # Backpropagation.

            # Do gradient clipping to prevent gradient explosion.
            clip_grad_norm_(model.encoder.parameters(), config.max_grad_norm)
            clip_grad_norm_(model.decoder.parameters(), config.max_grad_norm)
            clip_grad_norm_(model.attention.parameters(), config.max_grad_norm)
            clip_grad_norm_(model.reduce_state.parameters(), config.max_grad_norm)
            optimizer.step()  # Update weights.
            # scheduler.step()

            # Output and record the running loss every 100 batches.
            if (batch % 100) == 0:
                # batch_progress.set_description(f'Epoch {epoch}')
                # batch_progress.set_postfix(Batch=batch, Loss=loss.item())
                # batch_progress.update()
                # Write loss for tensorboard.
                writer.add_scalar(f'Average_loss_for_epoch_{epoch}',
                                  np.mean(batch_losses),
                                  global_step=batch)
                logging.info('epoch: {}, batch:{}, training loss:{}'.format(
                    epoch, batch, np.mean(batch_losses)))

        # Calculate average loss over all batches in an epoch.
        epoch_loss = np.mean(batch_losses)
        # epoch_progress.set_description(f'Epoch {epoch}')
        # epoch_progress.set_postfix(Loss=epoch_loss)
        # epoch_progress.update()

        avg_val_loss = evaluate(model, val_data, epoch)

        logging.info('epoch: {} '.format(epoch) +
                     'training loss:{} '.format(epoch_loss) +
                     'validation loss:{} '.format(avg_val_loss))

        # Update minimum evaluating loss.
        if not os.path.exists(os.path.dirname(config.encoder_save_name)):
            os.mkdir(os.path.dirname(config.encoder_save_name))
        if avg_val_loss < val_loss:
            logging.info("saving model to ../saved_model/ %s" % config.model_name)
            torch.save(model.encoder, config.encoder_save_name)
            torch.save(model.decoder, config.decoder_save_name)
            torch.save(model.attention, config.attention_save_name)
            torch.save(model.reduce_state, config.reduce_state_save_name)
            val_loss = avg_val_loss
            with open(config.losses_path, 'a') as f:
                f.write(f"best val loss={val_loss}\n")
        else:
            early_stopping_count += 1
            if early_stopping_count >= config.patience:
                logging.info(
                    f'Validation loss did not decrease for {config.patience} epochs, stop training.'
                )
                break

    writer.close()
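# evaluate() is called in the loop above but not shown. Below is a minimal sketch of a
# validation pass, assuming the same collate_fn, config object, and model forward
# signature as the training code; the original project may compute its validation loss
# differently.
def evaluate(model, val_data, epoch):
    DEVICE = torch.device("cuda" if config.is_cuda else "cpu")
    val_dataloader = DataLoader(dataset=val_data,
                                batch_size=config.batch_size,
                                shuffle=False,
                                collate_fn=collate_fn)
    model.eval()  # disable training-only behavior such as dropout
    val_losses = []
    with torch.no_grad():
        for batch, data in enumerate(val_dataloader):
            x, y, x_len, y_len, oov, len_oovs, img_vec = data
            if config.is_cuda:
                x, y = x.to(DEVICE), y.to(DEVICE)
                len_oovs = len_oovs.to(DEVICE)
                img_vec = img_vec.to(DEVICE)
            loss = model(x, y, len_oovs, img_vec,
                         batch=batch, num_batches=len(val_dataloader))
            val_losses.append(loss.item())
    return np.mean(val_losses)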
val_tf_dataset = tf.data.Dataset.from_tensor_slices(
    (val_extended_input_tokens, val_extended_gt_tokens, val_loss_mask,
     val_index)).batch(int(global_batch_size))
val_dist_dataset = train_strategy.experimental_distribute_dataset(val_tf_dataset)

max_oovs_in_text = max(0,
                       np.max(extended_input_tokens) - vocab.size() + 1,
                       np.max(val_extended_input_tokens) - vocab.size() + 1)
print('Max oovs in text :', max_oovs_in_text)

#################################################################################################
# Create the model and loss layers, and define the function for distributed training
#################################################################################################
with train_strategy.scope():
    model = PGN(vocab=vocab, max_oovs_in_text=max_oovs_in_text)
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001,
                                         beta_1=0.9,
                                         beta_2=0.98,
                                         epsilon=1e-9)
    ce_loss = CELoss(alpha=1.)

    # def train_step(inputs):
    def pretrain_step(extended_input_tokens, extended_gt_tokens, loss_mask, idx):
        model.switch_decoding_mode('cross_entropy')
        with tf.GradientTape() as tape:
            gt_probs, greedy_seqs, coverage_losses = model(extended_input_tokens,
                                                           extended_gt_tokens,
                                                           training=True)
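# pretrain_step above is defined inside train_strategy.scope() and the snippet is cut off
# before the loss and gradient update. The sketch below shows the usual way such a
# per-replica step is dispatched and reduced with tf.distribute; it assumes pretrain_step
# returns a per-replica loss once completed, which is not shown in the original fragment.
@tf.function
def distributed_pretrain_step(extended_input_tokens, extended_gt_tokens, loss_mask, idx):
    per_replica_losses = train_strategy.run(
        pretrain_step,
        args=(extended_input_tokens, extended_gt_tokens, loss_mask, idx))
    # Sum the per-replica losses into a single scalar for logging.
    return train_strategy.reduce(tf.distribute.ReduceOp.SUM,
                                 per_replica_losses,
                                 axis=None)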