def train(params):
    assert params["mode"].lower() == "train", "change training mode to 'train'"

    print("Creating the vocab from :", params["vocab_path"])
    vocab = Vocab(params["vocab_path"], params["vocab_size"])

    print("Creating the embedding_matrix from:", params["vector_path"])
    embeddings_matrix = get_embedding(params["vocab_size"],
                                      params["embed_size"], vocab,
                                      params['vector_path'])

    tf.compat.v1.logging.info("Building the model ...")
    model = PGN(params, embeddings_matrix)

    print("Creating the batcher ...")
    b = batcher(params["data_dir"], vocab, params)

    print("Creating the checkpoint manager")
    checkpoint_dir = "{}".format(params["checkpoint_dir"])
    ckpt = tf.train.Checkpoint(step=tf.Variable(0), PGN=model)
    ckpt_manager = tf.train.CheckpointManager(ckpt,
                                              checkpoint_dir,
                                              max_to_keep=11)

    ckpt.restore(ckpt_manager.latest_checkpoint)
    if ckpt_manager.latest_checkpoint:
        print("Restored from {}".format(ckpt_manager.latest_checkpoint))
    else:
        print("Initializing from scratch.")
    tf.compat.v1.logging.info("Starting the training ...")
    train_model(model, b, params, ckpt, ckpt_manager, "output.txt")
def train(params):
    assert params["mode"].lower() == "train", "change training mode to 'train'"

    tf.compat.v1.logging.info("Building the model ...")
    model = PGN(params)

    tf.compat.v1.logging.info("Creating the batcher ...")
    b = batcher(params["data_dir"], params["vocab_path"], params)

    tf.compat.v1.logging.info("Creating the checkpoint manager")
    logdir = "{}/logdir".format(params["model_dir"])
    checkpoint_dir = "{}/checkpoint".format(params["model_dir"])
    ckpt = tf.train.Checkpoint(step=tf.Variable(0), PGN=model)
    ckpt_manager = tf.train.CheckpointManager(ckpt,
                                              checkpoint_dir,
                                              max_to_keep=11)

    ckpt.restore(ckpt_manager.latest_checkpoint)
    if ckpt_manager.latest_checkpoint:
        print("Restored from {}".format(ckpt_manager.latest_checkpoint))
    else:
        print("Initializing from scratch.")

    tf.compat.v1.logging.info("Starting the training ...")
    train_model(model, b, params, ckpt, ckpt_manager)
def test(params):
    assert params["mode"].lower() in [
        "test", "eval"
    ], "change training mode to 'test' or 'eval'"
    print(params["beam_size"], params["batch_size"])
    assert params["beam_size"] == params[
        "batch_size"], "Beam size must be equal to batch_size, change the params"

    print("Creating the vocab ...")
    vocab = Vocab(params["vocab_path"], params["vocab_size"])

    embeddings_matrix = get_embedding(params["vocab_size"],
                                      params["embed_size"], vocab,
                                      params['vector_path'])

    tf.compat.v1.logging.info("Building the model ...")
    model = PGN(params, embeddings_matrix)

    print("Creating the batcher ...")
    b = batcher(params["data_dir"], vocab, params)

    print("Creating the checkpoint manager")
    checkpoint_dir = "{}".format(params["checkpoint_dir"])
    ckpt = tf.train.Checkpoint(step=tf.Variable(0), PGN=model)
    ckpt_manager = tf.train.CheckpointManager(ckpt,
                                              checkpoint_dir,
                                              max_to_keep=11)

    path = params["model_path"] if params[
        "model_path"] else ckpt_manager.latest_checkpoint
    ckpt.restore(path)
    print("Model restored")

    for batch in b:
        yield beam_decode(model, batch, vocab, params)
Beispiel #4
0
 def __init__(self, env):
     self.env = env
     self.num_obs = env.observation_space.shape[0]
     self.num_actions = env.action_space.n
     self.network = PGN(self.num_obs, self.num_actions)
     self.gamma = 0.99
     self.lr = 1e-3
     self.train_episodes = 4
     self.optimizer = tf.keras.optimizers.Adam(lr=self.lr)
     self.print_every = 10
    def __init__(self):
        self.DEVICE = config.DEVICE

        dataset = PairDataset(config.data_path,
                              max_src_len=config.max_src_len,
                              max_tgt_len=config.max_tgt_len,
                              truncate_src=config.truncate_src,
                              truncate_tgt=config.truncate_tgt)

        self.vocab = dataset.build_vocab(embed_file=config.embed_file)

        self.model = PGN(self.vocab)
        self.stop_word = list(
            set([
                self.vocab[x.strip()] for x in open(
                    config.stop_word_file, encoding='utf-8').readlines()
            ]))
        self.model.load_model()
        self.model.to(self.DEVICE)
Beispiel #6
0
def main(args):
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)
    train_set = TrainImageFolder(args.train_dir)
    data_loader = torch.utils.data.DataLoader(train_set,
                                              batch_size=args.batch_size,
                                              shuffle=True,
                                              num_workers=args.num_workers)
    model = nn.DataParallel(PGN()).cuda()
    criterion = nn.CrossEntropyLoss(reduce=False).cuda()
    params = list(model.parameters())

    total_step = len(data_loader)
    for epoch in range(134, args.num_epochs):
        lr_ = lr_poly(args.learning_rate, epoch * total_step,
                      args.num_epochs * total_step, 0.9)
        optimizer = torch.optim.SGD(params,
                                    lr=lr_,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)
        for i, (images, parse) in enumerate(data_loader):
            images = images.cuda()
            parse = parse.long().cuda()
            parsing_out1, parsing_out2, edge_out1_final, edge_out_res5, edge_out_res4, edge_out_res3, edge_out2_final = model(
                images)
            #parsing_out1=model(images)
            loss = criterion(parsing_out1, parse).mean()
            model.zero_grad()

            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(
                    epoch, args.num_epochs, i, total_step, loss.item()))
            if (i + 1) % args.save_step == 0:
                torch.save(
                    model.state_dict(),
                    os.path.join(args.model_path,
                                 'model-{}-{}.ckpt'.format(epoch + 1, i + 1)))
Beispiel #7
0
def predict():
    model = nn.DataParallel(PGN()).cuda()
    #model.load_state_dict(torch.load('models/model-134-2539.ckpt'))
    data_dir = 'LIP/testing_images'
    dirs = os.listdir(data_dir)
    for file in dirs:
        image = Image.open(data_dir + '/' + file).convert('RGB')
        a, b = image.size[0], image.size[1]
        image = torch.Tensor(
            np.array(image).astype(np.float32).transpose(
                (2, 0, 1))).unsqueeze(0).cuda()
        #pre_image=Image.fromarray(model(image).cpu().detach().numpy()[0])
        c = Image.fromarray(
            np.argmax(model(image).cpu().detach().numpy()[0],
                      axis=0).astype(np.uint8))
        c = c.resize((a, b), Image.NEAREST)
        #print(np.array(c))

        # save_image=pre_image.resize((a,b),Image.NEAREST)
        c.save('LIP/test_save/' + file[:-4] + '.png',
               quality=95,
               subsampling=0)
    def __init__(self):
        self.DEVICE = config.device

        #         self.dataset = SamplesDataset(config.train_data_path)

        #         self.vocab = self.dataset.vocab
        self.vocab = None
        if (os.path.exists(config.vocab)):
            with open(config.vocab, 'rb') as f:
                self.vocab = pickle.load(f)

        self.dataset = SamplesDataset(config.train_data_path, vocab=self.vocab)
        self.vocab = self.dataset.vocab

        self.model = PGN(self.vocab)
        self.stop_word = list(
            set([
                self.vocab[x.strip()]
                for x in open(config.stop_word_file).readlines()
            ]))
        self.model.load_model()
        self.model.to(self.DEVICE)
Beispiel #9
0
def train(dataset, val_dataset, v, start_epoch=0):
    """Train the model, evaluate it and store it.

    Args:
        dataset (dataset.PairDataset): The training dataset.
        val_dataset (dataset.PairDataset): The evaluation dataset.
        v (vocab.Vocab): The vocabulary built from the training dataset.
        start_epoch (int, optional): The starting epoch number. Defaults to 0.
    """

    DEVICE = torch.device("cuda" if config.is_cuda else "cpu")

    model = PGN(v)
    model.load_model()
    model.to(DEVICE)
    if config.fine_tune:
        # In fine-tuning mode, we fix the weights of all parameters except attention.wc.
        print('Fine-tuning mode.')
        for name, params in model.named_parameters():
            if name != 'attention.wc.weight':
                params.requires_grad = False
    # forward
    print("loading data")
    train_data = SampleDataset(dataset.pairs, v)
    val_data = SampleDataset(val_dataset.pairs, v)

    print("initializing optimizer")

    # Define the optimizer.
    optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)
    train_dataloader = DataLoader(dataset=train_data,
                                  batch_size=config.batch_size,
                                  shuffle=True,
                                  collate_fn=collate_fn)

    val_losses = np.inf
    if (os.path.exists(config.losses_path)):
        with open(config.losses_path, 'rb') as f:
            val_losses = pickle.load(f)


#     torch.cuda.empty_cache()
# SummaryWriter: Log writer used for TensorboardX visualization.
    writer = SummaryWriter(config.log_path)
    # tqdm: A tool for drawing progress bars during training.
    # scheduled_sampler : A tool for choosing teacher_forcing or not
    num_epochs = len(range(start_epoch, config.epochs))
    scheduled_sampler = ScheduledSampler(num_epochs)
    if config.scheduled_sampling:
        print('scheduled_sampling mode.')
    #  teacher_forcing = True

    with tqdm(total=config.epochs) as epoch_progress:
        for epoch in range(start_epoch, config.epochs):
            print(config_info(config))
            batch_losses = []  # Get loss of each batch.
            num_batches = len(train_dataloader)
            # set a teacher_forcing signal
            if config.scheduled_sampling:
                teacher_forcing = scheduled_sampler.teacher_forcing(
                    epoch - start_epoch)
            else:
                teacher_forcing = True
            print('teacher_forcing = {}'.format(teacher_forcing))
            with tqdm(total=num_batches) as batch_progress:
                for batch, data in enumerate(tqdm(train_dataloader)):
                    x, y, x_len, y_len, oov, len_oovs = data
                    assert not np.any(np.isnan(x.numpy()))
                    if config.is_cuda:  # Training with GPUs.
                        x = x.to(DEVICE)
                        y = y.to(DEVICE)
                        x_len = x_len.to(DEVICE)
                        len_oovs = len_oovs.to(DEVICE)

                    model.train()  # Sets the module in training mode.
                    optimizer.zero_grad()  # Clear gradients.
                    # Calculate loss.  Call model forward propagation
                    loss = model(x,
                                 x_len,
                                 y,
                                 len_oovs,
                                 batch=batch,
                                 num_batches=num_batches,
                                 teacher_forcing=teacher_forcing)
                    batch_losses.append(loss.item())
                    loss.backward()  # Backpropagation.

                    # Do gradient clipping to prevent gradient explosion.
                    clip_grad_norm_(model.encoder.parameters(),
                                    config.max_grad_norm)
                    clip_grad_norm_(model.decoder.parameters(),
                                    config.max_grad_norm)
                    clip_grad_norm_(model.attention.parameters(),
                                    config.max_grad_norm)
                    optimizer.step()  # Update weights.

                    # Output and record epoch loss every 100 batches.
                    if (batch % 32) == 0:
                        batch_progress.set_description(f'Epoch {epoch}')
                        batch_progress.set_postfix(Batch=batch,
                                                   Loss=loss.item())
                        batch_progress.update()
                        # Write loss for tensorboard.
                        writer.add_scalar(f'Average loss for epoch {epoch}',
                                          np.mean(batch_losses),
                                          global_step=batch)
            # Calculate average loss over all batches in an epoch.
            epoch_loss = np.mean(batch_losses)

            epoch_progress.set_description(f'Epoch {epoch}')
            epoch_progress.set_postfix(Loss=epoch_loss)
            epoch_progress.update()

            avg_val_loss = evaluate(model, val_data, epoch)

            print('training loss:{}'.format(epoch_loss),
                  'validation loss:{}'.format(avg_val_loss))

            # Update minimum evaluating loss.
            if (avg_val_loss < val_losses):
                torch.save(model.encoder, config.encoder_save_name)
                torch.save(model.decoder, config.decoder_save_name)
                torch.save(model.attention, config.attention_save_name)
                torch.save(model.reduce_state, config.reduce_state_save_name)
                val_losses = avg_val_loss
            with open(config.losses_path, 'wb') as f:
                pickle.dump(val_losses, f)

    writer.close()
def train(dataset, val_dataset, v, start_epoch=0):
    """Train the model, evaluate it and store it.

    Args:
        dataset (dataset.PairDataset): The training dataset.
        val_dataset (dataset.PairDataset): The evaluation dataset.
        v (vocab.Vocab): The vocabulary built from the training dataset.
        start_epoch (int, optional): The starting epoch number. Defaults to 0.
    """
    torch.autograd.set_detect_anomaly(True)
    DEVICE = torch.device("cuda" if config.is_cuda else "cpu")

    model = PGN(v)
    model.load_model()
    model.to(DEVICE)
    if config.fine_tune:
        # In fine-tuning mode, we fix the weights of all parameters except attention.wc.
        logging.info('Fine-tuning mode.')
        for name, params in model.named_parameters():
            if name != 'attention.wc.weight':
                params.requires_grad = False
    # forward
    logging.info("loading data")
    train_data = dataset
    val_data = val_dataset

    logging.info("initializing optimizer")

    # Define the optimizer.
    #     optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)
    optimizer = optim.Adagrad(
        model.parameters(),
        lr=config.learning_rate,
        initial_accumulator_value=config.initial_accumulator_value)
    scheduler = StepLR(optimizer, step_size=10, gamma=0.2)  # 学习率调整
    train_dataloader = DataLoader(dataset=train_data,
                                  batch_size=config.batch_size,
                                  shuffle=True,
                                  collate_fn=collate_fn)

    val_loss = np.inf
    if (os.path.exists(config.losses_path)):
        with open(config.losses_path, 'r') as f:
            val_loss = float(f.readlines()[-1].split("=")[-1])
            logging.info("the last best val loss is: " + str(val_loss))


#     torch.cuda.empty_cache()
# SummaryWriter: Log writer used for TensorboardX visualization.
    writer = SummaryWriter(config.log_path)
    # tqdm: A tool for drawing progress bars during training.
    early_stopping_count = 0

    logging.info("start training model {}, ".format(config.model_name) + \
        "epoch : {}, ".format(config.epochs) +
        "batch_size : {}, ".format(config.batch_size) +
        "num batches: {}, ".format(len(train_dataloader)))

    for epoch in range(start_epoch, config.epochs):
        batch_losses = []  # Get loss of each batch.
        num_batches = len(train_dataloader)
        #             with tqdm(total=num_batches//100) as batch_progress:
        for batch, data in enumerate(train_dataloader):
            x, y, x_len, y_len, oov, len_oovs, img_vec = data
            assert not np.any(np.isnan(x.numpy()))
            if config.is_cuda:  # Training with GPUs.
                x = x.to(DEVICE)
                y = y.to(DEVICE)
                x_len = x_len.to(DEVICE)
                len_oovs = len_oovs.to(DEVICE)
                img_vec = img_vec.to(DEVICE)
            if batch == 0:
                logging.info("x: %s, shape: %s" % (x, x.shape))
                logging.info("y: %s, shape: %s" % (y, y.shape))
                logging.info("oov: %s" % oov)
                logging.info("img_vec: %s, shape: %s" %
                             (img_vec, img_vec.shape))

            model.train()  # Sets the module in training mode.
            optimizer.zero_grad()  # Clear gradients.

            loss = model(x,
                         y,
                         len_oovs,
                         img_vec,
                         batch=batch,
                         num_batches=num_batches)
            batch_losses.append(loss.item())
            loss.backward()  # Backpropagation.

            # Do gradient clipping to prevent gradient explosion.
            clip_grad_norm_(model.encoder.parameters(), config.max_grad_norm)
            clip_grad_norm_(model.decoder.parameters(), config.max_grad_norm)
            clip_grad_norm_(model.attention.parameters(), config.max_grad_norm)
            clip_grad_norm_(model.reduce_state.parameters(),
                            config.max_grad_norm)
            optimizer.step()  # Update weights.
            #             scheduler.step()

            #                     # Output and record epoch loss every 100 batches.
            if (batch % 100) == 0:
                #                         batch_progress.set_description(f'Epoch {epoch}')
                #                         batch_progress.set_postfix(Batch=batch,
                #                                                    Loss=loss.item())
                #                         batch_progress.update()
                #                         # Write loss for tensorboard.
                writer.add_scalar(f'Average_loss_for_epoch_{epoch}',
                                  np.mean(batch_losses),
                                  global_step=batch)
                logging.info('epoch: {}, batch:{}, training loss:{}'.format(
                    epoch, batch, np.mean(batch_losses)))

        # Calculate average loss over all batches in an epoch.
        epoch_loss = np.mean(batch_losses)

        #             epoch_progress.set_description(f'Epoch {epoch}')
        #             epoch_progress.set_postfix(Loss=epoch_loss)
        #             epoch_progress.update()

        avg_val_loss = evaluate(model, val_data, epoch)

        logging.info('epoch: {} '.format(epoch) +
                     'training loss:{} '.format(epoch_loss) +
                     'validation loss:{} '.format(avg_val_loss))

        # Update minimum evaluating loss.
        if not os.path.exists(os.path.dirname(config.encoder_save_name)):
            os.mkdir(os.path.dirname(config.encoder_save_name))
        if (avg_val_loss < val_loss):
            logging.info("saving model to ../saved_model/ %s" %
                         config.model_name)
            torch.save(model.encoder, config.encoder_save_name)
            torch.save(model.decoder, config.decoder_save_name)
            torch.save(model.attention, config.attention_save_name)
            torch.save(model.reduce_state, config.reduce_state_save_name)
            val_loss = avg_val_loss
            with open(config.losses_path, 'a') as f:
                f.write(f"best val loss={val_loss}\n")
        else:
            early_stopping_count += 1
        if early_stopping_count >= config.patience:
            logging.info(
                f'Validation loss did not decrease for {config.patience} epochs, stop training.'
            )
            break

    writer.close()
Beispiel #11
0
        (val_extended_input_tokens, val_extended_gt_tokens, val_loss_mask,
         val_index)).batch(int(global_batch_size))
    val_dist_dataset = train_strategy.experimental_distribute_dataset(
        val_tf_dataset)

max_oovs_in_text = max(0,
                       np.max(extended_input_tokens) - vocab.size() + 1,
                       np.max(val_extended_input_tokens) - vocab.size() + 1)
print('Max oovs in text :', max_oovs_in_text)

#################################################################################################
# Создаем модель и слои ошибок, определяем функцию для распределенного обучения
#################################################################################################

with train_strategy.scope():
    model = PGN(vocab=vocab, max_oovs_in_text=max_oovs_in_text)
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001,
                                         beta_1=0.9,
                                         beta_2=0.98,
                                         epsilon=1e-9)
    ce_loss = CELoss(alpha=1.)


# def train_step(inputs):
def pretrain_step(extended_input_tokens, extended_gt_tokens, loss_mask, idx):
    model.switch_decoding_mode('cross_entropy')

    with tf.GradientTape() as tape:
        gt_probs, greedy_seqs, coverage_losses = model(extended_input_tokens,
                                                       extended_gt_tokens,
                                                       training=True)