def main():
    # representations = get_embeddings(device).to(device)
    # f=0

    # section: settings
    global best_bleu4, epochs_since_improvement, start_epoch, data_name, word_map

    # section: fine tune
    if args.fine_tune_encoder and args.fine_tune_epochs == -1:
        raise Exception(
            'if "fine_tune_encoder" == true you must also specify "fine_tune_epochs" != -1'
        )

    # section: word map
    if not args.run_local:
        data_f = '/yoav_stg/gshalev/image_captioning/output_folder'
    else:
        data_f = data_folder

    word_map_file = os.path.join(data_f, 'WORDMAP_' + data_name + '.json')
    print('word_map_file: {}'.format(word_map_file))

    print('loading word map from path: {}'.format(word_map_file))
    with open(word_map_file, 'r') as j:
        word_map = json.load(j)
    print('load word map COMPLETED')

    rev_word_map = {v: k for k, v in word_map.items()}

    # section: representation
    representations = get_embeddings(device).to(device)

    # section: not fixed
    if not args.fixed:
        representations.requires_grad = True

    # section: Initialization
    print('run a new model (No args.checkpoint)')
    decoder = DecoderWithoutAttention(attention_dim=300,
                                      embed_dim=300,
                                      decoder_dim=300,
                                      vocab_size=len(word_map),
                                      device=device,
                                      dropout=dropout,
                                      encoder_dim=300)

    decoder_optimizer = torch.optim.Adam(params=filter(
        lambda p: p.requires_grad, decoder.parameters()),
                                         lr=decoder_lr)
    # section: not fixed
    if not args.fixed:
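        # the embedding matrix is trainable here, so register it as an extra
        # parameter group and let the decoder optimizer update it as well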
        decoder_optimizer.add_param_group({'params': representations})

    encoder = Encoder(embeded_dim=300)
    # notice: fine-tune the encoder from the start only if fine-tuning begins at epoch 0
    encoder.fine_tune(args.fine_tune_encoder and args.fine_tune_epochs == 0)
    encoder_optimizer = torch.optim.Adam(
        params=filter(lambda p: p.requires_grad, encoder.parameters()),
        lr=encoder_lr
    ) if args.fine_tune_encoder and args.fine_tune_epochs == 0 else None

    # section: Move to GPU, if available
    decoder = decoder.to(device)
    encoder = encoder.to(device)

    # section: wandb
    if not args.run_local:
        wandb.watch(decoder)

    # section: Loss function
    criterion = nn.CrossEntropyLoss().to(device)

    # section: dataloaders
    train_loader = torch.utils.data.DataLoader(CaptionDataset(
        data_f,
        data_name,
        'TRAIN',
        transform=transforms.Compose([data_normalization])),
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=workers,
                                               pin_memory=True)

    val_loader = torch.utils.data.DataLoader(CaptionDataset(
        data_f,
        data_name,
        'VAL',
        transform=transforms.Compose([data_normalization])),
                                             batch_size=batch_size,
                                             shuffle=True,
                                             num_workers=workers,
                                             pin_memory=True)

    val_loader_for_val = torch.utils.data.DataLoader(CaptionDataset(
        data_f,
        data_name,
        'VAL',
        transform=transforms.Compose([data_normalization])),
                                                     batch_size=1,
                                                     shuffle=True,
                                                     num_workers=workers,
                                                     pin_memory=True)

    # section: Epochs
    print('starting epochs')
    for epoch in range(start_epoch, epochs):

        # section: terminate training after 20 epochs without improvement
        if epochs_since_improvement == 20:
            print('break after : epochs_since_improvement == 20')
            break

        # section: fine tune encoder
        if epoch == args.fine_tune_epochs:
            print('fine tuning after epoch({}) == args.fine_tune_epochs({})'.
                  format(epoch, args.fine_tune_epochs))
            encoder.fine_tune(args.fine_tune_encoder)
            encoder_optimizer = torch.optim.Adam(params=filter(
                lambda p: p.requires_grad, encoder.parameters()),
                                                 lr=encoder_lr)

        # section: adjust LR after 8 epochs without improvement
        if epochs_since_improvement > 0 and epochs_since_improvement % 8 == 0:
            print('!!!  ADJUST LR AFTER : epochs_since_improvement: {}'.format(
                epochs_since_improvement))
            adjust_learning_rate(decoder_optimizer, 0.8)

        # section: train
        print(
            '--------------111111111-----------Start train----------epoch-{}'.
            format(epoch))
        train(train_loader=train_loader,
              encoder=encoder,
              decoder=decoder,
              criterion=criterion,
              encoder_optimizer=encoder_optimizer,
              decoder_optimizer=decoder_optimizer,
              epoch=epoch,
              representations=representations)

        # section: eval
        print(
            '--------------2222222222-----------Start validation----------epoch-{}'
            .format(epoch))
        recent_bleu4 = validate(val_loader=val_loader,
                                encoder=encoder,
                                decoder=decoder,
                                criterion=criterion,
                                rev_word_map=rev_word_map,
                                representations=representations)

        print('9999999999999- recent bleu4 {}'.format(recent_bleu4))
        print(
            '--------------3333333333-----------Start val without teacher forcing----------epoch-{}'
            .format(epoch))
        with torch.no_grad():

            caption_image_beam_search(encoder, decoder, val_loader_for_val,
                                      word_map, rev_word_map, representations)
        print(
            '!@#!@!#!#@!#@!#@ DONE WITH TRAIN VAL AND VAL WITHOUT TEACHER FORCING FOR EPOCH :{}'
            .format(epoch))

        # section: save model if there was an improvement
        is_best = recent_bleu4 > best_bleu4
        best_bleu4 = max(recent_bleu4, best_bleu4)
        if not is_best:
            epochs_since_improvement += 1
            print("\nEpochs since last improvement: %d\n" %
                  (epochs_since_improvement, ))
        else:
            epochs_since_improvement = 0

        save_checkpoint(data_name,
                        epoch,
                        epochs_since_improvement,
                        encoder,
                        decoder,
                        encoder_optimizer,
                        decoder_optimizer,
                        recent_bleu4,
                        is_best,
                        representations=representations,
                        runname=args.runname)
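
A hedged sketch of the adjust_learning_rate helper called inside the epoch loop above; the project's actual implementation is not shown in this snippet, so the version below is an assumption that simply scales every parameter group's learning rate by the given factor, matching the call adjust_learning_rate(decoder_optimizer, 0.8).

def adjust_learning_rate(optimizer, shrink_factor):
    # hypothetical helper: decay the learning rate of every parameter group
    for param_group in optimizer.param_groups:
        param_group['lr'] = param_group['lr'] * shrink_factor
    print("New learning rate: {}".format(optimizer.param_groups[0]['lr']))
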
Example #2
def train(config):
    wandb.init(project="lunar-lander", name="target")

    # EPISODE = 1_000
    STEP = 3_000_000
    EXPERIENCE_REPLAY = 1_000_000
    BATCH_SIZE = 32
    ENTROPY_TERM_COEFFICIENT = 0.2
    # ENTROPY_TERM_COEFFICIENT = 0.002
    GAMMA = 0.99
    POLYAK = 0.995
    LR = 0.001
    START_STEP = 10000

    env = gym.make(config.env)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]

    if not isinstance(env.action_space, gym.spaces.Box):
        raise RuntimeError("action space is not continuous")

    # twin Q-networks (clipped double-Q) plus frozen target copies for bootstrapping
    q1 = Q(state_dim, action_dim).cuda()
    q2 = Q(state_dim, action_dim).cuda()
    q1_target = Q(state_dim, action_dim).cuda()
    q2_target = Q(state_dim, action_dim).cuda()
    q1_target.load_state_dict(q1.state_dict())
    q2_target.load_state_dict(q2.state_dict())
    for p in list(q1_target.parameters()) + list(q2_target.parameters()):
        p.requires_grad = False
    q_opt = torch.optim.Adam(list(q1.parameters()) + list(q2.parameters()), LR)

    policy = Policy(state_dim, action_dim, env.action_space.low,
                    env.action_space.high).cuda()
    policy_opt = torch.optim.Adam(policy.parameters(), LR)

    wandb.watch([q1, q2, policy], log="all", log_freq=10000)

    replay_buffer = ReplayBuffer(EXPERIENCE_REPLAY, state_dim, action_dim)
    episode = 0
    step = 0

    while True:
        episode += 1
        state = env.reset()

        episode_reward = 0
        is_done = False

        while (not is_done) and step < STEP:
            step += 1
            # get action from policy net
            state_tensor = (torch.from_numpy(state).type(
                torch.FloatTensor).unsqueeze(0).cuda())
            with torch.no_grad():
                action = policy(state_tensor)[0].cpu().numpy()

            # take action
            next_state, reward, is_done, _info = env.step(action)
            bonus = 0
            reward += bonus

            # record
            replay_buffer.add(
                state=state,
                action=action,
                reward=reward,
                next_state=next_state,
                is_done=is_done,
            )
            if config.render:
                env.render()
            episode_reward += reward

            # clean up
            state = next_state
            del action, state_tensor, next_state, _info

            # train from replay
            if step < START_STEP:
                continue

            batch = replay_buffer.sample(BATCH_SIZE)

            # update Q
            # Bellman target: Q(s, a) <- r + gamma * (1 - done)
            #                 * (min_i Q_i_target(s', a') - alpha * log pi(a'|s'))

            with torch.no_grad():
                next_action, logp = policy(batch.next_states,
                                           with_logprob=True)

            target = batch.rewards + GAMMA * (1 - batch.is_dones) * (torch.min(
                q1_target(batch.next_states, next_action),
                q2_target(batch.next_states, next_action),
            ) - ENTROPY_TERM_COEFFICIENT * logp)
            del next_action, logp

            # The soft value term is E_{a'~pi}[ Q(s', a') - alpha * log pi(a'|s') ].
            # Since a' is continuous, the expectation is approximated with a single
            # action sampled from the current policy.

            q_loss = F.mse_loss(q1(batch.states, batch.actions),
                                target) + F.mse_loss(
                                    q2(batch.states, batch.actions), target)
            q_opt.zero_grad()
            q_loss.backward()
            q_opt.step()
            del target

            # update Policy, ascend on Q+H
            action, logp = policy(
                batch.states,
                with_logprob=True)  # at this time, differentiable action
            policy_profit = (
                torch.min(q1(batch.states, action), q2(batch.states, action)) -
                ENTROPY_TERM_COEFFICIENT * logp)
            policy_loss = -policy_profit.mean()
            policy_opt.zero_grad()
            policy_loss.backward()
            policy_opt.step()

            # Update target networks by polyak averaging.
            with torch.no_grad():
                for p, p_targ in zip(q1.parameters(), q1_target.parameters()):
                    p_targ.data.mul_(POLYAK)
                    p_targ.data.add_((1 - POLYAK) * p.data)
                for p, p_targ in zip(q2.parameters(), q2_target.parameters()):
                    p_targ.data.mul_(POLYAK)
                    p_targ.data.add_((1 - POLYAK) * p.data)

            wandb.log(dict(reward=reward, q_loss=q_loss.item()), step=step)

            if step % 10000 == 0:
                torch.save(dict(policy=policy.state_dict()), "target.pt")
        print(episode, step, episode_reward)
        wandb.log(dict(episode_reward=episode_reward), step=step)
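
The two explicit loops near the end of the inner training step above implement polyak (exponential moving average) updates of the target networks. A minimal sketch of the same update factored into a reusable helper, assuming torch is imported as in the snippet:

def soft_update(online, target, polyak=0.995):
    # target <- polyak * target + (1 - polyak) * online, without tracking gradients
    with torch.no_grad():
        for p, p_targ in zip(online.parameters(), target.parameters()):
            p_targ.data.mul_(polyak)
            p_targ.data.add_((1 - polyak) * p.data)

# usage inside the loop above, replacing the two explicit for-loops:
#     soft_update(q1, q1_target, POLYAK)
#     soft_update(q2, q2_target, POLYAK)
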
Example #3
    def train(
        self,
        train_dataset,
        output_dir,
        show_running_loss=True,
        eval_data=None,
        verbose=True,
        **kwargs,
    ):
        """
        Trains the model on train_dataset.

        Utility function to be used by the train_model() method. Not intended to be used directly.
        """

        model = self.model
        args = self.args
        device = self.device

        tb_writer = SummaryWriter(logdir=args.tensorboard_dir)
        train_sampler = RandomSampler(train_dataset)
        train_dataloader = DataLoader(
            train_dataset,
            sampler=train_sampler,
            batch_size=args.train_batch_size,
            num_workers=self.args.dataloader_num_workers,
        )

        if args.max_steps > 0:
            t_total = args.max_steps
            args.num_train_epochs = args.max_steps // (
                len(train_dataloader) // args.gradient_accumulation_steps) + 1
        else:
            t_total = len(
                train_dataloader
            ) // args.gradient_accumulation_steps * args.num_train_epochs

        no_decay = ["bias", "LayerNorm.weight"]
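        # by convention, bias and LayerNorm weights are excluded from weight decay;
        # the parameter groups assembled below implement that split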

        optimizer_grouped_parameters = []
        custom_parameter_names = set()
        for group in self.args.custom_parameter_groups:
            params = group.pop("params")
            custom_parameter_names.update(params)
            param_group = {**group}
            param_group["params"] = [
                p for n, p in model.named_parameters() if n in params
            ]
            optimizer_grouped_parameters.append(param_group)

        for group in self.args.custom_layer_parameters:
            layer_number = group.pop("layer")
            layer = f"layer.{layer_number}."
            group_d = {**group}
            group_nd = {**group}
            group_nd["weight_decay"] = 0.0
            params_d = []
            params_nd = []
            for n, p in model.named_parameters():
                if n not in custom_parameter_names and layer in n:
                    if any(nd in n for nd in no_decay):
                        params_nd.append(p)
                    else:
                        params_d.append(p)
                    custom_parameter_names.add(n)
            group_d["params"] = params_d
            group_nd["params"] = params_nd

            optimizer_grouped_parameters.append(group_d)
            optimizer_grouped_parameters.append(group_nd)

        if not self.args.train_custom_parameters_only:
            optimizer_grouped_parameters.extend([
                {
                    "params": [
                        p for n, p in model.named_parameters()
                        if n not in custom_parameter_names and not any(
                            nd in n for nd in no_decay)
                    ],
                    "weight_decay":
                    args.weight_decay,
                },
                {
                    "params": [
                        p for n, p in model.named_parameters()
                        if n not in custom_parameter_names and any(
                            nd in n for nd in no_decay)
                    ],
                    "weight_decay":
                    0.0,
                },
            ])

        warmup_steps = math.ceil(t_total * args.warmup_ratio)
        args.warmup_steps = warmup_steps if args.warmup_steps == 0 else args.warmup_steps
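        # warmup_ratio is converted into an absolute step count above; an explicitly
        # configured warmup_steps value takes precedence over the ratio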

        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=args.warmup_steps,
            num_training_steps=t_total)

        if (args.model_name and os.path.isfile(
                os.path.join(args.model_name, "optimizer.pt"))
                and os.path.isfile(
                    os.path.join(args.model_name, "scheduler.pt"))):
            # Load in optimizer and scheduler states
            optimizer.load_state_dict(
                torch.load(os.path.join(args.model_name, "optimizer.pt")))
            scheduler.load_state_dict(
                torch.load(os.path.join(args.model_name, "scheduler.pt")))

        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        logger.info(" Training started")

        global_step = 0
        training_progress_scores = None
        tr_loss, logging_loss = 0.0, 0.0
        model.zero_grad()
        train_iterator = trange(int(args.num_train_epochs),
                                desc="Epoch",
                                disable=args.silent,
                                mininterval=0)
        epoch_number = 0
        best_eval_metric = None
        early_stopping_counter = 0
        steps_trained_in_current_epoch = 0
        epochs_trained = 0

        if args.model_name and os.path.exists(args.model_name):
            try:
                # set global_step to global_step of last saved checkpoint from model path
                checkpoint_suffix = args.model_name.split("/")[-1].split("-")
                if len(checkpoint_suffix) > 2:
                    checkpoint_suffix = checkpoint_suffix[1]
                else:
                    checkpoint_suffix = checkpoint_suffix[-1]
                global_step = int(checkpoint_suffix)
                epochs_trained = global_step // (
                    len(train_dataloader) // args.gradient_accumulation_steps)
                steps_trained_in_current_epoch = global_step % (
                    len(train_dataloader) // args.gradient_accumulation_steps)

                logger.info(
                    "   Continuing training from checkpoint, will skip to saved global_step"
                )
                logger.info("   Continuing training from epoch %d",
                            epochs_trained)
                logger.info("   Continuing training from global step %d",
                            global_step)
                logger.info(
                    "   Will skip the first %d steps in the current epoch",
                    steps_trained_in_current_epoch)
            except ValueError:
                logger.info("   Starting fine-tuning.")

        if args.evaluate_during_training:
            training_progress_scores = self._create_training_progress_scores(
                **kwargs)

        if args.wandb_project:
            wandb.init(project=args.wandb_project,
                       config={**asdict(args)},
                       **args.wandb_kwargs)
            wandb.watch(self.model)

        if args.fp16:
            from torch.cuda import amp

            scaler = amp.GradScaler()
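            # the scaler rescales the loss so fp16 gradients do not underflow; see the
            # scaler.scale()/unscale_()/step()/update() calls further down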

        for current_epoch in train_iterator:
            model.train()
            if epochs_trained > 0:
                epochs_trained -= 1
                continue
            train_iterator.set_description(
                f"Epoch {epoch_number + 1} of {args.num_train_epochs}")
            batch_iterator = tqdm(
                train_dataloader,
                desc=f"Running Epoch {epoch_number} of {args.num_train_epochs}",
                disable=args.silent,
                mininterval=0,
            )
            for step, batch in enumerate(batch_iterator):
                if steps_trained_in_current_epoch > 0:
                    steps_trained_in_current_epoch -= 1
                    continue
                batch = tuple(t.to(device) for t in batch)

                inputs = self._get_inputs_dict(batch)
                if args.fp16:
                    with amp.autocast():
                        outputs = model(**inputs)
                        # model outputs are always tuple in pytorch-transformers (see doc)
                        loss = outputs[0]
                else:
                    outputs = model(**inputs)
                    # model outputs are always tuple in pytorch-transformers (see doc)
                    loss = outputs[0]

                if args.n_gpu > 1:
                    loss = loss.mean(
                    )  # mean() to average on multi-gpu parallel training

                current_loss = loss.item()

                if show_running_loss:
                    batch_iterator.set_description(
                        f"Epochs {epoch_number}/{args.num_train_epochs}. Running Loss: {current_loss:9.4f}"
                    )

                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
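                # each backward() then contributes a 1/N-scaled gradient; the optimizer
                # only steps once every N micro-batches (see the check below)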

                if args.fp16:
                    scaler.scale(loss).backward()
                else:
                    loss.backward()

                tr_loss += loss.item()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        scaler.unscale_(optimizer)
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                    if args.fp16:
                        scaler.step(optimizer)
                        scaler.update()
                    else:
                        optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

                    if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                        # Log metrics
                        tb_writer.add_scalar("lr",
                                             scheduler.get_last_lr()[0],
                                             global_step)
                        tb_writer.add_scalar("loss", (tr_loss - logging_loss) /
                                             args.logging_steps, global_step)
                        logging_loss = tr_loss
                        if args.wandb_project:
                            wandb.log({
                                "Training loss": current_loss,
                                "lr": scheduler.get_last_lr()[0],
                                "global_step": global_step,
                            })

                    if args.save_steps > 0 and global_step % args.save_steps == 0:
                        # Save model checkpoint
                        output_dir_current = os.path.join(
                            output_dir, "checkpoint-{}".format(global_step))

                        self.save_model(output_dir_current,
                                        optimizer,
                                        scheduler,
                                        model=model)

                    if args.evaluate_during_training and (
                            args.evaluate_during_training_steps > 0
                            and global_step %
                            args.evaluate_during_training_steps == 0):
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results = self.eval_model(
                            eval_data,
                            verbose=verbose
                            and args.evaluate_during_training_verbose,
                            silent=args.evaluate_during_training_silent,
                            **kwargs,
                        )
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value,
                                                 global_step)

                        output_dir_current = os.path.join(
                            output_dir, "checkpoint-{}".format(global_step))

                        if args.save_eval_checkpoints:
                            self.save_model(output_dir_current,
                                            optimizer,
                                            scheduler,
                                            model=model,
                                            results=results)

                        training_progress_scores["global_step"].append(
                            global_step)
                        training_progress_scores["train_loss"].append(
                            current_loss)
                        for key in results:
                            training_progress_scores[key].append(results[key])
                        report = pd.DataFrame(training_progress_scores)
                        report.to_csv(
                            os.path.join(args.output_dir,
                                         "training_progress_scores.csv"),
                            index=False,
                        )

                        if args.wandb_project:
                            wandb.log(
                                self._get_last_metrics(
                                    training_progress_scores))

                        if not best_eval_metric:
                            best_eval_metric = results[
                                args.early_stopping_metric]
                            self.save_model(args.best_model_dir,
                                            optimizer,
                                            scheduler,
                                            model=model,
                                            results=results)
                        if best_eval_metric and args.early_stopping_metric_minimize:
                            if results[
                                    args.
                                    early_stopping_metric] - best_eval_metric < args.early_stopping_delta:
                                best_eval_metric = results[
                                    args.early_stopping_metric]
                                self.save_model(args.best_model_dir,
                                                optimizer,
                                                scheduler,
                                                model=model,
                                                results=results)
                                early_stopping_counter = 0
                            else:
                                if args.use_early_stopping:
                                    if early_stopping_counter < args.early_stopping_patience:
                                        early_stopping_counter += 1
                                        if verbose:
                                            logger.info(
                                                f" No improvement in {args.early_stopping_metric}"
                                            )
                                            logger.info(
                                                f" Current step: {early_stopping_counter}"
                                            )
                                            logger.info(
                                                f" Early stopping patience: {args.early_stopping_patience}"
                                            )
                                    else:
                                        if verbose:
                                            logger.info(
                                                f" Patience of {args.early_stopping_patience} steps reached"
                                            )
                                            logger.info(
                                                " Training terminated.")
                                            train_iterator.close()
                                        return (
                                            global_step,
                                            tr_loss / global_step if not self.
                                            args.evaluate_during_training else
                                            training_progress_scores,
                                        )
                        else:
                            if results[
                                    args.
                                    early_stopping_metric] - best_eval_metric > args.early_stopping_delta:
                                best_eval_metric = results[
                                    args.early_stopping_metric]
                                self.save_model(args.best_model_dir,
                                                optimizer,
                                                scheduler,
                                                model=model,
                                                results=results)
                                early_stopping_counter = 0
                            else:
                                if args.use_early_stopping:
                                    if early_stopping_counter < args.early_stopping_patience:
                                        early_stopping_counter += 1
                                        if verbose:
                                            logger.info(
                                                f" No improvement in {args.early_stopping_metric}"
                                            )
                                            logger.info(
                                                f" Current step: {early_stopping_counter}"
                                            )
                                            logger.info(
                                                f" Early stopping patience: {args.early_stopping_patience}"
                                            )
                                    else:
                                        if verbose:
                                            logger.info(
                                                f" Patience of {args.early_stopping_patience} steps reached"
                                            )
                                            logger.info(
                                                " Training terminated.")
                                            train_iterator.close()
                                        return (
                                            global_step,
                                            tr_loss / global_step if not self.
                                            args.evaluate_during_training else
                                            training_progress_scores,
                                        )

            epoch_number += 1
            output_dir_current = os.path.join(
                output_dir,
                "checkpoint-{}-epoch-{}".format(global_step, epoch_number))

            if args.save_model_every_epoch or args.evaluate_during_training:
                os.makedirs(output_dir_current, exist_ok=True)

            if args.save_model_every_epoch:
                self.save_model(output_dir_current,
                                optimizer,
                                scheduler,
                                model=model)

            if args.evaluate_during_training and args.evaluate_each_epoch:
                results = self.eval_model(
                    eval_data,
                    verbose=verbose and args.evaluate_during_training_verbose,
                    silent=args.evaluate_during_training_silent,
                    **kwargs,
                )

                self.save_model(output_dir_current,
                                optimizer,
                                scheduler,
                                results=results)

                training_progress_scores["global_step"].append(global_step)
                training_progress_scores["train_loss"].append(current_loss)
                for key in results:
                    training_progress_scores[key].append(results[key])
                report = pd.DataFrame(training_progress_scores)
                report.to_csv(os.path.join(args.output_dir,
                                           "training_progress_scores.csv"),
                              index=False)

                if args.wandb_project:
                    wandb.log(self._get_last_metrics(training_progress_scores))

                if not best_eval_metric:
                    best_eval_metric = results[args.early_stopping_metric]
                    self.save_model(args.best_model_dir,
                                    optimizer,
                                    scheduler,
                                    model=model,
                                    results=results)
                if best_eval_metric and args.early_stopping_metric_minimize:
                    if results[
                            args.
                            early_stopping_metric] - best_eval_metric < args.early_stopping_delta:
                        best_eval_metric = results[args.early_stopping_metric]
                        self.save_model(args.best_model_dir,
                                        optimizer,
                                        scheduler,
                                        model=model,
                                        results=results)
                        early_stopping_counter = 0
                    else:
                        if args.use_early_stopping and args.early_stopping_consider_epochs:
                            if early_stopping_counter < args.early_stopping_patience:
                                early_stopping_counter += 1
                                if verbose:
                                    logger.info(
                                        f" No improvement in {args.early_stopping_metric}"
                                    )
                                    logger.info(
                                        f" Current step: {early_stopping_counter}"
                                    )
                                    logger.info(
                                        f" Early stopping patience: {args.early_stopping_patience}"
                                    )
                            else:
                                if verbose:
                                    logger.info(
                                        f" Patience of {args.early_stopping_patience} steps reached"
                                    )
                                    logger.info(" Training terminated.")
                                    train_iterator.close()
                                return (
                                    global_step,
                                    tr_loss / global_step
                                    if not self.args.evaluate_during_training
                                    else training_progress_scores,
                                )
                else:
                    if results[
                            args.
                            early_stopping_metric] - best_eval_metric > args.early_stopping_delta:
                        best_eval_metric = results[args.early_stopping_metric]
                        self.save_model(args.best_model_dir,
                                        optimizer,
                                        scheduler,
                                        model=model,
                                        results=results)
                        early_stopping_counter = 0
                    else:
                        if args.use_early_stopping and args.early_stopping_consider_epochs:
                            if early_stopping_counter < args.early_stopping_patience:
                                early_stopping_counter += 1
                                if verbose:
                                    logger.info(
                                        f" No improvement in {args.early_stopping_metric}"
                                    )
                                    logger.info(
                                        f" Current step: {early_stopping_counter}"
                                    )
                                    logger.info(
                                        f" Early stopping patience: {args.early_stopping_patience}"
                                    )
                            else:
                                if verbose:
                                    logger.info(
                                        f" Patience of {args.early_stopping_patience} steps reached"
                                    )
                                    logger.info(" Training terminated.")
                                    train_iterator.close()
                                return (
                                    global_step,
                                    tr_loss / global_step
                                    if not self.args.evaluate_during_training
                                    else training_progress_scores,
                                )

        return (
            global_step,
            tr_loss / global_step if not self.args.evaluate_during_training
            else training_progress_scores,
        )
Example #4
config = wandb.config
config.dropout = 1.0
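# note: wandb.config is only meaningful after wandb.init(...) has been called,
# which is not shown in this truncated snippet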

# writer = SummaryWriter()
env = SingleObservation()
n_actions = env.action_space.shape[-1]
action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                 sigma=0.1 * np.ones(n_actions))
model = TD3("CnnPolicy",
            env,
            action_noise=action_noise,
            verbose=1,
            buffer_size=10000,
            tensorboard_log="./td3_learning_tensorboard")
model.learn(total_timesteps=10)
wandb.watch(model.policy)  # TD3 itself is not an nn.Module; watch its torch policy network

obs = env.reset()
for _ in range(10):
    print("running")
    action, _states = model.predict(obs)
    # action = env.action_space.sample()
    for t in range(10000):
        obs, rewards, done, info = env.step(action)
        if done:
            print("Episode finished after {} timesteps".format(t + 1))
            obs = env.reset()
            break

env.render()
# writer.flush()
Example #5
     if name.split('.')[1].isdigit():
         if int(name.split('.')[1]) > wbconfig.training_depth:
             param.requires_grad = True
 models[cv]._conv_head.weight.requires_grad = True
 models[cv]._bn1.weight.requires_grad = True
 models[cv]._bn1.bias.requires_grad = True
 models[cv]._fc.weight.requires_grad = True
 models[cv]._fc.bias.requires_grad = True
 # collect parameters to be trained
 params_to_update = []
 for name,param in models[cv].named_parameters():
     if param.requires_grad:
         params_to_update.append(param)
 
 '''Step 3: Build required stuff for training helper function'''
 wandb.watch(models[cv],log='all')
 models[cv].to(device)
 dataloaders = dataloaders_dict
 criterion = nn.CrossEntropyLoss(weight = class_weights).to(device)
 optimizer = optim.Adam(params_to_update,
                       lr = wbconfig.learning_rate,
                       betas=(wbconfig.betas1,wbconfig.betas2),
                       eps=wbconfig.eps,
                       amsgrad=wbconfig.amsgrad)
 num_epochs = wbconfig.num_epochs
 log_path = './MURA_Anim_Finetune/log/CV'+str(cv)+'_log.txt'
 model_save_path = './MURA_Anim_Finetune/CV'+str(cv)
 
 '''Step 4: Train model'''
 trained_model,val_acc_history,train_acc_history  = train_model(models[cv],
                                                                dataloaders_dict, 
Example #6
def main():
    best_accu = 0

    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch_size',
                        type=int,
                        default=64,
                        metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--epochs',
                        type=int,
                        default=14,
                        metavar='N',
                        help='number of epochs to train (default: 14)')
    parser.add_argument('--lr',
                        type=float,
                        default=0.8,
                        metavar='LR',
                        help='learning rate (default: 0.8)')
    parser.add_argument('--gamma',
                        type=float,
                        default=0.7,
                        metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--wandb',
                        action='store_true',
                        default=False,
                        help='For wandb logging')
    parser.add_argument('--train',
                        action='store_false',
                        default=True,
                        help='Start training')
    parser.add_argument('--val',
                        action='store_false',
                        default=True,
                        help='Start validation')
    parser.add_argument('--test',
                        action='store_false',
                        default=True,
                        help='Start testing on MNIST test set')
    parser.add_argument('--per_class',
                        action='store_false',
                        default=True,
                        help='Calculate accuracy per class')
    parser.add_argument('--saved_ckpt',
                        type=str,
                        default="./checkpoints",
                        metavar='saved_ckpt',
                        help='Path for saving the checkpoint')
    parser.add_argument('--load_ckpt',
                        type=str,
                        default="./checkpoints",
                        metavar='load_ckpt',
                        help='For loading checkpoint')
    parser.add_argument(
        '--path',
        type=str,
        default="./data",
        metavar='path',
        help='For Training the model on midas task 1 split set')

    args = parser.parse_args()

    if args.wandb:
        # wandb initialization
        print("==> wandb initialization of project")
        wandb.init(project="midas-tasks-solutions", reinit=True)
        wandb.config.update(args)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    os.makedirs(f'{args.saved_ckpt}', exist_ok=True)
    fdir = f'{args.saved_ckpt}/run_with_epochs_{args.epochs}_LR_{args.lr}'
    args.load_ckpt = fdir
    os.makedirs(fdir, exist_ok=True)

    print("==> Loading dataset")
    train_kwargs = {'batch_size': args.batch_size}
    val_kwargs = {'batch_size': args.batch_size}
    test_kwargs = {'batch_size': args.batch_size}

    if torch.cuda.is_available():
        cuda_kwargs = {'num_workers': 1, 'pin_memory': True, 'shuffle': True}
        train_kwargs.update(cuda_kwargs)

    print("==> Loading Midas dataset")
    midas_train, midas_val = midas_task1_split(args.path)

    print("==> Loading MNIST dataset")
    mnist_test = mnist_testloader()

    midas_train_loader = torch.utils.data.DataLoader(midas_train,
                                                     **train_kwargs)
    midas_val_loader = torch.utils.data.DataLoader(midas_val, **val_kwargs)
    test_loader = torch.utils.data.DataLoader(mnist_test, **test_kwargs)

    print("==> Building model...")
    midas_model = Net().to(device)
    print(midas_model)
    mnist_model = Net().to(device)

    optimizer = optim.Adadelta(midas_model.parameters(), lr=args.lr)
    criterion = nn.CrossEntropyLoss()

    scheduler = CosineAnnealingWarmupRestarts(optimizer,
                                              first_cycle_steps=args.epochs,
                                              cycle_mult=1.0,
                                              max_lr=1.0,
                                              min_lr=args.lr,
                                              warmup_steps=5,
                                              gamma=1.0)
    if args.wandb:
        wandb.watch(midas_model)
        wandb.watch(mnist_model)

    print(f"==> Starting Learning Rate {args.lr}")
    for epoch in range(1, args.epochs + 1):
        print(f"==> Epoch {epoch}/{args.epochs}")

        if args.train:
            print("==> Model training started")
            train(args, midas_model, device, midas_train_loader, optimizer,
                  epoch, criterion)

        if args.val:
            print("==> Evaluating midas model on midas val")
            midas_accu = val_midas(args, midas_model, device, midas_val_loader,
                                   epoch, criterion)

            print(f"==> Saving model checkpoint at {fdir}")
            is_best = midas_accu > best_accu
            best_accu = max(midas_accu, best_accu)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': midas_model.state_dict(),
                    'best_accu': best_accu,
                    'optimizer': optimizer.state_dict(),
                }, is_best, fdir)

        if args.test:
            print("==> Loading model checkpoint")
            load_ckpt(mnist_model, args.load_ckpt)
            mnist_model.fc2.out_features = 10
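            # caveat: reassigning .out_features only changes the attribute; it does not
            # resize fc2's weight matrix, so the layer must be replaced (e.g. with a new
            # nn.Linear) to actually change the number of output logits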
            print(mnist_model)

            print("==> Testing model on mnist")
            mnist_accu = test(args, mnist_model, device, test_loader, epoch,
                              criterion)

        if args.per_class:
            print("==> Accuracy per class on midas val")
            accuracy_per_class(args, midas_model, device, midas_val_loader,
                               epoch, midas_val.classes)

        scheduler.step()
        print("Lr after scheduler = ", optimizer.param_groups[0]['lr'])

    print(f"Best accuracy on testing set = {best_accu}")
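
The save_checkpoint(state, is_best, fdir) helper used in the validation branch above is not shown in this snippet; the sketch below is an assumption about its behaviour rather than the project's actual code: it persists the latest state and keeps a separate copy of the best one.

import os
import shutil

import torch


def save_checkpoint(state, is_best, fdir, filename='checkpoint.pth.tar'):
    # hypothetical sketch: save the latest training state and copy it aside
    # whenever it is the best seen so far
    path = os.path.join(fdir, filename)
    torch.save(state, path)
    if is_best:
        shutil.copyfile(path, os.path.join(fdir, 'model_best.pth.tar'))
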
Example #7
    drop_last=False,
    collate_fn=data_maker.generate_batch,
)

# load the model
encoder = BertModel.from_pretrained(config["bert_path"])
hidden_size = encoder.config.hidden_size

ent_extractor = TPLinkerPlusBert(encoder, tag_size,
                                 hyper_parameters["shaking_type"],
                                 hyper_parameters["inner_enc_type"],
                                 hyper_parameters["tok_pair_sample_rate"])

ent_extractor = ent_extractor.to(device)
if config["logger"] == "wandb":
    wandb.watch(ent_extractor)

# load the loss function
metrics = MetricsCalculator(handshaking_tagger)
loss_func = lambda y_pred, y_true: metrics.loss_func(
    y_pred, y_true, ghm=hyper_parameters["ghm"])
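# the lambda pins the ghm flag from the config so callers can simply invoke
# loss_func(y_pred, y_true) at every training or validation step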


# train step
def train_step(batch_train_data, optimizer):
    sample_list, batch_input_ids, \
    batch_attention_mask, batch_token_type_ids, \
    tok2char_span_list, batch_shaking_tag = batch_train_data

    batch_input_ids, \
    batch_attention_mask, \
Example #8
    def train(self,
              output_dir,
              train_batch_size,
              gradient_accumulation_steps,
              seed,
              epochs,
              data_path,
              pretrained_path,
              valid_path=None,
              no_cuda=False,
              dropout=0.3,
              weight_decay=0.01,
              warmup_proportion=0.1,
              learning_rate=5e-5,
              adam_epsilon=1e-8,
              max_seq_length=128,
              squeeze=True,
              max_grad_norm=1.0,
              eval_batch_size=32,
              epoch_save_model=False,
              model_name='XLMR',
              embedding_path=None,
              split_train_data=False,
              data_divider=0.6,
              wandb=None,
              save=True,
              logger=None,
              json_dataset=False,
              label_file=None,
              xlm_dataset=False,
              div=None,
              div_2=None,
              motherfile=False,
              multi_source_labels=False,
              device=0):
        epoch_times = []
        if wandb:
            import wandb
            print(wandb)
            wandb.init(project='ABOM-PolEmo',
                       config={
                           "epochs": epochs,
                           "language_model": pretrained_path,
                           "batch_size": train_batch_size,
                           "max_seq_length": max_seq_length,
                           "warmup_proportion": warmup_proportion,
                           "learning_rate": learning_rate,
                           "gradient_accumulation_steps":
                           gradient_accumulation_steps,
                           "squeeze": squeeze,
                           "dropout": dropout,
                           "output_dit": output_dir
                       })
        if save and os.path.exists(output_dir) and os.listdir(output_dir):
            raise ValueError(
                "Output directory (%s) already exists and is not empty." %
                output_dir)

        if save and not os.path.exists(output_dir):
            os.makedirs(output_dir)
        if not logger:
            logging.basicConfig(
                format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                datefmt='%m/%d/%Y %H:%M:%S',
                level=logging.INFO,
                filename=os.path.join(output_dir, "log.txt"))
            logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))

            logger = logging.getLogger(__name__)

        if gradient_accumulation_steps < 1:
            raise ValueError(
                "Invalid gradient_accumulation_steps parameter: %d, should be >= 1"
                % gradient_accumulation_steps)

        train_batch_size = train_batch_size // gradient_accumulation_steps

        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)

        if split_train_data:
            if json_dataset:
                examples, label_list = get_examples_from_json(data_path)
            elif motherfile:
                examples, label_list = get_examples_from_motherfile(data_path)
            elif xlm_dataset:
                examples, label_list = get_examples_from_xml(data_path)
            else:
                examples, label_list = get_examples(data_path, 'train')
            random.shuffle(examples)
            train_examples = examples[0:int(len(examples) * data_divider)]
            val_examples = examples[int(len(examples) * data_divider):]
            eval_examples = examples[(
                int(len(examples) * data_divider) +
                int(len(examples) * ((1 - data_divider) / 2))):]
        else:
            train_examples = None
            val_examples = None
            if json_dataset:
                examples, label_list = get_examples_from_json(data_path)
            elif motherfile:
                train_examples, train_label_list = get_examples_from_motherfile(
                    data_path, 'train')
                val_examples, val_label_list = get_examples_from_motherfile(
                    data_path, 'test')
                train_label_list.extend(val_label_list)
                label_list = list(set(train_label_list))
            elif xlm_dataset:
                examples, label_list = get_examples_from_xml(data_path)
            else:
                train_examples, label_list = get_examples(data_path, 'train')
        logger.info("\nDATA SIZE\n")
        logger.info("\nTrain  = %d\n" % len(train_examples))
        logger.info("\nVal  = %d\n" % len(val_examples))

        num_train_optimization_steps = 0
        num_labels = len(label_list) + 1
        num_train_optimization_steps = int(
            len(train_examples) / train_batch_size /
            gradient_accumulation_steps) * epochs

        hidden_size = 300 if pretrained_path == None else 768 if 'base' in pretrained_path else 1024
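        # (heuristic: 300 matches static word embeddings, 768 a *-base transformer
        # checkpoint, 1024 a *-large one)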
        device = 'cuda:0' if (torch.cuda.is_available()
                              and not no_cuda) else 'cpu'
        logger.info(device)
        if model_name == 'HERBERT':
            model = AutoTokenizerForTokenClassification(
                pretrained_path=pretrained_path,
                n_labels=num_labels,
                hidden_size=hidden_size,
                dropout_p=dropout,
                device=device)
        elif model_name == 'BERT_MULTILINGUAL':
            model = BertBaseMultilingualCased(pretrained_path=pretrained_path,
                                              n_labels=num_labels,
                                              hidden_size=hidden_size,
                                              dropout_p=dropout,
                                              device=device)
        elif model_name == 'REFORMER':
            model = Reformer(n_labels=num_labels,
                             hidden_size=512,
                             dropout=dropout,
                             device=device,
                             max_seq_length=max_seq_length,
                             batch_size=train_batch_size)
        elif model_name == 'POLISH_ROBERTA':
            model = PolishRoberta(pretrained_path=pretrained_path,
                                  n_labels=num_labels,
                                  hidden_size=hidden_size,
                                  dropout_p=dropout,
                                  device=device)
        else:
            model = XLMRForTokenClassification(pretrained_path=pretrained_path,
                                               n_labels=num_labels,
                                               hidden_size=hidden_size,
                                               dropout=dropout,
                                               device=device)

        model.to(device)
        if wandb:
            wandb.watch(model)
        no_decay = ['bias', 'final_layer_norm.weight']

        params = list(model.named_parameters())

        optimizer_grouped_parameters = [{
            'params':
            [p for n, p in params if not any(nd in n for nd in no_decay)],
            'weight_decay':
            weight_decay
        }, {
            'params':
            [p for n, p in params if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]

        warmup_steps = int(warmup_proportion * num_train_optimization_steps)
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=learning_rate,
                          eps=adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=warmup_steps,
                                         t_total=num_train_optimization_steps)

        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      max_seq_length,
                                                      model.encode_word)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        train_data = create_dataset(train_features)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=train_batch_size)
        if not split_train_data and not val_examples and not motherfile:
            val_examples, _ = get_examples(valid_path, 'valid')
        val_features = convert_examples_to_features(val_examples, label_list,
                                                    max_seq_length,
                                                    model.encode_word)

        val_data = create_dataset(val_features)

        best_val_f1 = 0.0
        best_precision = 0.0
        best_recall = 0.0
        for epoch_no in range(1, epochs + 1):
            start = timer()
            epoch_stats = {"epoch": epoch_no}
            logger.info("Epoch %d" % epoch_no)
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0

            model.train()
            steps = len(train_dataloader)
            for step, batch in enumerate(train_dataloader):
                batch = tuple(t.to(device) for t in batch)
                input_ids, label_ids, l_mask, valid_ids, = batch
                loss = model(input_ids, label_ids, l_mask, valid_ids)
                if gradient_accumulation_steps > 1:
                    loss = loss / gradient_accumulation_steps

                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               max_grad_norm)
                tr_loss += loss.item()
                epoch_stats["loss"] = loss
                if wandb:
                    wandb.log({"loss": loss})
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if step % 5 == 0:
                    logger.info('Step = %d/%d; Loss = %.4f' %
                                (step + 1, steps, tr_loss / (step + 1)))
                if (step + 1) % gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()
                    model.zero_grad()
                del batch
            logger.info("\nTesting on validation set...")
            f1, report, entity_scores, precision, recall = evaluate_model(
                model, val_data, label_list, eval_batch_size, device)
            epoch_stats["validation_F1"] = f1
            print(report)
            if f1 > best_val_f1:
                best_val_f1 = f1
                best_precision = precision
                best_recall = recall
                logger.info(
                    "\nFound better f1=%.4f on validation set. Saving model\n"
                    % f1)
                logger.info("%s\n" % report)
                if save:
                    torch.save(model.state_dict(),
                               os.path.join(output_dir, 'model.pt'))
                    save_params(output_dir, dropout, num_labels, label_list)

            if save and epoch_save_model:
                epoch_output_dir = os.path.join(output_dir, "e%03d" % epoch_no)
                os.makedirs(epoch_output_dir)
                torch.save(model.state_dict(),
                           os.path.join(epoch_output_dir, 'model.pt'))
                save_params(epoch_output_dir, dropout, num_labels,
                            label_list)
            if wandb:
                wandb.log(epoch_stats)
            epoch_times.append(timer() - start)
        model.cpu()
        del model, logger
        torch.cuda.empty_cache()
        print("Avg. epoch time")
        print(np.mean(epoch_times, axis=0))
        print(max_seq_length)
        return best_val_f1, entity_scores, best_precision, epoch_times, best_recall
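
The grouped-parameter setup above (no weight decay for bias and LayerNorm weights, AdamW, linear warmup) recurs in several of these examples. A minimal sketch of the same setup in isolation, assuming `transformers` is installed and using its `get_linear_schedule_with_warmup` (the `WarmupLinearSchedule` class used above comes from older `pytorch-transformers` releases); all names and defaults here are illustrative:

from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup


def build_optimizer_and_scheduler(model, lr=5e-5, eps=1e-8, weight_decay=0.01,
                                  warmup_steps=0, total_steps=1000):
    # Bias terms and LayerNorm weights are excluded from weight decay.
    no_decay = ("bias", "LayerNorm.weight")
    grouped_parameters = [
        {"params": [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         "weight_decay": weight_decay},
        {"params": [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         "weight_decay": 0.0},
    ]
    optimizer = AdamW(grouped_parameters, lr=lr, eps=eps)
    # Linear warmup for warmup_steps, then linear decay to zero at total_steps.
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)
    return optimizer, scheduler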
Example #9
    def train(
        self,
        train_dataset,
        output_dir,
        multi_label=False,
        show_running_loss=True,
        eval_df=None,
        test_df=None,
        **kwargs,
    ):
        """
        Trains the model on train_dataset.

        Utility function to be used by the train_model() method. Not intended to be used directly.
        """

        device = self.device
        model = self.model
        args = self.args

        tb_writer = SummaryWriter(logdir=args["tensorboard_dir"])
        train_sampler = RandomSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset,
                                      sampler=train_sampler,
                                      batch_size=args["train_batch_size"])

        t_total = (len(train_dataloader) //
                   args["gradient_accumulation_steps"] *
                   args["num_train_epochs"])

        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                args["weight_decay"],
            },
            {
                "params": [
                    p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.0,
            },
        ]

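        # warmup_ratio only applies when warmup_steps is left at 0; an explicit
        # warmup_steps value takes precedence.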
        warmup_steps = math.ceil(t_total * args["warmup_ratio"])
        args["warmup_steps"] = (warmup_steps if args["warmup_steps"] == 0 else
                                args["warmup_steps"])

        optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=args["learning_rate"],
            eps=args["adam_epsilon"],
        )
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=args["warmup_steps"],
            num_training_steps=t_total)

        if args["fp16"]:
            try:
                from apex import amp
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
                )

            model, optimizer = amp.initialize(model,
                                              optimizer,
                                              opt_level=args["fp16_opt_level"])

        if args["n_gpu"] > 1:
            model = torch.nn.DataParallel(model)

        global_step = 0
        tr_loss, logging_loss = 0.0, 0.0
        model.zero_grad()
        train_iterator = trange(int(args["num_train_epochs"]),
                                desc="Epoch",
                                disable=args["silent"])
        epoch_number = 0
        if args["evaluate_during_training"]:
            extra_metrics = {key: [] for key in kwargs}
            if multi_label:
                training_progress_scores = {
                    "global_step": [],
                    "LRAP": [],
                    "train_loss": [],
                    "eval_loss": [],
                    **extra_metrics,
                }
            else:
                if self.model.num_labels == 2:
                    training_progress_scores = {
                        "global_step": [],
                        "tp": [],
                        "tn": [],
                        "fp": [],
                        "fn": [],
                        "mcc": [],
                        "train_loss": [],
                        "eval_loss": [],
                        **extra_metrics,
                    }
                elif self.model.num_labels == 1:
                    training_progress_scores = {
                        "global_step": [],
                        "train_loss": [],
                        "eval_loss": [],
                        **extra_metrics,
                    }
                else:
                    training_progress_scores = {
                        "global_step": [],
                        "mcc": [],
                        "train_loss": [],
                        "eval_loss": [],
                        **extra_metrics,
                    }

        if args["wandb_project"]:
            wandb.init(project=args["wandb_project"],
                       config={**args},
                       **args["wandb_kwargs"])
            wandb.watch(self.model)

        if args["faq_evaluate_during_training"]:
            write_progress_to_csv(output_dir,
                                  'train_log.csv',
                                  write_header=True)

        model.train()
        for _ in train_iterator:
            train_start = time.time()
            # epoch_iterator = tqdm(train_dataloader, desc="Iteration")
            for step, batch in enumerate(
                    tqdm(train_dataloader,
                         desc="Current iteration",
                         disable=args["silent"])):
                batch = tuple(t.to(device) for t in batch)

                inputs = self._get_inputs_dict(batch)
                outputs = model(**inputs)
                # model outputs are always tuple in pytorch-transformers (see doc)
                loss = outputs[0]

                if args["n_gpu"] > 1:
                    loss = (
                        loss.mean()
                    )  # mean() to average on multi-gpu parallel training

                current_loss = loss.item()

                if show_running_loss:
                    print("\rRunning loss: %f" % loss, end="")

                if args["gradient_accumulation_steps"] > 1:
                    loss = loss / args["gradient_accumulation_steps"]

                if args["fp16"]:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args["max_grad_norm"])
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args["max_grad_norm"])

                tr_loss += loss.item()
                if (step + 1) % args["gradient_accumulation_steps"] == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

                    if (args["logging_steps"] > 0
                            and global_step % args["logging_steps"] == 0):
                        # Log metrics
                        tb_writer.add_scalar("lr",
                                             scheduler.get_lr()[0],
                                             global_step)
                        tb_writer.add_scalar(
                            "loss",
                            (tr_loss - logging_loss) / args["logging_steps"],
                            global_step,
                        )
                        logging_loss = tr_loss
                        if args["wandb_project"]:
                            wandb.log({
                                "Training loss": current_loss,
                                "lr": scheduler.get_lr()[0],
                                "global_step": global_step,
                            })

                    if args["save_steps"] > 0 and global_step % args[
                            "save_steps"] == 0:
                        # Save model checkpoint
                        output_dir_current = os.path.join(
                            output_dir, "checkpoint-{}".format(global_step))

                        if not os.path.exists(output_dir_current):
                            os.makedirs(output_dir_current)

                        # Take care of distributed/parallel training
                        model_to_save = (model.module if hasattr(
                            model, "module") else model)
                        model_to_save.save_pretrained(output_dir_current)
                        self.tokenizer.save_pretrained(output_dir_current)

                    if args["evaluate_during_training"] and (
                            args["evaluate_during_training_steps"] > 0
                            and global_step %
                            args["evaluate_during_training_steps"] == 0):
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results, _, _ = self.eval_model(eval_df,
                                                        verbose=True,
                                                        **kwargs)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value,
                                                 global_step)

                        output_dir_current = os.path.join(
                            output_dir, "checkpoint-{}".format(global_step))

                        if not os.path.exists(output_dir_current):
                            os.makedirs(output_dir_current)

                        if args["save_eval_checkpoints"]:
                            model_to_save = (model.module if hasattr(
                                model, "module") else model)
                            model_to_save.save_pretrained(output_dir_current)
                            self.tokenizer.save_pretrained(output_dir_current)

                        output_eval_file = os.path.join(
                            output_dir_current, "eval_results.txt")
                        with open(output_eval_file, "w") as writer:
                            for key in sorted(results.keys()):
                                writer.write("{} = {}\n".format(
                                    key, str(results[key])))

                        training_progress_scores["global_step"].append(
                            global_step)
                        training_progress_scores["train_loss"].append(
                            current_loss)
                        for key in results:
                            training_progress_scores[key].append(results[key])
                        report = pd.DataFrame(training_progress_scores)
                        report.to_csv(
                            args["output_dir"] +
                            "training_progress_scores.csv",
                            index=False,
                        )

                        if args["wandb_project"]:
                            wandb.log(
                                self._get_last_metrics(
                                    training_progress_scores))

            epoch_number += 1
            train_time = datetime.timedelta(seconds=int(time.time() -
                                                        train_start))

            save_start = time.time()
            output_dir_current = os.path.join(
                output_dir,
                "checkpoint-{}-epoch-{}".format(global_step, epoch_number))
            if (args["save_model_every_epoch"]
                    or args["evaluate_during_training"]
                ) and not os.path.exists(output_dir_current):
                os.makedirs(output_dir_current)

            if args["save_model_every_epoch"]:

                model_to_save = model.module if hasattr(model,
                                                        "module") else model
                model_to_save.save_pretrained(output_dir_current)
                self.tokenizer.save_pretrained(output_dir_current)
            save_time = datetime.timedelta(seconds=int(time.time() -
                                                       save_start))

            eval_start = time.time()
            if args["evaluate_during_training"]:
                results, _, _ = self.eval_model(eval_df,
                                                verbose=True,
                                                **kwargs)

                output_eval_file = os.path.join(output_dir_current,
                                                "eval_results.txt")
                with open(output_eval_file, "w") as writer:
                    for key in sorted(results.keys()):
                        writer.write("{} = {}\n".format(
                            key, str(results[key])))

                training_progress_scores["global_step"].append(global_step)
                training_progress_scores["train_loss"].append(current_loss)
                for key in results:
                    training_progress_scores[key].append(results[key])
                report = pd.DataFrame(training_progress_scores)
                report.to_csv(args["output_dir"] +
                              "training_progress_scores.csv",
                              index=False)
            if args["faq_evaluate_during_training"]:
                records = {
                    'epoch':
                    epoch_number,
                    'ckpt':
                    "checkpoint-{}-epoch-{}".format(global_step, epoch_number)
                }
                if eval_df is not None:
                    eval_metrics, _, _ = faq_evaluate(self, eval_df)
                    print_metrics(eval_metrics)
                    records.update({('dev-' + k): v
                                    for k, v in eval_metrics.items()})
                if test_df is not None:
                    test_metrics, _, _ = faq_evaluate(self, test_df)
                    print_metrics(test_metrics)
                    records.update({('test-' + k): v
                                    for k, v in test_metrics.items()})
                write_progress_to_csv(output_dir,
                                      'train_log.csv',
                                      metrics=records)

            eval_time = datetime.timedelta(seconds=int(time.time() -
                                                       eval_start))

            print(
                f'Finished epoch {epoch_number} [train {train_time}, save {save_time}, eval {eval_time}]'
            )

        return global_step, tr_loss / global_step
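
Both of the training loops above step the optimizer only every `gradient_accumulation_steps` mini-batches. A minimal sketch of just that pattern, assuming the model returns a scalar loss (all names here are illustrative):

import torch


def train_one_epoch(model, dataloader, optimizer, scheduler, device,
                    accumulation_steps=4, max_grad_norm=1.0):
    model.train()
    model.zero_grad()
    total_loss = 0.0
    for step, batch in enumerate(dataloader):
        batch = tuple(t.to(device) for t in batch)
        loss = model(*batch)              # assumed to return a scalar loss
        loss = loss / accumulation_steps  # average over the effective batch
        loss.backward()
        total_loss += loss.item()
        if (step + 1) % accumulation_steps == 0:
            # Clip once all gradients for the effective batch are in place.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()
            model.zero_grad()
    return total_loss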
Example #10
 def watch(self, model):
     wandb.watch(model)
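
The wrapper above is the bare minimum; `wandb.watch` also accepts a `log` mode and a `log_freq`, as used in some of the other examples here (a small sketch, check the argument names against your installed wandb version):

import wandb


def watch(model):
    # Log gradients and parameter histograms every 100 steps instead of the defaults.
    wandb.watch(model, log="all", log_freq=100)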
Example #11
def train_and_eval(args, recon_args, recon_model):
    """
    Wrapper for training and evaluation of policy model.

    :param args: Argument object, containing hyperparameters for training and evaluation.
    :param recon_args: reconstruction model arguments.
    :param recon_model: reconstruction model.
    """
    if args.resume:
        # Check that this works
        resumed = True
        new_run_dir = args.policy_model_checkpoint.parent
        data_path = args.data_path
        # In case models have been moved to a different machine, make sure the path to the recon model is the
        # path provided.
        recon_model_checkpoint = args.recon_model_checkpoint

        model, args, start_epoch, optimiser = load_policy_model(pathlib.Path(
            args.policy_model_checkpoint),
                                                                optim=True)

        args.old_run_dir = args.run_dir
        args.old_recon_model_checkpoint = args.recon_model_checkpoint
        args.old_data_path = args.data_path

        args.recon_model_checkpoint = recon_model_checkpoint
        args.run_dir = new_run_dir
        args.data_path = data_path
        args.resume = True
    else:
        resumed = False
        # Improvement model to train
        model = build_policy_model(args)
        # Add mask parameters for training
        args = add_mask_params(args)
        if args.data_parallel:
            model = torch.nn.DataParallel(model)
        optimiser = build_optim(args, model.parameters())
        start_epoch = 0
        # Create directory to store results in
        savestr = '{}_res{}_al{}_accel{}_k{}_{}_{}'.format(
            args.dataset, args.resolution, args.acquisition_steps,
            args.accelerations, args.num_trajectories,
            datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S"),
            ''.join(choice(ascii_uppercase) for _ in range(5)))
        args.run_dir = args.exp_dir / savestr
        args.run_dir.mkdir(parents=True, exist_ok=False)

    args.resumed = resumed

    if args.wandb:
        allow_val_change = args.resumed  # only allow changes if resumed: otherwise something is wrong.
        wandb.config.update(args, allow_val_change=allow_val_change)
        wandb.watch(model, log='all')

    # Logging
    logging.info(recon_model)
    logging.info(model)
    # Save arguments for bookkeeping
    args_dict = {
        key: str(value)
        for key, value in args.__dict__.items()
        if not key.startswith('__') and not callable(value)
    }
    save_json(args.run_dir / 'args.json', args_dict)

    # Initialise summary writer
    writer = SummaryWriter(log_dir=args.run_dir / 'summary')

    # Parameter counting
    logging.info(
        'Reconstruction model parameters: total {}, of which {} trainable and {} untrainable'
        .format(count_parameters(recon_model),
                count_trainable_parameters(recon_model),
                count_untrainable_parameters(recon_model)))
    logging.info(
        'Policy model parameters: total {}, of which {} trainable and {} untrainable'
        .format(count_parameters(model), count_trainable_parameters(model),
                count_untrainable_parameters(model)))

    if args.scheduler_type == 'step':
        scheduler = torch.optim.lr_scheduler.StepLR(optimiser,
                                                    args.lr_step_size,
                                                    args.lr_gamma)
    elif args.scheduler_type == 'multistep':
        if not isinstance(args.lr_multi_step_size, list):
            args.lr_multi_step_size = [args.lr_multi_step_size]
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimiser, args.lr_multi_step_size, args.lr_gamma)
    else:
        raise ValueError(
            "{} is not a valid scheduler choice ('step', 'multistep')".format(
                args.scheduler_type))

    # Create data loaders
    train_loader = create_data_loader(args, 'train', shuffle=True)
    dev_loader = create_data_loader(args, 'val', shuffle=False)

    train_data_range_dict = create_data_range_dict(args, train_loader)
    dev_data_range_dict = create_data_range_dict(args, dev_loader)

    if not args.resume:
        if args.do_train_ssim:
            do_and_log_evaluation(args, -1, recon_model, model, train_loader,
                                  writer, 'Train', train_data_range_dict)
        do_and_log_evaluation(args, -1, recon_model, model, dev_loader, writer,
                              'Val', dev_data_range_dict)

    for epoch in range(start_epoch, args.num_epochs):
        train_loss, train_time = train_epoch(args, epoch, recon_model, model,
                                             train_loader, optimiser, writer,
                                             train_data_range_dict)
        logging.info(
            f'Epoch = [{epoch+1:3d}/{args.num_epochs:3d}] TrainLoss = {train_loss:.3g} TrainTime = {train_time:.2f}s '
        )

        if args.do_train_ssim:
            do_and_log_evaluation(args, epoch, recon_model, model,
                                  train_loader, writer, 'Train',
                                  train_data_range_dict)
        do_and_log_evaluation(args, epoch, recon_model, model, dev_loader,
                              writer, 'Val', dev_data_range_dict)

        scheduler.step()
        save_policy_model(args, args.run_dir, epoch, model, optimiser)
    writer.close()
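
A compact sketch of the scheduler selection used in train_and_eval above, pulled out of the rest of the setup (the defaults shown are illustrative):

import torch


def build_scheduler(optimiser, scheduler_type, lr_step_size=40,
                    lr_multi_step_size=(20, 40), lr_gamma=0.1):
    if scheduler_type == 'step':
        # Multiply the LR by lr_gamma every lr_step_size epochs.
        return torch.optim.lr_scheduler.StepLR(optimiser, lr_step_size, lr_gamma)
    if scheduler_type == 'multistep':
        # Multiply the LR by lr_gamma at each milestone epoch.
        return torch.optim.lr_scheduler.MultiStepLR(
            optimiser, list(lr_multi_step_size), lr_gamma)
    raise ValueError(
        "{} is not a valid scheduler choice ('step', 'multistep')".format(
            scheduler_type))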
Example #12
def main(
    num_epochs=50,
    batch_size=64,
    D=18,
    N=50000,
    w_lr=1e-4,
    w_momentum=0.9,
    w_weight_decay=0,
    a_lr=3e-4,
    a_momentum=0.9,
    a_weight_decay=0,
    T=10,
    grad_clip=1,
    logging_freq=200,
    w_checkpoint_freq=1,
    max_order_y=7,
    noise_var=0.25,
    featurize_type="fourier",
    initial_degree=100,
    hvp="finite_diff",
    arch_train_data="val",
    normalize_a_lr=True,
    w_warm_start=0,
    extra_weight_decay=0.5,
    grad_inner_loop_order=-1,
    grad_outer_loop_order=-1,
):
    config = locals()

    wandb_auth()
    wandb.init(project="NAS", group=f"Linear_SOTL", config=config)

    ### MODEL INIT
    # x, y = data_generator(N, max_order_generated=D, max_order_y=[(5,7), (9,13)], noise_var=0.25, featurize_type='fourier')
    # x, y = get_datasets("songs")

    dset_train, dset_val = get_datasets(name="MNIST",
                                        data_size=N,
                                        max_order_generated=D,
                                        max_order_y=max_order_y,
                                        noise_var=noise_var,
                                        featurize_type=featurize_type)

    model = SoTLNet(num_features=int(len(dset_train[0][0])),
                    layer_type="MNIST",
                    degree=-1,
                    weight_decay=extra_weight_decay)

    criterion = get_criterion(model_type)
    w_optimizer = SGD(model.weight_params(),
                      lr=w_lr,
                      momentum=w_momentum,
                      weight_decay=w_weight_decay)
    a_optimizer = SGD(model.arch_params(),
                      lr=a_lr,
                      momentum=a_momentum,
                      weight_decay=a_weight_decay)

    wandb.watch(model, log="all")
    train_bptt(num_epochs=num_epochs,
               model=model,
               criterion=criterion,
               w_optimizer=w_optimizer,
               a_optimizer=a_optimizer,
               dset_train=dset_train,
               dset_val=dset_val,
               logging_freq=logging_freq,
               batch_size=batch_size,
               T=T,
               grad_clip=grad_clip,
               w_lr=w_lr,
               w_checkpoint_freq=w_checkpoint_freq,
               grad_inner_loop_order=grad_inner_loop_order,
               grad_outer_loop_order=grad_outer_loop_order,
               hvp=hvp,
               arch_train_data=arch_train_data,
               normalize_a_lr=normalize_a_lr,
               log_grad_norm=True,
               log_alphas=True,
               w_warm_start=w_warm_start,
               extra_weight_decay=extra_weight_decay)
    # train_normal(num_epochs=num_epochs, model=model, dset_train=dset_train,
    #     logging_freq=logging_freq, batch_size=batch_size, grad_clip=grad_clip, optim="sgd")

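    # NOTE: x and y are only defined by the data_generator call commented out
    # above, so this lstsq comparison will fail as the example is written.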
    lapack_solution, res, eff_rank, sing_values = scipy.linalg.lstsq(x, y)
    print(f"Cond number:{abs(sing_values.max()/sing_values.min())}")

    val_meter = valid_func(model=model, dset_val=dset_val, criterion=criterion)

    model.fc1.weight = torch.nn.Parameter(torch.tensor(lapack_solution))

    val_meter2 = valid_func(model=model,
                            dset_val=dset_val,
                            criterion=criterion)

    print(
        f"Trained val loss: {val_meter.avg}, SciPy solver val loss: {val_meter2.avg}, difference: {val_meter.avg - val_meter2.avg} (ie. {(val_meter.avg/val_meter2.avg-1)*100}% more)"
    )

    true_degree = max_order_y / 2
    trained_degree = model.fc1.alphas.item()
    print(
        f"True degree: {true_degree}, trained degree: {trained_degree}, difference: {abs(true_degree - trained_degree)}"
    )
    wandb.run.summary["degree_mismatch"] = abs(true_degree - trained_degree)
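
Example #12 snapshots its keyword arguments with `locals()` and hands them to wandb as the run config, then writes a scalar into `wandb.run.summary` at the end. A minimal sketch of that pattern (the project name is hypothetical):

import wandb


def main(num_epochs=50, batch_size=64, w_lr=1e-4):
    # Capture the hyperparameters before any other locals are defined.
    config = locals()
    wandb.init(project="my-project", config=config)  # hypothetical project name
    # ... training would run here ...
    wandb.run.summary["num_epochs"] = config["num_epochs"]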
Example #13
def train_detector(args):

    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    trainset = KittiDataset(args.data_dir, args.seq, args.npoints)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=args.batch_size, \
        shuffle=True, num_workers=args.num_workers, drop_last=True)

    model = Detector(args)
    model = model.cuda()

    if args.use_wandb:
        wandb.watch(model)

    chamfer_criterion = ChamferLoss()
    point_criterion = Point2PointLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=0.9)
    best_epoch_loss = float("inf")

    for epoch in range(args.epoch):
        torch.cuda.empty_cache()
        model.train()
        epoch_loss = 0
        epoch_chamfer_loss = 0
        epoch_point_loss = 0
        count = 0
        pbar = tqdm(enumerate(trainloader))
        for i, data in pbar:
            src_pc, src_sn, dst_pc, dst_sn, T = data
            src = torch.cat((src_pc, src_sn), dim=-1)
            dst = torch.cat((dst_pc, dst_sn), dim=-1)
            src = src.cuda()
            dst = dst.cuda()
            src_pc = src_pc.cuda()
            dst_pc = dst_pc.cuda()
            T = T.cuda()
            R = T[:, :3, :3].contiguous()
            t = T[:, :3, 3].unsqueeze(1).contiguous()

            src_kp, src_sigma, _, _ = model(src)
            dst_kp, dst_sigma, _, _ = model(dst)

            src_kp_trans = (torch.matmul(R, src_kp).permute(0, 2, 1) +
                            t).permute(0, 2, 1).contiguous()
            chamfer_loss = chamfer_criterion(src_kp_trans, dst_kp, src_sigma,
                                             dst_sigma)
            point_loss = point_criterion(
                src_kp,
                src_pc.permute(0, 2, 1).contiguous()) + point_criterion(
                    dst_kp,
                    dst_pc.permute(0, 2, 1).contiguous())
            loss = chamfer_loss + args.alpha * point_loss

            epoch_loss = epoch_loss + float(loss)
            epoch_chamfer_loss = epoch_chamfer_loss + float(chamfer_loss)
            epoch_point_loss = epoch_point_loss + float(point_loss)
            count += 1

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if i % 100 == 0:
                pbar.set_description(
                    'Train Epoch:{}[{}/{}({:.0f}%)]\tLoss: {:.6f}'.format(
                        epoch + 1, i, len(trainloader),
                        100. * i / len(trainloader), loss.item()))

        epoch_loss = epoch_loss / count
        epoch_chamfer_loss = epoch_chamfer_loss / count
        epoch_point_loss = epoch_point_loss / count
        print('Epoch {} finished. Loss: {:.3f} Chamfer loss: {:.3f} Point loss: {:.3f}'.\
            format(epoch+1, epoch_loss, epoch_chamfer_loss, epoch_point_loss))

        if args.use_wandb:
            wandb.log({
                "loss": epoch_loss,
                "chamfer loss": epoch_chamfer_loss,
                "point loss": epoch_point_loss
            })

        if not os.path.exists(args.ckpt_dir):
            os.makedirs(args.ckpt_dir)
        if epoch_loss < best_epoch_loss:
            best_epoch_loss = epoch_loss
            torch.save(model.state_dict(),
                       os.path.join(args.ckpt_dir, 'best_detector.pth'))
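
The track-the-best checkpointing above can be factored into a small helper; a sketch under the assumption that creating the checkpoint directory with exist_ok=True is acceptable:

import os
import torch


def save_if_best(model, epoch_loss, best_loss, ckpt_dir, name='best_detector.pth'):
    # Persist the weights only when the epoch loss improves; return the new best.
    os.makedirs(ckpt_dir, exist_ok=True)
    if epoch_loss < best_loss:
        torch.save(model.state_dict(), os.path.join(ckpt_dir, name))
        best_loss = epoch_loss
    return best_loss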
Example #14
    def run_train(self, train_data, dev_data):
        self.print_model_parameters()

        import wandb
        wandb.init(project='smore-{}-group-{}-final'.format(
            self.args.dataset_name,
            get_no_join_tag(self.args, separator_in_front=True)),
                   group=get_wandb_group(self.args),
                   name=get_wandb_tag(self.args))
        os.environ["WANDB_RUN_GROUP"] = get_wandb_group(self.args)
        wandb.watch(self)

        if self.args.augment_with_wikisql:
            train_data_, train_data_augment = [], []
            for example in train_data:
                if example.dataset_id == WIKISQL:
                    train_data_augment.append(example)
                else:
                    train_data_.append(example)
            train_data = train_data_
            train_batch_size = round(self.train_batch_size * 0.7)
            train_augment_batch_size = self.train_batch_size - train_batch_size

            dev_data_, dev_data_augment = [], []
            for example in dev_data:
                if example.dataset_id == WIKISQL:
                    dev_data_augment.append(example)
                else:
                    dev_data_.append(example)
            dev_data = dev_data_
            print('**************************')
            print('{} training examples'.format(len(train_data)))
            print('{} augmented training examples'.format(
                len(train_data_augment)))
            print('train batch size = {}'.format(train_batch_size))
            print('train augment batch size = {}'.format(
                train_augment_batch_size))
            print('{} dev examples'.format(len(dev_data)))
            print('{} augmented dev examples'.format(len(dev_data_augment)))
            print('**************************')
        else:
            train_batch_size = self.train_batch_size
            train_augment_batch_size = 0

        # Track training losses dev metrics changes
        ############################
        epoch_losses = []
        best_dev_metrics = 0
        dev_metrics_history = []
        ############################

        all_train_data = copy.deepcopy(train_data)
        # Curriculum learning (start from easy category)
        if self.args.curriculum_interval > 0:
            # assert(self.args.curriculum_interval % self.args.num_peek_steps == 0)
            train_data = [
                exp for exp in all_train_data
                if exp.hardness in ['easy', 'medium']
            ]
            print('Curriculum: [easy, medium] ({}) ------'.format(
                len(train_data)))

        num_steps = self.num_steps * self.num_accumulation_steps
        num_peek_steps = self.num_peek_steps * self.num_accumulation_steps
        curriculum_interval = self.args.curriculum_interval * self.num_accumulation_steps

        random.shuffle(train_data)
        if self.args.augment_with_wikisql:
            random.shuffle(train_data_augment)
            augment_example_id = 0
        step_id, example_id = 0, 0

        self.optim.zero_grad()
        self.train()

        for interval_step_id in range(self.start_step, num_steps,
                                      num_peek_steps):
            # Update model parameters
            self.train()

            for s_id in tqdm(range(num_peek_steps)):
                step_id = interval_step_id + s_id
                if self.log_in_wandb(step_id / self.num_accumulation_steps):
                    wandb.log({
                        'learning_rate/{}'.format(self.dataset):
                        self.optim.param_groups[0]['lr']
                    })
                    wandb.log({
                        'fine_tuning_rate/{}'.format(self.dataset):
                        self.optim.param_groups[1]['lr']
                    })

                batch_end = example_id + train_batch_size
                if curriculum_interval > 0 and step_id % curriculum_interval == 0 and \
                        0 < step_id / curriculum_interval <= 2:
                    if float(step_id) / curriculum_interval == 1:
                        train_data = [
                            exp for exp in all_train_data
                            if exp.hardness in ['easy', 'medium', 'hard']
                        ]
                        print('Curriculum: [easy, medium, hard] ({}) ------'.
                              format(len(train_data)))
                    elif float(step_id) / curriculum_interval == 2:
                        train_data = all_train_data
                        print(
                            'Curriculum: [easy, medium, hard, extra] ({}) ------'
                            .format(len(train_data)))
                    random.shuffle(train_data)
                    example_id, batch_end = 0, train_batch_size
                if batch_end > len(train_data):
                    random.shuffle(train_data)
                    example_id, batch_end = 0, train_batch_size
                mini_batch = train_data[example_id:batch_end]
                example_id = batch_end
                if self.args.augment_with_wikisql:
                    augment_batch_end = augment_example_id + train_augment_batch_size
                    if augment_batch_end > len(train_data_augment):
                        random.shuffle(train_data_augment)
                        augment_example_id, augment_batch_end = 0, train_augment_batch_size
                    mini_batch += train_data_augment[
                        augment_example_id:augment_batch_end]
                    augment_example_id = augment_batch_end

                formatted_batch = self.format_batch(mini_batch)
                loss = self.loss(formatted_batch)
                loss.backward()
                epoch_losses.append(float(loss) * self.num_accumulation_steps)

                if (step_id + 1) % self.num_accumulation_steps == 0:
                    # Gradient clipping
                    if self.grad_norm > 0:
                        nn.utils.clip_grad_norm_(self.parameters(),
                                                 self.grad_norm)
                    # Update learning rate scheduler
                    self.lr_scheduler.step()
                    # Update parameters
                    self.optim.step()
                    self.optim.zero_grad()

            # Check training statistics
            if step_id > 0 and (step_id + 1) % num_peek_steps == 0:
                stdout_msg = 'Step {}: average training loss = {}'.format(
                    step_id / self.num_accumulation_steps,
                    np.mean(epoch_losses))
                print(stdout_msg)
                wandb.log({
                    'cross_entropy_loss/{}'.format(self.dataset):
                    np.mean(epoch_losses)
                })
                epoch_losses = []

            # Check model performance
            if step_id > 0 and (step_id + 1) % num_peek_steps == 0:
                self.eval()
                if self.args.process_sql_in_execution_order:
                    pred_restored_cache = self.load_pred_restored_cache()
                    pred_restored_cache_size = sum(
                        len(v) for v in pred_restored_cache.values())
                else:
                    pred_restored_cache = None
                engine_path = os.path.join(
                    self.args.data_dir,
                    'dev.db') if self.args.dataset_name == 'wikisql' else None
                engine = DBEngine(engine_path) if engine_path else None

                output_dict = self.inference(
                    dev_data,
                    restore_clause_order=self.args.
                    process_sql_in_execution_order,
                    pred_restored_cache=pred_restored_cache,
                    check_schema_consistency_=self.args.sql_consistency_check,
                    engine=engine,
                    inline_eval=True,
                    verbose=False)
                metrics = eval_tools.get_exact_match_metrics(
                    dev_data, output_dict['pred_decoded'], engine=engine)
                dev_metrics_history.append(metrics)

                # eval_metrics = metrics['top_1_ex'] if self.args.dataset_name == 'wikisql' else metrics['top_1_em']
                eval_metrics_em = metrics['top_1_em']
                eval_metrics_exe = metrics['top_1_ex']

                wandb.log({
                    'dev_exact_match/{}'.format(self.dataset):
                    eval_metrics_em
                })
                wandb.log({
                    'dev_execution/{}'.format(self.dataset):
                    eval_metrics_exe
                })

                print('Dev set performance:')
                print('Top-1 exact match: {}'.format(metrics['top_1_em']))
                print('Top-3 exact match: {}'.format(metrics['top_3_em']))
                if self.args.dataset_name == 'wikisql':
                    print('Top-1 exe acc: {}'.format(metrics['top_1_ex']))
                    print('Top-3 exe acc: {}'.format(metrics['top_3_ex']))

                if eval_metrics_exe >= best_dev_metrics:
                    best_dev_metrics = eval_metrics_exe
                    self.save_checkpoint(step_id,
                                         step_id / num_peek_steps,
                                         output_dict['pred_decoded'],
                                         is_best=True)
                if self.args.augment_with_wikisql and (step_id + 1) % (
                        num_peek_steps * 3) == 0:
                    wikisql_output_dict = self.inference(dev_data_augment,
                                                         inline_eval=True,
                                                         verbose=False)
                    wikisql_metrics = eval_tools.get_exact_match_metrics(
                        dev_data_augment, wikisql_output_dict['pred_decoded'])
                    wandb.log({
                        'wikisql_dev_exact_match/{}'.format(self.dataset):
                        wikisql_metrics['top_1_em']
                    })
                    print('WikiSQL dev set performance:')
                    print('Top-1 exact match: {}'.format(
                        wikisql_metrics['top_1_em']))
                    print('Top-3 exact match: {}'.format(
                        wikisql_metrics['top_3_em']))
                if self.args.process_sql_in_execution_order:
                    new_pred_restored_cache_size = sum(
                        len(v)
                        for v in output_dict['pred_restored_cache'].values())
                    newly_cached_size = new_pred_restored_cache_size - pred_restored_cache_size
                    if newly_cached_size > 0:
                        self.save_pred_restored_cache(
                            output_dict['pred_restored_cache'],
                            newly_cached_size)
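
run_train above batches by slicing a shuffled list and reshuffling when it runs out, rather than using a DataLoader. A minimal sketch of that wrap-around slicing on its own; the caller keeps threading the returned offset back in, exactly as example_id and batch_end are threaded through the loop above:

import random


def next_batch(data, example_id, batch_size):
    # Slice the next mini-batch; reshuffle and wrap around when exhausted.
    batch_end = example_id + batch_size
    if batch_end > len(data):
        random.shuffle(data)
        example_id, batch_end = 0, batch_size
    return data[example_id:batch_end], batch_end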
Example #15
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))
    wandb.init(project="qpic-project",
               entity="sangbaeklee",
               group="experiment_qpic")
    wandb.config = {
        "learning_rate": args.lr,
        "epochs": args.epochs,
        "batch_size": args.batch_size,
    }

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)
    wandb.watch(model)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

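    # The backbone parameters get their own group so they can be trained with
    # the lower args.lr_backbone learning rate.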
    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr":
            args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts,
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)

    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val,
                                 args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)

    if not args.hoi:
        if args.dataset_file == "coco_panoptic":
            # We also evaluate AP during panoptic training, on original coco DS
            coco_val = datasets.coco.build("val", args)
            base_ds = get_coco_api_from_dataset(coco_val)
        else:
            base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1
    elif args.pretrained:
        checkpoint = torch.load(args.pretrained, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'], strict=False)

    if args.eval:
        if args.hoi:
            test_stats = evaluate_hoi(args.dataset_file, model, postprocessors,
                                      data_loader_val,
                                      args.subject_category_id, device)
            return
        else:
            test_stats, coco_evaluator = evaluate(model, criterion,
                                                  postprocessors,
                                                  data_loader_val, base_ds,
                                                  device, args.output_dir)
            if args.output_dir:
                utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                     output_dir / "eval.pth")
            return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir /
                                        f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

        if args.hoi:
            test_stats = evaluate_hoi(args.dataset_file, model, postprocessors,
                                      data_loader_val,
                                      args.subject_category_id, device)
            coco_evaluator = None
        else:
            test_stats, coco_evaluator = evaluate(model, criterion,
                                                  postprocessors,
                                                  data_loader_val, base_ds,
                                                  device, args.output_dir)

        log_stats = {
            **{f'train_{k}': v
               for k, v in train_stats.items()},
            **{f'test_{k}': v
               for k, v in test_stats.items()}, 'epoch': epoch,
            'n_parameters': n_parameters
        }
        #import pdb; pdb.set_trace()
        if args.dataset_file == 'hico' or args.dataset_file == 'hico_second':
            wandb.log({
                "loss": train_stats['loss'],
                "mAP": test_stats['mAP'],
                "mAP rare": test_stats['mAP rare'],
                "mAP non-rare": test_stats['mAP non-rare'],
                "mean max recall": test_stats['mean max recall']
            })
        elif args.dataset_file == 'vcoco':
            wandb.log({
                "mAP_all": test_stats['mAP_all'],
                "mAP_thesis": test_stats['mAP_thesis'],
                "AP_hold_obj": test_stats['AP_hold_obj'],
                "AP_stand": test_stats['AP_stand'],
                "AP_sit_instr": test_stats['AP_sit_instr'],
                "AP_ride_instr": test_stats['AP_ride_instr'],
                "AP_walk": test_stats['AP_walk'],
                "AP_look_obj": test_stats['AP_look_obj'],
                "AP_hit_instr": test_stats['AP_hit_instr'],
                "AP_hit_obj": test_stats['AP_hit_obj'],
                "AP_eat_obj": test_stats['AP_eat_obj'],
                "AP_eat_instr": test_stats['AP_eat_instr'],
                "AP_jump_instr": test_stats['AP_jump_instr'],
                "AP_lay_instr": test_stats['AP_lay_instr'],
                "AP_talk_on_phone_instr": test_stats['AP_talk_on_phone_instr'],
                "AP_carry_obj": test_stats['AP_carry_obj'],
                "AP_throw_obj": test_stats['AP_throw_obj'],
                "AP_catch_obj": test_stats['AP_catch_obj'],
                "AP_cut_instr": test_stats['AP_cut_instr'],
                "AP_cut_obj": test_stats['AP_cut_obj'],
                "AP_run": test_stats['AP_run'],
                "AP_work_on_computer_instr": test_stats['AP_work_on_computer_instr'],
                "AP_ski_instr": test_stats['AP_ski_instr'],
                "AP_surf_instr": test_stats['AP_surf_instr'],
                "AP_skateboard_instr": test_stats['AP_skateboard_instr'],
                "AP_smile": test_stats['AP_smile'],
                "AP_drink_instr": test_stats['AP_drink_instr'],
                "AP_kick_obj": test_stats['AP_kick_obj'],
                "AP_point_instr": test_stats['AP_point_instr'],
                "AP_read_obj": test_stats['AP_read_obj'],
                "AP_snowboard_instr": test_stats['AP_snowboard_instr'],\
                "loss" : train_stats['loss']
            })
        else:
            continue

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

            # for evaluation logs
            if coco_evaluator is not None:
                (output_dir / 'eval').mkdir(exist_ok=True)
                if "bbox" in coco_evaluator.coco_eval:
                    filenames = ['latest.pth']
                    if epoch % 50 == 0:
                        filenames.append(f'{epoch:03}.pth')
                    for name in filenames:
                        torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                   output_dir / "eval" / name)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
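
The resume block in main above restores the model, optimizer, LR scheduler, and epoch counter from a single checkpoint dict. A minimal sketch of the same round trip (the key names mirror the example; everything else is illustrative):

import torch


def save_checkpoint(path, model, optimizer, lr_scheduler, epoch):
    torch.save({
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'lr_scheduler': lr_scheduler.state_dict(),
        'epoch': epoch,
    }, path)


def load_checkpoint(path, model, optimizer, lr_scheduler):
    checkpoint = torch.load(path, map_location='cpu')
    model.load_state_dict(checkpoint['model'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
    return checkpoint['epoch'] + 1  # next epoch to run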
Example #16
    def train(self,
              train_dataset,
              output_dir,
              show_running_loss=True,
              eval_data=None,
              verbose=True):
        """
        Trains the model on train_dataset.

        Utility function to be used by the train_model() method. Not intended to be used directly.
        """

        device = self.device
        model = self.model
        args = self.args

        tb_writer = SummaryWriter(logdir=args["tensorboard_dir"])
        train_sampler = RandomSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset,
                                      sampler=train_sampler,
                                      batch_size=args["train_batch_size"])

        t_total = len(train_dataloader) // args[
            "gradient_accumulation_steps"] * args["num_train_epochs"]

        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                args["weight_decay"],
            },
            {
                "params": [
                    p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.0,
            },
        ]

        warmup_steps = math.ceil(t_total * args["warmup_ratio"])
        args["warmup_steps"] = warmup_steps if args[
            "warmup_steps"] == 0 else args["warmup_steps"]

        optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=args["learning_rate"],
            eps=args["adam_epsilon"],
        )
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=args["warmup_steps"],
            num_training_steps=t_total)

        if args["fp16"]:
            try:
                from apex import amp
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
                )

            model, optimizer = amp.initialize(model,
                                              optimizer,
                                              opt_level=args["fp16_opt_level"])

        if args["n_gpu"] > 1:
            model = torch.nn.DataParallel(model)

        global_step = 0
        tr_loss, logging_loss = 0.0, 0.0
        model.zero_grad()
        train_iterator = trange(int(args["num_train_epochs"]),
                                desc="Epoch",
                                disable=args["silent"])
        epoch_number = 0
        best_eval_loss = None
        early_stopping_counter = 0

        if args["evaluate_during_training"]:
            training_progress_scores = self._create_training_progress_scores()

        if args["wandb_project"]:
            wandb.init(project=args["wandb_project"],
                       config={**args},
                       **args["wandb_kwargs"])
            wandb.watch(self.model)

        model.train()
        for _ in train_iterator:
            # epoch_iterator = tqdm(train_dataloader, desc="Iteration")
            for step, batch in enumerate(
                    tqdm(train_dataloader,
                         desc="Current iteration",
                         disable=args["silent"])):
                batch = tuple(t.to(device) for t in batch)

                inputs = self._get_inputs_dict(batch)

                outputs = model(**inputs)
                # model outputs are always tuple in pytorch-transformers (see doc)
                loss = outputs[0]

                if args["n_gpu"] > 1:
                    loss = loss.mean(
                    )  # mean() to average on multi-gpu parallel training

                current_loss = loss.item()

                if show_running_loss:
                    print("\rRunning loss: %f" % loss, end="")

                if args["gradient_accumulation_steps"] > 1:
                    loss = loss / args["gradient_accumulation_steps"]

                if args["fp16"]:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    # torch.nn.utils.clip_grad_norm_(
                    #     amp.master_params(optimizer), args["max_grad_norm"]
                    # )
                else:
                    loss.backward()
                    # torch.nn.utils.clip_grad_norm_(
                    #     model.parameters(), args["max_grad_norm"]
                    # )

                tr_loss += loss.item()
                if (step + 1) % args["gradient_accumulation_steps"] == 0:
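                    # With fp16, clip the parameters apex amp actually updates
                    # (its master params) instead of going through model.parameters().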
                    if args["fp16"]:
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer),
                            args["max_grad_norm"])
                    else:
                        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       args["max_grad_norm"])

                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

                    if args["logging_steps"] > 0 and global_step % args[
                            "logging_steps"] == 0:
                        # Log metrics
                        tb_writer.add_scalar("lr",
                                             scheduler.get_lr()[0],
                                             global_step)
                        tb_writer.add_scalar(
                            "loss",
                            (tr_loss - logging_loss) / args["logging_steps"],
                            global_step,
                        )
                        logging_loss = tr_loss
                        if args["wandb_project"]:
                            wandb.log({
                                "Training loss": current_loss,
                                "lr": scheduler.get_lr()[0],
                                "global_step": global_step,
                            })

                    if args["save_steps"] > 0 and global_step % args[
                            "save_steps"] == 0:
                        # Save model checkpoint
                        output_dir_current = os.path.join(
                            output_dir, "checkpoint-{}".format(global_step))

                        self._save_model(output_dir_current, model=model)

                    if args["evaluate_during_training"] and (
                            args["evaluate_during_training_steps"] > 0
                            and global_step %
                            args["evaluate_during_training_steps"] == 0):
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results, _ = self.eval_model(eval_data, verbose=True)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value,
                                                 global_step)

                        output_dir_current = os.path.join(
                            output_dir, "checkpoint-{}".format(global_step))

                        if args["save_eval_checkpoints"]:
                            self._save_model(output_dir_current,
                                             model=model,
                                             results=results)

                        training_progress_scores["global_step"].append(
                            global_step)
                        training_progress_scores["train_loss"].append(
                            current_loss)
                        for key in results:
                            training_progress_scores[key].append(results[key])
                        report = pd.DataFrame(training_progress_scores)
                        report.to_csv(
                            args["output_dir"] +
                            "training_progress_scores.csv",
                            index=False,
                        )

                        if args["wandb_project"]:
                            wandb.log(
                                self._get_last_metrics(
                                    training_progress_scores))

                        if not best_eval_loss:
                            best_eval_loss = results["eval_loss"]
                            self._save_model(args["best_model_dir"],
                                             model=model,
                                             results=results)
                        elif results["eval_loss"] - best_eval_loss < args[
                                "early_stopping_delta"]:
                            best_eval_loss = results["eval_loss"]
                            self._save_model(args["best_model_dir"],
                                             model=model,
                                             results=results)
                            early_stopping_counter = 0
                        else:
                            if args["use_early_stopping"]:
                                if early_stopping_counter < args[
                                        "early_stopping_patience"]:
                                    early_stopping_counter += 1
                                    if verbose:
                                        print()
                                        print(
                                            f"No improvement in eval_loss for {early_stopping_counter} steps."
                                        )
                                        print(
                                            f"Training will stop at {args['early_stopping_patience']} steps."
                                        )
                                        print()
                                else:
                                    if verbose:
                                        print()
                                        print(
                                            f"Patience of {args['early_stopping_patience']} steps reached."
                                        )
                                        print("Training terminated.")
                                        print()
                                    return global_step, tr_loss / global_step

            epoch_number += 1
            output_dir_current = os.path.join(
                output_dir,
                "checkpoint-{}-epoch-{}".format(global_step, epoch_number))

            if args["save_model_every_epoch"] or args[
                    "evaluate_during_training"]:
                os.makedirs(output_dir_current, exist_ok=True)

            if args["save_model_every_epoch"]:
                self._save_model(output_dir_current, model=model)

            if args["evaluate_during_training"]:
                results, _ = self.eval_model(eval_data, verbose=True)

                self._save_model(output_dir_current, results=results)

                training_progress_scores["global_step"].append(global_step)
                training_progress_scores["train_loss"].append(current_loss)
                for key in results:
                    training_progress_scores[key].append(results[key])
                report = pd.DataFrame(training_progress_scores)
                report.to_csv(args["output_dir"] +
                              "training_progress_scores.csv",
                              index=False)

                if not best_eval_loss:
                    best_eval_loss = results["eval_loss"]
                    self._save_model(args["best_model_dir"],
                                     model=model,
                                     results=results)
                elif results["eval_loss"] - best_eval_loss < args[
                        "early_stopping_delta"]:
                    best_eval_loss = results["eval_loss"]
                    self._save_model(args["best_model_dir"],
                                     model=model,
                                     results=results)
                    early_stopping_counter = 0
                else:
                    if args["use_early_stopping"]:
                        if early_stopping_counter < args[
                                "early_stopping_patience"]:
                            early_stopping_counter += 1
                            if verbose:
                                print()
                                print(
                                    f"No improvement in eval_loss for {early_stopping_counter} steps."
                                )
                                print(
                                    f"Training will stop at {args['early_stopping_patience']} steps."
                                )
                                print()
                        else:
                            if verbose:
                                print()
                                print(
                                    f"Patience of {args['early_stopping_patience']} steps reached."
                                )
                                print("Training terminated.")
                                print()
                            return global_step, tr_loss / global_step

        return global_step, tr_loss / global_step
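
The loop above scales the loss by `gradient_accumulation_steps`, clips gradients, and only steps the optimizer on every N-th batch. A minimal, self-contained sketch of that accumulation pattern (the model, data, and hyperparameter values below are illustrative, not taken from the example):

import torch
import torch.nn as nn

# Illustrative model, data, and settings for the accumulation pattern.
model = nn.Linear(10, 2)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
accumulation_steps = 4
max_grad_norm = 1.0
dataloader = [(torch.randn(8, 10), torch.randint(0, 2, (8,))) for _ in range(16)]

model.train()
model.zero_grad()
for step, (x, y) in enumerate(dataloader):
    loss = criterion(model(x), y)
    # Scale so the accumulated gradient averages over the window.
    (loss / accumulation_steps).backward()
    if (step + 1) % accumulation_steps == 0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        model.zero_grad()
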
Example #17
    def train(self,
              train_dataset,
              output_dir,
              show_running_loss=True,
              eval_df=None):
        """
        Trains the model on train_dataset.

        Utility function to be used by the train_model() method. Not intended to be used directly.
        """

        tokenizer = self.tokenizer
        device = self.device
        model = self.model
        args = self.args

        tb_writer = SummaryWriter(logdir=args["tensorboard_folder"])
        train_sampler = RandomSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset,
                                      sampler=train_sampler,
                                      batch_size=args["train_batch_size"])

        t_total = len(train_dataloader) // args[
            "gradient_accumulation_steps"] * args["num_train_epochs"]

        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [{
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args["weight_decay"]
        }, {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        }]

        warmup_steps = math.ceil(t_total * args["warmup_ratio"])
        args["warmup_steps"] = warmup_steps if args[
            "warmup_steps"] == 0 else args["warmup_steps"]

        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args["learning_rate"],
                          eps=args["adam_epsilon"])
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=args["warmup_steps"],
            num_training_steps=t_total)

        if args["fp16"]:
            try:
                from apex import amp
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
                )

            model, optimizer = amp.initialize(model,
                                              optimizer,
                                              opt_level=args["fp16_opt_level"])

        if args["n_gpu"] > 1:
            model = torch.nn.DataParallel(model)

        global_step = 0
        tr_loss, logging_loss = 0.0, 0.0
        model.zero_grad()
        train_iterator = trange(int(args["num_train_epochs"]),
                                desc="Epoch",
                                disable=args['silent'])
        epoch_number = 0
        if args['evaluate_during_training']:
            training_progress_scores = {
                'global_step': [],
                'precision': [],
                'recall': [],
                'f1_score': [],
                'train_loss': [],
                'eval_loss': [],
            }

        if args['wandb_project']:
            wandb.init(project=args['wandb_project'],
                       config={**args},
                       **args['wandb_kwargs'])
            wandb.watch(self.model)

        model.train()
        for _ in train_iterator:
            # epoch_iterator = tqdm(train_dataloader, desc="Iteration")
            for step, batch in enumerate(
                    tqdm(train_dataloader,
                         desc="Current iteration",
                         disable=args['silent'])):
                batch = tuple(t.to(device) for t in batch)

                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "labels": batch[3]
                }
                # XLM and RoBERTa don"t use segment_ids
                if args['model_type'] in ["bert", "xlnet"]:
                    inputs["token_type_ids"] = batch[2]

                outputs = model(**inputs)
                # model outputs are always tuple in pytorch-transformers (see doc)
                loss = outputs[0]

                if args['n_gpu'] > 1:
                    loss = loss.mean(
                    )  # mean() to average on multi-gpu parallel training

                current_loss = loss.item()

                if show_running_loss:
                    print("\rRunning loss: %f" % loss, end="")

                if args["gradient_accumulation_steps"] > 1:
                    loss = loss / args["gradient_accumulation_steps"]

                if args["fp16"]:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args["max_grad_norm"])
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args["max_grad_norm"])

                tr_loss += loss.item()
                if (step + 1) % args["gradient_accumulation_steps"] == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

                    if args["logging_steps"] > 0 and global_step % args[
                            "logging_steps"] == 0:
                        # Log metrics
                        tb_writer.add_scalar("lr",
                                             scheduler.get_lr()[0],
                                             global_step)
                        tb_writer.add_scalar("loss", (tr_loss - logging_loss) /
                                             args["logging_steps"],
                                             global_step)
                        logging_loss = tr_loss
                        if args['wandb_project']:
                            wandb.log({
                                'Training loss': current_loss,
                                'lr': scheduler.get_lr()[0],
                                'global_step': global_step
                            })

                    if args["save_steps"] > 0 and global_step % args[
                            "save_steps"] == 0:
                        # Save model checkpoint
                        output_dir_current = os.path.join(
                            output_dir, "checkpoint-{}".format(global_step))

                        if not os.path.exists(output_dir_current):
                            os.makedirs(output_dir_current)

                        # Take care of distributed/parallel training
                        model_to_save = model.module if hasattr(
                            model, "module") else model
                        model_to_save.save_pretrained(output_dir_current)
                        self.tokenizer.save_pretrained(output_dir_current)

                    if args['evaluate_during_training'] and (
                            args["evaluate_during_training_steps"] > 0
                            and global_step %
                            args["evaluate_during_training_steps"] == 0):
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results, _, _ = self.eval_model(eval_df, verbose=True)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value,
                                                 global_step)

                        output_dir_current = os.path.join(
                            output_dir, "checkpoint-{}".format(global_step))

                        if not os.path.exists(output_dir_current):
                            os.makedirs(output_dir_current)

                        if args['save_eval_checkpoints']:
                            model_to_save = model.module if hasattr(
                                model, "module") else model
                            model_to_save.save_pretrained(output_dir_current)
                            self.tokenizer.save_pretrained(output_dir_current)

                        output_eval_file = os.path.join(
                            output_dir_current, "eval_results.txt")
                        with open(output_eval_file, "w") as writer:
                            for key in sorted(results.keys()):
                                writer.write("{} = {}\n".format(
                                    key, str(results[key])))

                        training_progress_scores['global_step'].append(
                            global_step)
                        training_progress_scores['train_loss'].append(
                            current_loss)
                        for key in results:
                            training_progress_scores[key].append(results[key])
                        report = pd.DataFrame(training_progress_scores)
                        report.to_csv(args['output_dir'] +
                                      'training_progress_scores.csv',
                                      index=False)

                        if args['wandb_project']:
                            wandb.log(
                                self._get_last_metrics(
                                    training_progress_scores))

            epoch_number += 1
            output_dir_current = os.path.join(output_dir,
                                              "epoch-{}".format(epoch_number))

            if not os.path.exists(output_dir_current):
                os.makedirs(output_dir_current)

            model_to_save = model.module if hasattr(model, "module") else model
            model_to_save.save_pretrained(output_dir_current)
            self.tokenizer.save_pretrained(output_dir_current)

            if args['evaluate_during_training']:
                results, _, _ = self.eval_model(eval_df, verbose=True)

                output_eval_file = os.path.join(output_dir_current,
                                                "eval_results.txt")
                with open(output_eval_file, "w") as writer:
                    for key in sorted(results.keys()):
                        writer.write("{} = {}\n".format(
                            key, str(results[key])))

        return global_step, tr_loss / global_step
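
The optimizer setup above builds two AdamW parameter groups so that biases and LayerNorm weights are excluded from weight decay. A self-contained sketch of that grouping (the tiny model exists only to produce parameter names containing `bias` and `LayerNorm.weight`, mirroring transformer naming):

import torch.nn as nn
from torch.optim import AdamW

class TinyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(32, 32)
        self.LayerNorm = nn.LayerNorm(32)  # attribute named to mirror transformer parameter names

    def forward(self, x):
        return self.LayerNorm(self.linear(x))

model = TinyModel()
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {"params": [p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)],
     "weight_decay": 0.01},
    {"params": [p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)],
     "weight_decay": 0.0},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=5e-5, eps=1e-8)
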
Example #18
def train_transformer_style(
        model: PyTorchForecast,
        training_params: Dict,
        takes_target=False,
        forward_params: Dict = {},
        model_filepath: str = "model_save") -> None:
    """
    Function to train any PyTorchForecast model
    :param model: The initialized PyTorchForecast model
    :param training_params: A dictionary of the parameters needed to train the model
    :param takes_target: Determines whether to pass the target during training
    :param forward_params: A dictionary of additional forward parameters (for instance, the target)
    """
    use_wandb = model.wandb
    es = None
    if "early_stopping" in model.params:
        es = EarlyStopper(model.params["early_stopping"]['patience'])
    opt = pytorch_opt_dict[training_params["optimizer"]](
        model.model.parameters(), **training_params["optim_params"])
    criterion_init_params = {}
    if "criterion_params" in training_params:
        criterion_init_params = training_params["criterion_params"]
    criterion = pytorch_criterion_dict[training_params["criterion"]](**criterion_init_params)
    max_epochs = training_params["epochs"]
    data_loader = DataLoader(
        model.training,
        batch_size=training_params["batch_size"],
        shuffle=False,
        sampler=None,
        batch_sampler=None,
        num_workers=0,
        collate_fn=None,
        pin_memory=False,
        drop_last=False,
        timeout=0,
        worker_init_fn=None)
    validation_data_loader = DataLoader(
        model.validation,
        batch_size=training_params["batch_size"],
        shuffle=False,
        sampler=None,
        batch_sampler=None,
        num_workers=0,
        collate_fn=None,
        pin_memory=False,
        drop_last=False,
        timeout=0,
        worker_init_fn=None)
    test_data_loader = DataLoader(model.test_data, batch_size=1, shuffle=False, sampler=None,
                                  batch_sampler=None, num_workers=0, collate_fn=None,
                                  pin_memory=False, drop_last=False, timeout=0,
                                  worker_init_fn=None)
    meta_model = None
    meta_representation = None
    if model.params.get("meta_data") is None:
        model.params["meta_data"] = False
    if model.params["meta_data"]:
        with open(model.params["meta_data"]["path"]) as f:
            json_data = json.load(f)
        dataset_params2 = json_data["dataset_params"]
        training_path = dataset_params2["training_path"]
        valid_path = dataset_params2["validation_path"]
        meta_name = json_data["model_name"]
        meta_model = PyTorchForecast(meta_name, training_path, valid_path, dataset_params2["test_path"], json_data)
        meta_representation = get_meta_representation(model.params["meta_data"]["column_id"],
                                                      model.params["meta_data"]["uuid"], meta_model)
    if use_wandb:
        wandb.watch(model.model)
    session_params = []
    for epoch in range(max_epochs):
        total_loss = torch_single_train(
            model,
            opt,
            criterion,
            data_loader,
            takes_target,
            meta_model,
            meta_representation,
            forward_params)
        print("The loss for epoch " + str(epoch))
        print(total_loss)
        use_decoder = False
        if "use_decoder" in model.params:
            use_decoder = True
        valid = compute_validation(
            validation_data_loader,
            model.model,
            epoch,
            model.params["dataset_params"]["forecast_length"],
            criterion,
            model.device,
            meta_model=meta_model,
            decoder_structure=use_decoder,
            use_wandb=use_wandb)
        if valid < 0.01:
            raise ValueError(
                "Validation loss is near zero; there is a problem with the validator.")
        if use_wandb:
            wandb.log({'epoch': epoch, 'loss': total_loss})
        epoch_params = {
            "epoch": epoch,
            "train_loss": str(total_loss),
            "validation_loss": str(valid)}
        session_params.append(epoch_params)
        if es:
            if not es.check_loss(model.model, valid):
                print("Stopping model now")
                model.model.load_state_dict(torch.load("checkpoint.pth"))
                break
    decoder_structure = True
    if model.params["dataset_params"]["class"] != "default":
        decoder_structure = False
    test = compute_validation(
        test_data_loader,
        model.model,
        epoch,
        model.params["dataset_params"]["forecast_length"],
        criterion,
        model.device,
        meta_model=meta_model,
        decoder_structure=decoder_structure,
        use_wandb=use_wandb,
        val_or_test="test_loss")
    print("test loss:", test)
    model.params["run"] = session_params
    model.save_model(model_filepath, max_epochs)
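
The `EarlyStopper` used above is imported from the surrounding library; a minimal sketch with the same `check_loss(model, loss)` interface (returning False once patience is exhausted and saving the best weights to `checkpoint.pth`, which the loop later reloads) could look like this. This is an assumption about its behavior, not the library's actual implementation:

import torch

class EarlyStopper:
    """Minimal early-stopping helper (assumed interface, not the library class)."""

    def __init__(self, patience: int):
        self.patience = patience
        self.counter = 0
        self.best_loss = float("inf")

    def check_loss(self, model, validation_loss) -> bool:
        # Improvement: remember it, reset patience, keep the best weights.
        if validation_loss < self.best_loss:
            self.best_loss = validation_loss
            self.counter = 0
            torch.save(model.state_dict(), "checkpoint.pth")
            return True
        # No improvement: count it and stop once patience runs out.
        self.counter += 1
        return self.counter < self.patience
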
Example #19
    def train_model(self, epochs, dataset, save_folder, batch_size=1, cache=False, epochs_per_checkpoint=5,
                    dis_train_amount=3, iters=None, wdb=True, tb=True, ray=False, local_dir="../"):
        self.local_dir = local_dir
        # Make a writer for Tensorboard
        if tb:
            writer = SummaryWriter()
        # Use wandb for watching the model
        if wdb:
            wandb.init(project="retrogan")
            wandb.run.name = self.name
            wandb.watch(self, criterion="simlex")
            wandb.run.save()

        res = []
        self.set_fp16()
        self.to_device(self.device)
        class RetroPairsDataset(Dataset):
            """Dataset of pairs of embeddings consisting of the distributional and its retrofitted counterpart."""

            def __init__(self, original_dataset, retrofitted_dataset, save_folder, cache):
                # Load the data.
                X_train, Y_train = helpertools.load_all_words_dataset_final(original_dataset, retrofitted_dataset,
                                                                                 save_folder=save_folder, cache=cache)
                print("Shapes of training data:",
                      X_train.shape,
                      Y_train.shape)
                print(X_train)
                print(Y_train)
                print("*" * 100)
                self.x = X_train
                self.y = Y_train

            def __len__(self):
                return self.x.shape[0]

            def __getitem__(self, idx):
                # We normalize the embeddings that we utilize
                imgs_A = np.array(self.x.iloc[idx], dtype=np.float64)
                imgs_B = np.array(self.y.iloc[idx], dtype=np.float64)
                imgs_A /= np.linalg.norm(imgs_A)
                imgs_B /= np.linalg.norm(imgs_B)
                return torch.from_numpy(imgs_A), torch.from_numpy(imgs_B)

        # Initialize the dataset
        ds = RetroPairsDataset(dataset["original"], dataset["retrofitted"],
                               save_folder=save_folder, cache=cache)
        # Create our data loader
        dataloader = DataLoader(ds, batch_size=batch_size,
                                shuffle=True, num_workers=0)

        # Initialize our models optimizers
        self.compile_all()

        def train_step(self, batch_i, imgs_A, imgs_B, epoch, count, training_epochs):

                if imgs_A.shape[0] == 1:
                    print("Batch is equal to 1 in training.")
                    return
                a = datetime.datetime.now()
                imgs_A = imgs_A.to(self.device)
                imgs_B = imgs_B.to(self.device)

                imgs_A = imgs_A.half() if self.fp16 else imgs_A.float()
                imgs_B = imgs_B.half() if self.fp16 else imgs_B.float()

                with torch.cuda.amp.autocast():
                    fake_B = self.g_AB(imgs_A)
                    fake_A = self.g_BA(imgs_B)
                # Train the discriminators (original images = real / translated = Fake)
                dA_loss = None
                dB_loss = None
                valid = torch.ones((imgs_A.shape[0], 1)).to(self.device)  # *noisy_entries_num,) )
                fake = torch.zeros((imgs_A.shape[0], 1)).to(self.device)  # *noisy_entries_num,) )
                # accs = []
                b = datetime.datetime.now()
                # print("Data prep time",b-a)
                # TRAIN THE DISCRIMINATORS
                a = datetime.datetime.now()
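                # NOTE: the plain-discriminator updates below are disabled by the
                # `if False:` guard, so dA_loss and dB_loss stay at 0 unless the block is re-enabled.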
                if False:
                    for _ in range(int(dis_train_amount)):
                        if _ % 2 == 0:
                            # print("Adding noise")
                            i_A = imgs_A + torch.tensor(
                                np.random.uniform(low=-1, size=(imgs_A.shape[0], self.word_vector_dimensions)),
                                device=imgs_A.device).half()
                            i_B = imgs_B + torch.tensor(
                                np.random.uniform(low=-1, size=(imgs_A.shape[0], self.word_vector_dimensions)),
                                device=imgs_B.device).half()
                            f_A = fake_A + torch.tensor(
                                np.random.uniform(low=-1, size=(imgs_A.shape[0], self.word_vector_dimensions)),
                                device=fake_A.device).half()
                            f_B = fake_B + torch.tensor(
                                np.random.uniform(low=-1, size=(imgs_A.shape[0], self.word_vector_dimensions)),
                                device=fake_B.device).half()
                        else:
                            i_A = imgs_A
                            i_B = imgs_B
                            f_B = fake_B
                            f_A = fake_A
                        # with torch.no_grad():
                        # TRAIN ON BATCH VALID
                        self.dA_optimizer.zero_grad()
                        dA = self.d_A(i_A)
                        dA_loss_real = nn.BCEWithLogitsLoss()(dA, valid)

                        if self.fp16:
                            self.dA_optimizerscaler.scale(dA_loss_real).backward()
                            self.dA_optimizerscaler.step(self.dA_optimizer)
                            self.dA_optimizerscaler.update()
                        else:
                            dA_loss_real.backward(retain_graph=True)
                            self.dA_optimizer.step()
                        # TRAIN ON BATCH FAKE
                        self.dA_optimizer.zero_grad()
                        dA_f = self.d_A(f_A)
                        dA_loss_fake = nn.BCEWithLogitsLoss()(dA_f, fake)

                        if self.fp16:
                            self.dA_optimizerscaler.scale(dA_loss_fake).backward(retain_graph=True)
                            self.dA_optimizerscaler.step(self.dA_optimizer)
                            self.dA_optimizerscaler.update()
                        else:
                            dA_loss_fake.backward(retain_graph=True)
                            self.dA_optimizer.step()

                        if dA_loss is None:
                            dA_loss = 0.5 * (float(dA_loss_real) + float(dA_loss_fake))
                        else:
                            dA_loss += 0.5 * (float(dA_loss_real) + float(dA_loss_fake))

                        # TRAIN ON BATCH VALID
                        self.dB_optimizer.zero_grad()
                        dB = self.d_B(i_B)
                        dB_loss_real = nn.BCEWithLogitsLoss()(dB, valid)
                        if self.fp16:
                            self.dB_optimizerscaler.scale(dB_loss_real).backward()
                            self.dB_optimizerscaler.step(self.dB_optimizer)
                            self.dB_optimizerscaler.update()
                        else:
                            dB_loss_real.backward(retain_graph=True)
                            self.dB_optimizer.step()

                        # TRAIN ON BATCH FAKE
                        self.dB_optimizer.zero_grad()
                        dB_f = self.d_B(f_B)
                        dB_loss_fake = nn.BCEWithLogitsLoss()(dB_f, fake)

                        if self.fp16:
                            self.dB_optimizerscaler.scale(dB_loss_fake).backward(retain_graph=True)
                            self.dB_optimizerscaler.step(self.dB_optimizer)
                            self.dB_optimizerscaler.update()
                        else:
                            dB_loss_fake.backward(retain_graph=True)
                            self.dB_optimizer.step()

                        # dB_loss_real = self.d_B.train_on_batch(retrofitted_embeddings, valid)
                        # dB_loss_fake = self.d_B.train_on_batch(fake_B, fake)
                        if dB_loss is None:
                            dB_loss = 0.5 * (dB_loss_real.item() + dB_loss_fake.item())
                        else:
                            dB_loss += 0.5 * (dB_loss_real.item() + dB_loss_fake.item())
                else:
                    dA_loss = 0
                    dB_loss = 0
                # ABBA
                b = datetime.datetime.now()
                d_loss = (1.0 / dis_train_amount) * 0.5 * np.add(dA_loss, dB_loss)

                # print("Dis train time", b - a)
                # TRAIN THE CYCLE DISCRIMINATORS
                if self.cycle_dis:
                    a = datetime.datetime.now()
                    with torch.cuda.amp.autocast():
                        fake_ABBA = self.g_BA(fake_B)
                        fake_BAAB = self.g_AB(fake_A)
                    self.dABBA_optimizer.zero_grad()
                    with torch.cuda.amp.autocast():
                        dA = self.d_ABBA(torch.cat([fake_B, imgs_A], 1))
                        dA_r = self.d_ABBA(torch.cat([fake_B, fake_ABBA], 1))
                    dABBA_loss_real = CycleCond_Loss()(dA, dA_r)
                    # dABBA_loss_real = nn.BCEWithLogitsLoss()(dA, valid)
                    if self.fp16:
                        self.dABBA_optimizerscaler.scale(dABBA_loss_real).backward()
                        self.dABBA_optimizerscaler.step(self.dABBA_optimizer)
                        self.dABBA_optimizerscaler.update()
                    else:
                        dABBA_loss_real.backward()
                        self.dABBA_optimizer.step()

                    self.dBAAB_optimizer.zero_grad()
                    with torch.cuda.amp.autocast():
                        dB = self.d_BAAB(torch.cat([fake_A, imgs_B], 1))
                        dB_r = self.d_BAAB(torch.cat([fake_A, fake_BAAB], 1))
                    dBAAB_loss_real = CycleCond_Loss()(dB, dB_r)
                    # dABBA_loss_real = nn.BCEWithLogitsLoss()(dA, valid)
                    if self.fp16:
                        self.dBAAB_optimizerscaler.scale(dBAAB_loss_real).backward()
                        self.dBAAB_optimizerscaler.step(self.dBAAB_optimizer)
                        self.dBAAB_optimizerscaler.update()
                    else:
                        dBAAB_loss_real.backward()
                        self.dBAAB_optimizer.step()

                    d_cycle_loss = 0.5 * (dBAAB_loss_real.item() + dABBA_loss_real.item())
                    b = datetime.datetime.now()
                    # print("Cycle discriminator train time", b - a)

                else:
                    d_cycle_loss = 0
                # Calculate the max margin loss for A->B, B->A
                ## Max margin AB and BA
                if self.one_way_mm:
                    self.g_AB_optimizer.zero_grad()
                    a = datetime.datetime.now()
                    with torch.cuda.amp.autocast():
                        mm_a = self.g_AB(imgs_A)
                    mm_a_loss = MaxMargin_Loss(batch_size=imgs_A.shape[0])(mm_a, imgs_B)

                    # Calling the step function on an Optimizer makes an update to its
                    # parameters
                    if self.fp16:
                        self.g_AB_optimizerscaler.scale(mm_a_loss).backward()
                        self.g_AB_optimizerscaler.step(self.g_AB_optimizer)
                        self.g_AB_optimizerscaler.update()
                    else:
                        mm_a_loss.backward(retain_graph=True)
                        self.g_AB_optimizer.step()
                    mm_a_loss = mm_a_loss.item()

                    self.g_BA_optimizer.zero_grad()
                    with torch.cuda.amp.autocast():
                        mm_b = self.g_BA(imgs_B)
                    mm_b_loss = MaxMargin_Loss(batch_size=imgs_A.shape[0])(mm_b, imgs_A)
                    if self.fp16:
                        self.g_BA_optimizerscaler.scale(mm_b_loss).backward()
                        self.g_BA_optimizerscaler.step(self.g_BA_optimizer)
                        self.g_BA_optimizerscaler.update()
                    else:
                        mm_b_loss.backward()
                        self.g_BA_optimizer.step()
                    mm_b_loss = mm_b_loss.item()
                    b = datetime.datetime.now()
                    # print("MM one way discriminator train time", b - a)


                else:
                    mm_a_loss = mm_b_loss = 0
                # Calculate the cycle A->B->A, B->A->B with max margin, and mae
                a = datetime.datetime.now()
                self.combined_optimizer.zero_grad()
                with torch.cuda.amp.autocast():
                    fake_B = self.g_AB(imgs_A)
                    fake_A = self.g_BA(imgs_B)
                    # with torch.no_grad():
                    valid_A = self.d_A(fake_A)
                    valid_B = self.d_B(fake_B)
                valid_A_loss = nn.BCEWithLogitsLoss()(valid_A, valid)
                valid_B_loss = nn.BCEWithLogitsLoss()(valid_B, valid)
                id_a = fake_B
                id_b = fake_A
                if self.id_loss:
                    gamma = 1.0
                    mae_id_abba = gamma * torch.nn.L1Loss()(id_a, imgs_A)
                    mae_id_baab = gamma * torch.nn.L1Loss()(id_b, imgs_B)
                else:
                    mae_id_abba = mae_id_baab = 0
                with torch.cuda.amp.autocast():
                    fake_ABBA = self.g_BA(fake_B)
                    fake_BAAB = self.g_AB(fake_A)
                if self.cycle_mm:
                    mm_abba = MaxMargin_Loss(batch_size=imgs_A.shape[0])(fake_ABBA, imgs_A)
                    mm_baab = MaxMargin_Loss(batch_size=imgs_A.shape[0])(fake_BAAB, imgs_B)
                else:
                    mm_abba = mm_baab = 0

                if self.cycle_loss:
                    mae_abba = torch.nn.L1Loss()(fake_ABBA, imgs_A)
                    mae_baab = torch.nn.L1Loss()(fake_BAAB, imgs_B)
                else:
                    mae_abba = 0
                    mae_baab = 0
                if self.cycle_dis:
                    with torch.cuda.amp.autocast():
                        dA = self.d_ABBA(torch.cat([fake_B, imgs_A], 1))
                        dA_r = self.d_ABBA(torch.cat([fake_B, fake_ABBA], 1))
                        dABBA_loss_real = CycleCond_Loss()(dA, dA_r)
                        dB = self.d_BAAB(torch.cat([fake_A, imgs_B], 1))
                        dB_r = self.d_BAAB(torch.cat([fake_A, fake_BAAB], 1))
                    dBAAB_loss_real = CycleCond_Loss()(dB, dB_r)
                else:
                    dABBA_loss_real = 0
                    dBAAB_loss_real = 0
                g_loss = valid_A_loss + valid_B_loss + \
                         self.cycle_mm_weight * mm_abba + self.cycle_mm_weight * mm_baab + \
                         mae_abba + mae_baab + \
                         self.id_loss_weight * mae_id_abba + self.id_loss_weight * mae_id_baab + \
                         dBAAB_loss_real + dABBA_loss_real
                if self.fp16:
                    self.combined_optimizerscaler.scale(g_loss).backward()
                    self.combined_optimizerscaler.step(self.combined_optimizer)
                    self.combined_optimizerscaler.update()
                else:
                    g_loss.backward()
                    self.combined_optimizer.step()
                b = datetime.datetime.now()
                # print("Combined gen train time", b - a)

                if batch_i % 50 == 0 and batch_i != 0:
                    print(
                        "Epoch", epoch, "/", training_epochs,
                        "Batch:", batch_i, len(dataloader),
                        "Global Step", count,
                        "Discriminator loss:", d_loss,
                        # "Discriminator acc:", "{:.2f}".format(100 * np.mean(accs)),
                        "Combined loss:", "{:.2f}".format(g_loss.item()),
                        "MM_ABBA_CYCLE:", "{:.2f}".format(mm_abba.item() if self.cycle_mm else 0),
                        "MM_BAAB_CYCLE:", "{:.2f}".format(mm_baab.item() if self.cycle_mm else 0),
                        "abba acc:", "{:.2f}".format(mae_abba.item() if self.cycle_loss else 0),
                        "baab acc:", "{:.2f}".format(mae_baab.item() if self.cycle_loss else 0),
                        "idloss ab:", "{:.2f}".format(mae_id_abba.item() if self.id_loss else 0),
                        "idloss ba:", "{:.2f}".format(mae_id_baab.item() if self.id_loss else 0),
                        "mm ab loss:", "{:.2f}".format(mm_a_loss if self.one_way_mm else 0),
                        "mm ba loss:", "{:.2f}".format(mm_b_loss if self.one_way_mm else 0),
                        "discriminator cycle loss:", "{:.2f}".format(d_cycle_loss),
                    )
                    scalars = {
                        "epoch": epoch,
                        # "batch": batch_i,
                        "global_step": count,
                        "discriminator_loss": d_loss,
                        # "discriminator_acc": np.mean(accs),
                        "combined_loss": g_loss.item(),
                        "loss": g_loss.item() + d_loss,
                        "MM_ABBA_CYCLE": mm_abba.item() if self.cycle_mm else 0,
                        "MM_BAAB_CYCLE": mm_baab.item() if self.cycle_mm else 0,
                        "abba_mae": mae_abba.item() if self.cycle_loss else 0,
                        "baab_mae": mae_baab.item() if self.cycle_loss else 0,
                        "cycle_da": valid_A_loss.item(),
                        "cycle_db": valid_B_loss.item(),
                        "idloss_ab": mae_id_abba.item() if self.id_loss else 0,
                        "idloss_ba": mae_id_baab.item() if self.id_loss else 0,
                        "mm_ab_loss": mm_a_loss if self.one_way_mm else 0,
                        "mm_ba_loss": mm_b_loss if self.one_way_mm else 0,
                        "discriminator_cycle_loss": d_cycle_loss
                    }
                    if wdb:
                        wandb.log(scalars, step=count)
                    if tb:
                        writer.add_scalars("run", tag_scalar_dict=scalars, global_step=count)
                        writer.flush()

        def train_loop(training_epochs, iters=None):
            count = 0
            # We gave a specific amount of epochs
            if iters is None:
                for epoch in range(training_epochs):
                    for batch_i, (distributional_embeddings, retrofitted_embeddings) in enumerate(dataloader):
                        train_step(self, batch_i, distributional_embeddings, retrofitted_embeddings, epoch, count,
                                   training_epochs)
                        count += 1
                    print("\n")
                    sl, sv, c = self.test(dataset)
                    print(sl, sv, c)
                    print("Saving our results.")
                    # Save to tensorboard
                    if tb:
                        writer.add_scalar("simlex", sl, global_step=count)
                        writer.add_scalar("simverb", sv, global_step=count)
                        writer.add_scalar("card", c, global_step=count)
                        writer.flush()
                    # Save them also to wandb
                    if wdb:
                        wandb.log({"simlex": sl, "card": c, "simverb": sv, "epoch": epoch}, step=count)
                    if ray:
                        tune.report(**{"simlex": sl, "card": c, "simverb": sv, "epoch": epoch})
                    # Save a checkpoint
                    if epochs_per_checkpoint is not None:
                        if epoch % epochs_per_checkpoint == 0 and epoch != 0:
                            self.save_model(name="checkpoint")
                    print("\n")
                    res.append((sl, sv, c))
                    print(res)
                    print("\n")
            else:
                epoch = 0
                running = True
                while running:
                    for batch_i, (distributional_embeddings, retrofitted_embeddings) in enumerate(dataloader):
                        if count >= iters:
                            running = False
                            break
                        train_step(self, batch_i, distributional_embeddings, retrofitted_embeddings, epoch, count,
                                   iters / len(dataloader))
                        count += 1
                    epoch += 1
                    print("\n")
                    sl, sv, c = self.test(dataset)
                    print(sl, sv, c)
                    # Save to tensorboard
                    if tb:
                        writer.add_scalar("simlex", sl, global_step=count)
                        writer.add_scalar("simverb", sv, global_step=count)
                        writer.add_scalar("card", c, global_step=count)
                        writer.flush()
                    # Save to wandb
                    if wdb:
                        wandb.log({"simlex": sl, "simverb": sv, "card": c}, step=count)
                    # Save the checkpoint
                    if epochs_per_checkpoint is not None:
                        if epoch % epochs_per_checkpoint == 0 and epoch != 0:
                            self.save_model(name="checkpoint")
                    print('\n')
                    res.append((sl, sv, c))
                    print(res)
                    print("\n")

        # Start the training loop
        train_loop(epochs, iters=iters)
        print("Final performance")
        sl, sv, c = self.test(dataset)
        print(sl, sv, c)
        res.append((sl, sv, c))
        print('\n')
        return res
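
The fp16 branches in `train_step` above repeat the standard `torch.cuda.amp` pattern: scale the loss through a `GradScaler`, step the optimizer through the scaler, then update it. A minimal, self-contained sketch of that pattern (model, data, and loss are illustrative):

import torch
import torch.nn as nn

device = "cuda" if torch.cuda.is_available() else "cpu"
model = nn.Linear(300, 300).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
scaler = torch.cuda.amp.GradScaler(enabled=(device == "cuda"))

x = torch.randn(16, 300, device=device)
target = torch.randn(16, 300, device=device)

optimizer.zero_grad()
with torch.cuda.amp.autocast(enabled=(device == "cuda")):
    loss = nn.L1Loss()(model(x), target)
scaler.scale(loss).backward()   # scale the loss to avoid fp16 underflow
scaler.step(optimizer)          # unscale gradients, then optimizer.step()
scaler.update()                 # adjust the scale for the next iteration
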
  )
  args = args.parse_args()

  # set seed and ensure everything is properly split
  set_seed(args.seed)
  folder_path = f"./transVAE_{args.res}_{args.n_embd}_{args.batch_size}"
  print(f":: Will Save data in {folder_path}")
  os.makedirs(folder_path, exist_ok=True)

  # define the model
  model = TransformerVAE(n_embd=args.n_embd, n_head=args.n_head, res=args.res)
  print(":: Number of params:", sum(p.numel() for p in model.parameters()))

  if WANDB:
    wandb.init(project="vq-vae")
    wandb.watch(model)  # watch the model metrics

  # define the dataset and goooo
  train = DSWrapper(train=True)
  test = DSWrapper(train=False)
  trainer = DiscreteVAETrainer(model, train, test)
  trainer.train(
    bs = args.batch_size,
    lr = args.lr,
    folder_path=folder_path,
    test_every=args.test_every,
    save_every=args.save_every,
    n_epochs=args.n_epochs,
    skip_steps=None,
    gradient_accumulation_steps=args.gradient_accumulation_steps
  )
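
Every example in this collection shares the same three Weights & Biases calls: initialize a run, `wandb.watch` the model, and `wandb.log` scalar metrics. A minimal, self-contained sketch (project name and metrics are illustrative):

import torch.nn as nn
import wandb

model = nn.Linear(10, 1)
run = wandb.init(project="demo-project", config={"lr": 1e-3})
wandb.watch(model, log="all")  # track gradients and parameters
for step in range(3):
    wandb.log({"train_loss": 1.0 / (step + 1)}, step=step)
run.finish()
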
Example #21
                              shuffle=False,
                              pin_memory=True)
test_dataloader = DataLoader(test_dataset,
                             batch_size=6,
                             num_workers=8,
                             pin_memory=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = WaveNet()

optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss()
featurizer = MelSpectrogram(MelSpectrogramConfig()).to(device)
model.to(device)
wandb.watch(model, log="all")

N_EPOCHS = 14

for epoch in tqdm(range(N_EPOCHS)):
    train_loss, train_acc = train_model(model, train_dataloader, optimizer,
                                        criterion)
    test_loss, test_acc = evaluate(model, test_dataloader, criterion)

    wandb.log({
        "learning_rate": 0.0001,
        "model": 'wavenet',
        "optimizer": 'Adam',
        "train_loss": train_loss,
        "train_accuracy": train_acc,
        "test_loss": test_loss,
def train_transformer_style(model: PyTorchForecast,
                            training_params: Dict,
                            takes_target=False,
                            forward_params: Dict = {},
                            model_filepath: str = "model_save") -> None:
    """Function to train any PyTorchForecast model

    :param model:  A properly wrapped PyTorchForecast model
    :type model: PyTorchForecast
    :param training_params: A dictionary of the necessary parameters for training.
    :type training_params: Dict
    :param takes_target: A parameter to determine whether a model requires the target, defaults to False
    :type takes_target: bool, optional
    :param forward_params: A dictionary of additional forward parameters (for instance, the target), defaults to {}
    :type forward_params: Dict, optional
    :param model_filepath: The file path to load model weights from, defaults to "model_save"
    :type model_filepath: str, optional
    :raises ValueError: If the computed validation loss is zero.
    """
    use_wandb = model.wandb
    es = None
    worker_num = 1
    pin_memory = False
    dataset_params = model.params["dataset_params"]
    num_targets = 1
    if "n_targets" in model.params:
        num_targets = model.params["n_targets"]
    if "num_workers" in dataset_params:
        worker_num = dataset_params["num_workers"]
        print("using " + str(worker_num))
    if "pin_memory" in dataset_params:
        pin_memory = dataset_params["pin_memory"]
        print("Pin memory set to true")
    if "early_stopping" in model.params:
        es = EarlyStopper(model.params["early_stopping"]['patience'])
    opt = pytorch_opt_dict[training_params["optimizer"]](
        model.model.parameters(), **training_params["optim_params"])
    criterion_init_params = {}
    if "criterion_params" in training_params:
        criterion_init_params = training_params["criterion_params"]
    criterion = pytorch_criterion_dict[training_params["criterion"]](
        **criterion_init_params)
    if "probabilistic" in model.params[
            "model_params"] or "probabilistic" in model.params:
        probabilistic = True
    else:
        probabilistic = False
    max_epochs = training_params["epochs"]
    data_loader = DataLoader(model.training,
                             batch_size=training_params["batch_size"],
                             shuffle=False,
                             sampler=None,
                             batch_sampler=None,
                             num_workers=worker_num,
                             collate_fn=None,
                             pin_memory=pin_memory,
                             drop_last=False,
                             timeout=0,
                             worker_init_fn=None)
    validation_data_loader = DataLoader(
        model.validation,
        batch_size=training_params["batch_size"],
        shuffle=False,
        sampler=None,
        batch_sampler=None,
        num_workers=worker_num,
        collate_fn=None,
        pin_memory=pin_memory,
        drop_last=False,
        timeout=0,
        worker_init_fn=None)
    # TODO support batch_size > 1
    test_data_loader = DataLoader(model.test_data,
                                  batch_size=1,
                                  shuffle=False,
                                  sampler=None,
                                  batch_sampler=None,
                                  num_workers=worker_num,
                                  collate_fn=None,
                                  pin_memory=pin_memory,
                                  drop_last=False,
                                  timeout=0,
                                  worker_init_fn=None)
    meta_model = None
    meta_representation = None
    meta_loss = None
    if model.params.get("meta_data") is None:
        model.params["meta_data"] = False
    if model.params["meta_data"]:
        meta_model, meta_representation, meta_loss = handle_meta_data(model)
    if use_wandb:
        wandb.watch(model.model)
    session_params = []
    for epoch in range(max_epochs):
        total_loss = torch_single_train(model,
                                        opt,
                                        criterion,
                                        data_loader,
                                        takes_target,
                                        meta_model,
                                        meta_representation,
                                        meta_loss,
                                        multi_targets=num_targets,
                                        forward_params=forward_params.copy())
        print("The loss for epoch " + str(epoch))
        print(total_loss)
        use_decoder = False
        if "use_decoder" in model.params:
            use_decoder = True
        valid = compute_validation(
            validation_data_loader,
            model.model,
            epoch,
            model.params["dataset_params"]["forecast_length"],
            model.crit,
            model.device,
            multi_targets=num_targets,
            meta_model=meta_model,
            decoder_structure=use_decoder,
            use_wandb=use_wandb,
            probabilistic=probabilistic)
        if valid == 0.0:
            raise ValueError(
                "Error validation loss is zero there is a problem with the validator."
            )
        if use_wandb:
            wandb.log({'epoch': epoch, 'loss': total_loss})
        epoch_params = {
            "epoch": epoch,
            "train_loss": str(total_loss),
            "validation_loss": str(valid)
        }
        session_params.append(epoch_params)
        if es:
            if not es.check_loss(model.model, valid):
                print("Stopping model now")
                model.model.load_state_dict(torch.load("checkpoint.pth"))
                break
    decoder_structure = True
    if model.params["dataset_params"]["class"] != "default":
        decoder_structure = False
    test = compute_validation(
        test_data_loader,
        model.model,
        epoch,
        model.params["dataset_params"]["forecast_length"],
        model.crit,
        model.device,
        meta_model=meta_model,
        multi_targets=num_targets,
        decoder_structure=decoder_structure,
        use_wandb=use_wandb,
        val_or_test="test_loss",
        probabilistic=probabilistic)
    print("test loss:", test)
    model.params["run"] = session_params
    model.save_model(model_filepath, max_epochs)
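
`compute_validation` above comes from the surrounding library; the core idea is an averaged, no-grad pass over the validation `DataLoader`. A simplified sketch of that idea (the function name and the `(src, trg)` batch layout are assumptions for illustration):

import torch

def simple_validation_loss(model, data_loader, criterion, device):
    """Average the criterion over a validation DataLoader without tracking gradients."""
    model.eval()
    total, batches = 0.0, 0
    with torch.no_grad():
        for src, trg in data_loader:
            output = model(src.to(device))
            total += criterion(output, trg.to(device)).item()
            batches += 1
    model.train()
    return total / max(batches, 1)
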
def main():
    ON_SERVER = False

    parser = argparse.ArgumentParser(description='SfSNet - Residual')
    parser.add_argument('--local_rank',
                        type=int,
                        default=0,
                        help='local rank for distributed training (default: 0)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=8,
                        metavar='N',
                        help='input batch size for training (default: 8)')
    parser.add_argument('--epochs',
                        type=int,
                        default=10,
                        metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr',
                        type=float,
                        default=0.001,
                        metavar='LR',
                        help='learning rate (default: 0.001)')
    parser.add_argument('--wt_decay',
                        type=float,
                        default=0.0005,
                        metavar='W',
                        help='weight decay (default: 0.0005)')
    parser.add_argument('--no_cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--read_first',
                        type=int,
                        default=-1,
                        help='read first n rows (default: -1)')
    parser.add_argument('--details',
                        type=str,
                        default=None,
                        help='Explanation of the run')
    if ON_SERVER:
        parser.add_argument('--syn_data',
                            type=str,
                            default='/nfs/bigdisk/bsonawane/sfsnet_data/',
                            help='Synthetic Dataset path')
        parser.add_argument(
            '--celeba_data',
            type=str,
            default=
            '/nfs/bigdisk/bsonawane/CelebA-dataset/CelebA_crop_resize_128/',
            help='CelebA Dataset path')
        parser.add_argument('--log_dir',
                            type=str,
                            default='./results/',
                            help='Log Path')
    else:
        parser.add_argument('--syn_data',
                            type=str,
                            default='../data/full_syn/',
                            help='Synthetic Dataset path')
        parser.add_argument('--celeba_data',
                            type=str,
                            default='../data/ffhq_pipeline_test/',
                            help='FFHQ Dataset path')
        parser.add_argument('--log_dir',
                            type=str,
                            default='../results/',
                            help='Log Path')
    parser.add_argument('--load_model',
                        type=str,
                        default=None,
                        help='load model from')
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    # initialization
    syn_data = args.syn_data
    celeba_data = args.celeba_data
    batch_size = args.batch_size
    lr = args.lr
    wt_decay = args.wt_decay
    log_dir = args.log_dir
    epochs = args.epochs
    model_dir = args.load_model
    read_first = args.read_first

    if read_first == -1:
        read_first = None

    # Debugging and check working
    # syn_train_csv = syn_data + '/train.csv'
    # train_dataset, _ = get_sfsnet_dataset(syn_dir=syn_data+'train/', read_from_csv=syn_train_csv, read_celeba_csv=None, read_first=read_first, validation_split=5)
    # train_dl  = DataLoader(train_dataset, batch_size=10, shuffle=False)
    # validate_shading_method(train_dl)
    # return

    # Init WandB for logging
    wandb.init(project='SfSNet-CelebA-Baseline-V3-SkipNetBased')
    wandb.log({'lr': lr, 'weight decay': wt_decay})

    # Initialize models
    skipnet_model = SkipNet()
    if use_cuda:
        skipnet_model = skipnet_model.cuda()  # .to(args.local_rank)
    if model_dir is not None:
        skipnet_model.load_state_dict(
            torch.load(model_dir + 'skipnet_model.pkl'))
    else:
        print('Initializing weights')
        skipnet_model.apply(weights_init)

    os.system('mkdir -p {}'.format(args.log_dir))
    with open(args.log_dir + '/details.txt', 'w') as f:
        f.write(args.details or '')  # avoid a TypeError when --details is omitted

    wandb.watch(skipnet_model)

    # 1. Train on Synthetic data
    train_synthetic(skipnet_model, syn_data, celeba_data = celeba_data, read_first=read_first, \
            batch_size=batch_size, num_epochs=epochs, log_path=log_dir+'Synthetic_Train/', use_cuda=use_cuda, wandb=wandb, \
            lr=lr, wt_decay=wt_decay, training_syn=True)

    # 2. Generate Pseudo-Training information for CelebA dataset
    # Load CelebA dataset
    celeba_train_csv = celeba_data + '/train.csv'
    celeba_test_csv = celeba_data + '/test.csv'

    train_dataset, _ = get_celeba_dataset(read_from_csv=celeba_train_csv,
                                          read_first=read_first,
                                          validation_split=0)
    test_dataset, _ = get_celeba_dataset(read_from_csv=celeba_test_csv,
                                         read_first=read_first,
                                         validation_split=0)

    celeba_train_dl = DataLoader(train_dataset, batch_size=1, shuffle=True)
    celeba_test_dl = DataLoader(test_dataset, batch_size=1, shuffle=True)

    out_celeba_images_dir = celeba_data + 'synthesized_data_skip_net/'
    out_train_celeba_images_dir = out_celeba_images_dir + 'train/'
    out_test_celeba_images_dir = out_celeba_images_dir + 'test/'

    os.system('mkdir -p {}'.format(out_train_celeba_images_dir))
    os.system('mkdir -p {}'.format(out_test_celeba_images_dir))

    # Dump normal, albedo, shading, face and sh for celeba dataset
    generate_celeba_synthesize(skipnet_model,
                               celeba_train_dl,
                               train_epoch_num=epochs,
                               use_cuda=use_cuda,
                               out_folder=out_train_celeba_images_dir,
                               wandb=wandb)
    generate_celeba_synthesize(skipnet_model,
                               celeba_test_dl,
                               train_epoch_num=epochs,
                               use_cuda=use_cuda,
                               out_folder=out_test_celeba_images_dir,
                               wandb=wandb)

    # generate CSV for images generated above
    generate_celeba_synthesize_data_csv(out_train_celeba_images_dir,
                                        out_celeba_images_dir + '/train.csv')
    generate_celeba_synthesize_data_csv(out_test_celeba_images_dir,
                                        out_celeba_images_dir + '/test.csv')
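

# Note: the weights_init helper applied in main() above is not part of this
# listing. A minimal sketch of a typical initializer it could stand in for
# (an assumption, not the original implementation):
import torch.nn as nn


def weights_init(m):
    # Xavier-initialize conv/linear weights and zero their biases.
    if isinstance(m, (nn.Conv2d, nn.Linear)):
        nn.init.xavier_normal_(m.weight)
        if m.bias is not None:
            nn.init.zeros_(m.bias)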
Example #24
def train(
    run_name: str,
    # Data
    train_filepath: str = CSNJS_TRAIN_FILEPATH,
    eval_filepath: str = CSNJS_VALID_FILEPATH,
    spm_filepath: str = SPM_UNIGRAM_FILEPATH,
    program_mode="identity",
    eval_program_mode="identity",
    label_mode="identifier",
    num_workers=1,
    limit_dataset_size=-1,
    # Model
    model_type="transformer",
    n_decoder_layers=4,
    d_model: int = 512,
    resume_path: str = "",
    resume_encoder_name: str = "encoder_q",  # encoder_q, encoder_k, encoder
    resume_project: bool = False,
    # Optimization
    train_decoder_only: bool = False,
    num_epochs: int = 50,
    save_every: int = 2,
    batch_size: int = 256,
    lr: float = 8e-4,
    adam_beta1: float = 0.9,
    adam_beta2: float = 0.98,
    use_lr_warmup: bool = True,
    loss_type="nll_token",  # nll_token or nll_sequence
    # Loss
    subword_regularization_alpha: float = 0,
    # Computational
    use_cuda: bool = True,
    auto_test: bool = True,
    seed: int = 0,
):
    """Train model"""
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    run_dir = RUN_DIR / run_name
    run_dir.mkdir(exist_ok=True, parents=True)
    logger.add(str((run_dir / "train.log").resolve()))
    logger.info(f"Saving logs, model checkpoints to {run_dir}")
    config = locals()
    logger.info(f"Config: {config}")
    wandb.init(name=run_name,
               config=config,
               job_type="training",
               project="identifier-prediction",
               entity="ml4code")

    if use_cuda:
        assert torch.cuda.is_available(
        ), "CUDA not available. Check env configuration, or pass --use_cuda False"

    train_augmentations = [
        {
            "fn": "sample_lines",
            "line_length_pct": 0.5,
        },  # WARN: this is a no-op because the arguments for sample_lines are prob and prob_keep_line
        # Also need to have options under an "options" key
        {
            "fn": "insert_var_declaration",
            "prob": 0.5
        },
        {
            "fn": "rename_variable",
            "prob": 0.5
        },
    ]
    sp = spm.SentencePieceProcessor()
    sp.Load(spm_filepath)
    pad_id = sp.PieceToId("[PAD]")
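    # The [PAD] piece id is reused below as ignore_index in the cross-entropy
    # loss so padded target positions do not contribute to the loss.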

    # Create training dataset and dataloader
    logger.info(f"Training data path {train_filepath}")
    train_dataset = get_csnjs_dataset(train_filepath,
                                      label_mode=label_mode,
                                      limit_size=limit_dataset_size)
    logger.info(f"Training dataset size: {len(train_dataset)}")
    train_loader = javascript_dataloader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        augmentations=train_augmentations,
        sp=sp,
        program_mode=program_mode,
        subword_regularization_alpha=subword_regularization_alpha,
    )

    # Create eval dataset and dataloader
    logger.info(f"Eval data path {eval_filepath}")
    eval_dataset = get_csnjs_dataset(eval_filepath,
                                     label_mode=label_mode,
                                     limit_size=limit_dataset_size)
    logger.info(f"Eval dataset size: {len(eval_dataset)}")
    eval_loader = javascript_dataloader(
        eval_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        augmentations=[],
        sp=sp,
        program_mode=eval_program_mode,
        subword_regularization_alpha=subword_regularization_alpha,
    )

    # Create model (pad_id was already looked up above)
    if model_type == "transformer":
        model = TransformerModel(n_tokens=sp.GetPieceSize(),
                                 pad_id=pad_id,
                                 n_decoder_layers=n_decoder_layers,
                                 d_model=d_model)
        logger.info(
            f"Created TransformerModel with {count_parameters(model)} params")
    elif model_type == "lstm":
        model = Seq2SeqLSTM(n_tokens=sp.GetPieceSize(),
                            pad_id=pad_id,
                            d_model=d_model)
        logger.info(
            f"Created Seq2SeqLSTM with {count_parameters(model)} params")

    # Set up optimizer
    model = nn.DataParallel(model)
    model = model.cuda() if use_cuda else model
    wandb.watch(model, log="all")
    params = model.module.decoder.parameters(
    ) if train_decoder_only else model.parameters()
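    # With train_decoder_only, only the decoder's parameters are optimized;
    # the (pretrained) encoder is kept frozen and put in eval mode in the
    # epoch loop below.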
    optimizer = torch.optim.Adam(params,
                                 lr=lr,
                                 betas=(adam_beta1, adam_beta2),
                                 eps=1e-9)
    if use_lr_warmup:
        scheduler = get_linear_schedule_with_warmup(
            optimizer, 5000,
            len(train_loader) * num_epochs)
    else:
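        # Without warmup, a constant multiplier of 1.0 keeps the base learning
        # rate unchanged for the whole run.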
        scheduler = LambdaLR(optimizer, lr_lambda=lambda x: 1.0)

    # Load checkpoint
    start_epoch = 1
    global_step = 0
    min_eval_loss = float("inf")
    if resume_path:
        logger.info(
            f"Resuming training from checkpoint {resume_path}, resume_encoder_name={resume_encoder_name}"
        )
        checkpoint = torch.load(resume_path)
        assert resume_encoder_name in [
            "encoder_k", "encoder_q", "encoder", "supervised"
        ]

        if resume_encoder_name == "supervised":
            # This checkpoint is the result of training with this script, not pretraining
            model.module.load_state_dict(checkpoint["model_state_dict"])
            optimizer.load_state_dict(checkpoint["optimizer_state_dict"])

            min_eval_loss = checkpoint.get("min_eval_loss",
                                           checkpoint["eval_loss"])

            start_epoch = checkpoint["epoch"] + 1
            global_step = checkpoint["global_step"]

            for _ in range(global_step):
                scheduler.step()
        else:
            pretrained_state_dict = checkpoint["model_state_dict"]
            encoder_state_dict = {}

            for key, value in pretrained_state_dict.items():
                if key.startswith(resume_encoder_name +
                                  ".") and "project_layer" not in key:
                    remapped_key = key[len(resume_encoder_name + "."):]
                    logger.debug(
                        f"Remapping checkpoint key {key} to {remapped_key}. Value mean: {value.mean().item()}"
                    )
                    encoder_state_dict[remapped_key] = value
                if key.startswith(
                        resume_encoder_name +
                        ".") and "project_layer.0." in key and resume_project:
                    remapped_key = key[len(resume_encoder_name + "."):]
                    logger.debug(
                        f"Remapping checkpoint project key {key} to {remapped_key}. Value mean: {value.mean().item()}"
                    )
                    encoder_state_dict[remapped_key] = value
            model.module.encoder.load_state_dict(encoder_state_dict,
                                                 strict=False)
            logger.info(f"Loaded keys: {encoder_state_dict.keys()}")
        logger.info(f"Loaded state dict from {resume_path}")

    for epoch in tqdm.trange(start_epoch,
                             num_epochs + 1,
                             desc="training",
                             unit="epoch",
                             leave=False):
        logger.info(f"Starting epoch {epoch}\n")
        if train_decoder_only:
            model.module.encoder.eval()
            model.module.decoder.train()
        else:
            model.train()
        pbar = tqdm.tqdm(train_loader, desc=f"epoch {epoch}")
        for X, Y, X_lengths, Y_lengths in pbar:
            if use_cuda:
                X = X.cuda()
                Y = Y.cuda()
                X_lengths, Y_lengths = X_lengths.cuda(), Y_lengths.cuda()
            optimizer.zero_grad()
            # NOTE: X and Y are [B, max_seq_len] tensors (batch first)
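            # Teacher forcing: the decoder sees Y[:, :-1] as input and the
            # logits are compared against the shifted targets Y[:, 1:].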
            logits = model(X, Y[:, :-1], X_lengths, Y_lengths)
            if loss_type == "nll_sequence":
                loss = F.cross_entropy(logits.transpose(1, 2),
                                       Y[:, 1:],
                                       ignore_index=pad_id,
                                       reduction="sum")
                # Average over the number of sequences rather than target
                # sequence lengths, i.e. minimize loss per sequence.
                loss = loss / X.size(0)
            elif loss_type == "nll_token":
                loss = F.cross_entropy(
                    logits.transpose(1, 2),
                    Y[:, 1:],
                    ignore_index=pad_id,
                )
            loss.backward()
            optimizer.step()
            scheduler.step()

            # Log loss
            global_step += 1
            wandb.log(
                {
                    "epoch": epoch,
                    f"label-{label_mode}/train_loss": loss.item(),
                    "lr": scheduler.get_last_lr()[0]
                },
                step=global_step,
            )
            pbar.set_description(f"epoch {epoch} loss {loss.item():.4f}")

        # Evaluate
        logger.info(
            f"Evaluating model after epoch {epoch} ({global_step} steps)...")
        max_decode_len = 20 if label_mode == "identifier" else 200
        eval_loss = _evaluate(model,
                              eval_loader,
                              sp,
                              use_cuda=use_cuda,
                              max_decode_len=max_decode_len,
                              loss_type=loss_type)
        logger.info(
            f"Evaluation loss after epoch {epoch} ({global_step} steps): {eval_loss:.4f}"
        )
        wandb.log({
            "epoch": epoch,
            f"label-{label_mode}/eval_loss": eval_loss
        },
                  step=global_step)

        # Save checkpoint
        if (save_every and epoch % save_every == 0) or eval_loss < min_eval_loss:
            if eval_loss < min_eval_loss:
                logger.info(
                    f"New best evaluation loss: prev {min_eval_loss:.4f} > new {eval_loss:.4f}"
                )
                min_eval_loss = eval_loss
                model_file = run_dir / "ckpt_best.pth"
            else:
                model_file = run_dir / f"ckpt_ep{epoch:04d}.pth"
            checkpoint = {
                "model_state_dict": model.module.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                "epoch": epoch,
                "global_step": global_step,
                "config": config,
                "eval_loss": eval_loss,
                "min_eval_loss": min_eval_loss,
            }
            logger.info(f"Saving checkpoint to {model_file}...")
            torch.save(checkpoint, str(model_file.resolve()))
            wandb.save(str(model_file.resolve()))
            logger.info("Done.")

    if auto_test:
        best_ckpt = run_dir / "ckpt_best.pth"
        test(
            str(best_ckpt.resolve()),
            CSNJS_TEST_FILEPATH,
            spm_filepath,
            program_mode,
            label_mode,
            num_workers,
            -1,
            n_decoder_layers=n_decoder_layers,
        )
Example #25
def main():

    global best_bleu4, epochs_since_improvement, start_epoch, data_name, word_map

    if args.fine_tune_encoder and args.fine_tune_epochs == -1:
        raise Exception(
            'if "fine_tune_encoder" == true you must also specify "fine_tune_epochs" != -1'
        )

    # Read word map
    if not args.run_local:
        data_f = '/yoav_stg/gshalev/image_captioning/output_folder'
    else:
        data_f = data_folder

    word_map_file = os.path.join(data_f, 'WORDMAP_' + data_name + '.json')
    print('word_map_file: {}'.format(word_map_file))

    print('loading word map from path: {}'.format(word_map_file))
    with open(word_map_file, 'r') as j:
        word_map = json.load(j)
    print('load word map COMPLETED')

    # Reverse word map (index -> word) for decoding predictions
    rev_word_map = {v: k for k, v in word_map.items()}

    # Initialize checkpoint
    if args.checkpoint is None:
        print('run a new model (No args.checkpoint)')
        decoder = DecoderWithAttention(attention_dim=attention_dim,
                                       embed_dim=emb_dim,
                                       decoder_dim=decoder_dim,
                                       vocab_size=len(word_map),
                                       device=device,
                                       dropout=dropout)

        decoder_optimizer = torch.optim.Adam(params=filter(
            lambda p: p.requires_grad, decoder.parameters()),
                                             lr=decoder_lr)
        encoder = Encoder()
        encoder.fine_tune(True if args.fine_tune_encoder
                          and args.fine_tune_epochs == 0 else False)
        encoder_optimizer = torch.optim.Adam(
            params=filter(lambda p: p.requires_grad, encoder.parameters()),
            lr=encoder_lr
        ) if args.fine_tune_encoder and args.fine_tune_epochs == 0 else None

    # load checkpoint
    else:
        print('run a model loaded from args.checkpoint')
        checkpoint = torch.load(args.checkpoint)
        start_epoch = checkpoint['epoch'] + 1
        epochs_since_improvement = 0
        # epochs_since_improvement = checkpoint['epochs_since_improvement']
        best_bleu4 = checkpoint['bleu-4']
        decoder = checkpoint['decoder']
        decoder_optimizer = checkpoint['decoder_optimizer']
        encoder = checkpoint['encoder']
        encoder_optimizer = checkpoint['encoder_optimizer']

        if args.fine_tune_encoder and encoder_optimizer is None:
            print('----------loading model without encoder optimizer')
            encoder.fine_tune(args.fine_tune_encoder)
            encoder_optimizer = torch.optim.Adam(params=filter(
                lambda p: p.requires_grad, encoder.parameters()),
                                                 lr=encoder_lr)
        elif args.fine_tune_encoder and encoder_optimizer is not None:
            raise Exception('you are loading a model with encoder optimizer')

    # Move to GPU, if available
    decoder = decoder.to(device)
    encoder = encoder.to(device)

    # wandb
    if not args.run_local:
        wandb.watch(decoder)

    # Loss function
    criterion = nn.CrossEntropyLoss().to(device)

    # Custom dataloaders
    train_loader = torch.utils.data.DataLoader(CaptionDataset(
        data_f,
        data_name,
        'TRAIN',
        transform=transforms.Compose([data_normalization])),
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=workers,
                                               pin_memory=True)

    val_loader = torch.utils.data.DataLoader(CaptionDataset(
        data_f,
        data_name,
        'VAL',
        transform=transforms.Compose([data_normalization])),
                                             batch_size=batch_size,
                                             shuffle=True,
                                             num_workers=workers,
                                             pin_memory=True)

    val_loader_for_val = torch.utils.data.DataLoader(CaptionDataset(
        data_f,
        data_name,
        'VAL',
        transform=transforms.Compose([data_normalization])),
                                                     batch_size=1,
                                                     shuffle=True,
                                                     num_workers=workers,
                                                     pin_memory=True)

    # Epochs
    print('starting epochs')

    for epoch in range(start_epoch, epochs):

        # Decay learning rate if there is no improvement for 8 consecutive epochs, and terminate training after 20
        if epochs_since_improvement == 20:
            print('break after : epochs_since_improvement == 20')
            break

        if epoch == args.fine_tune_epochs:
            print('fine tuning after epoch({}) == args.fine_tune_epochs({})'.
                  format(epoch, args.fine_tune_epochs))
            encoder.fine_tune(args.fine_tune_encoder)
            encoder_optimizer = torch.optim.Adam(params=filter(
                lambda p: p.requires_grad, encoder.parameters()),
                                                 lr=encoder_lr)

            # Change batch size to 32
            train_loader = torch.utils.data.DataLoader(CaptionDataset(
                data_f,
                data_name,
                'TRAIN',
                transform=transforms.Compose([data_normalization])),
                                                       batch_size=32,
                                                       shuffle=True,
                                                       num_workers=workers,
                                                       pin_memory=True)

            val_loader = torch.utils.data.DataLoader(CaptionDataset(
                data_f,
                data_name,
                'VAL',
                transform=transforms.Compose([data_normalization])),
                                                     batch_size=32,
                                                     shuffle=True,
                                                     num_workers=workers,
                                                     pin_memory=True)

        if epochs_since_improvement > 0 and epochs_since_improvement % 8 == 0:
            print(
                'adjust lr after : epochs_since_improvement > 0 and epochs_since_improvement % 8 == 0'
            )
            adjust_learning_rate(decoder_optimizer, 0.8)

            if args.checkpoint is not None:
                adjust_learning_rate(encoder_optimizer, 0.8)
            elif args.fine_tune_encoder and epoch > args.fine_tune_epochs:
                print(
                    '------------------------------------epoch: {} fine tune lr encoder'
                    .format(epoch))
                adjust_learning_rate(encoder_optimizer, 0.8)

        print('----- Start train: epoch {} -----'.format(epoch))
        # One epoch's training
        train(train_loader=train_loader,
              encoder=encoder,
              decoder=decoder,
              criterion=criterion,
              encoder_optimizer=encoder_optimizer,
              decoder_optimizer=decoder_optimizer,
              epoch=epoch)

        print('----- Start validation: epoch {} -----'.format(epoch))
        # One epoch's validation
        recent_bleu4 = validate(val_loader=val_loader,
                                encoder=encoder,
                                decoder=decoder,
                                criterion=criterion,
                                rev_word_map=rev_word_map)

        print('recent bleu-4: {}'.format(recent_bleu4))
        print('----- Start validation without teacher forcing: epoch {} -----'
              .format(epoch))
        caption_image_beam_search(encoder, decoder, val_loader_for_val,
                                  word_map, rev_word_map)
        print('Done with train, val, and val without teacher forcing for '
              'epoch {}'.format(epoch))

        # Check if there was an improvement
        is_best = recent_bleu4 > best_bleu4
        best_bleu4 = max(recent_bleu4, best_bleu4)
        if not is_best:
            epochs_since_improvement += 1
            print("\nEpochs since last improvement: %d\n" %
                  (epochs_since_improvement, ))
        else:
            epochs_since_improvement = 0

        # Save checkpoint
        save_checkpoint(data_name, epoch, epochs_since_improvement, encoder,
                        decoder, encoder_optimizer, decoder_optimizer,
                        recent_bleu4, is_best, args.runname)
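

# Note: adjust_learning_rate and save_checkpoint are assumed helper utilities
# that are not shown in this listing. A minimal sketch of the LR-shrinking
# helper, inferred from how it is called above (an assumption, not the
# original code):
def adjust_learning_rate(optimizer, shrink_factor):
    # Scale every parameter group's learning rate by shrink_factor.
    for param_group in optimizer.param_groups:
        param_group['lr'] = param_group['lr'] * shrink_factor
    print("New learning rate: {}".format(optimizer.param_groups[0]['lr']))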
Example #26
    def wandb_init(self, model):
        if not self._init:
            self._init = True
            wandb.init(project="videowalk", group="release", config=self.args)
            wandb.watch(model)
Example #27
def training_ms(model, config, train_loader, val_loader):
    wandb.watch(model, log="all")
    train_iter = 0
    best_accuracy = 0
    for epoch in range(config.epochs):
        epoch_loss_rgb = 0
        epoch_loss_ms = 0
        epoch_loss_ordering = 0
        num_corrects_rgb = 0
        num_corrects_ms = 0
        trainSamples = 0
        map_pixel_samples = 0
        iterPerEpoch = 0
        model.lstm_cell.train(True)
        model.classifier.train(True)
        model.resNet.layer4[0].conv1.train(True)
        model.resNet.layer4[0].conv2.train(True)
        model.resNet.layer4[1].conv1.train(True)
        model.resNet.layer4[1].conv2.train(True)
        model.resNet.layer4[2].conv1.train(True)
        model.resNet.layer4[2].conv2.train(True)
        model.resNet.fc.train(True)
        model.ms_conv.train(True)
        model.ms_classifier.train(True)
        # display_ms = True
        for inputs_rgb, map_labels, labels in train_loader:
            num_samples = inputs_rgb.size(0)
            trainSamples += num_samples
            train_iter += 1
            iterPerEpoch += 1
            optimizer_fn.zero_grad()
            inputs_rgb = inputs_rgb.permute(1, 0, 2, 3,
                                            4).to(config.device)  # but why?
            labels = labels.to(config.device)
            # map_labels = map_labels.to(config.device)
            map_labels = map_labels.to(config.device).permute(
                0, 2, 1, 3, 4).squeeze()  # BSxseq_lenx7x7
            output_label, _, output_map, order_labels, order_feats = model(
                inputs_rgb)  # output_map is BSx2
            # if display_ms:
            #     display_map_prediction(map_labels[0].clone(), output_map[0].clone(), loss_fn_ms, config.ms_task)
            #     display_ms = False
            map_pixel_samples += num_samples * config.seq_len * 7 * 7
            loss_rgb = loss_fn_rgb(output_label, labels)
            loss_ms = loss_fn_ms(output_map.squeeze(), map_labels)
            loss_ordering = loss_fn_ordering(order_feats,
                                             order_labels.to(config.device))
            # loss_ratio = loss_rgb.item()/loss_ms.item()
            loss = loss_rgb + loss_ms + loss_ordering
            loss.backward()
            optimizer_fn.step()

            _, predicted_rgb = torch.max(output_label.data, 1)
            epoch_loss_rgb += loss_rgb.item()
            predicted_rgb = predicted_rgb.to(config.device)
            num_corrects_rgb += torch.sum(predicted_rgb == labels).data.item()

            if config.ms_task == "classifier":
                _, predicted_ms = torch.max(output_map.data, 1)
                predicted_ms = predicted_ms.to(config.device)
                num_corrects_ms += torch.sum(
                    predicted_ms == map_labels).data.item()
            epoch_loss_ms += loss_ms.item()
            epoch_loss_ordering += loss_ordering.item()

        optim_scheduler.step()
        avg_loss_rgb = epoch_loss_rgb / iterPerEpoch
        train_accuracy_rgb = (num_corrects_rgb / trainSamples)
        avg_loss_ms = epoch_loss_ms / iterPerEpoch
        avg_loss_ordering = epoch_loss_ordering / iterPerEpoch
        if config.ms_task == "classifier":
            train_accuracy_ms = (num_corrects_ms / map_pixel_samples)

        print('Train: Epoch = {}/{} | Loss = {} | Accuracy = {}'.format(
            epoch + 1, config.epochs, avg_loss_rgb, train_accuracy_rgb))

        max_loss = 6
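        # max_loss clips the values reported to wandb so occasional loss
        # spikes do not blow up the chart scale.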
        avg_loss_normalized_rgb = avg_loss_rgb if avg_loss_rgb < max_loss else max_loss
        avg_loss_normalized_ms = avg_loss_ms if avg_loss_ms < max_loss else max_loss
        if config.ms_task == "classifier":
            wandb.log({
                "train_loss_rgb": avg_loss_normalized_rgb,
                "train_loss_ms": avg_loss_normalized_ms,
                "train_accuracy_rgb": train_accuracy_rgb,
                "train_accuracy_ms": train_accuracy_ms,
                "avg_loss_ordering": avg_loss_ordering,
                "eopch": (epoch + 1)
            })
        else:
            wandb.log({
                "train_loss_rgb": avg_loss_normalized_rgb,
                "train_loss_ms": avg_loss_normalized_ms,
                "train_accuracy_rgb": train_accuracy_rgb,
                "avg_loss_ordering": avg_loss_ordering,
                "eopch": (epoch + 1)
            })

        if (epoch + 1) % config.val_frequency == 0:
            with torch.no_grad():
                model.eval()
                val_loss_epoch_rgb = 0
                val_loss_epoch_ms = 0
                val_iter = 0
                val_samples = 0
                num_corrects_rgb = 0
                num_corrects_ms = 0
                map_pixel_samples = 0
                val_loss_epoch_ordering = 0
                for inputs_rgb, map_labels, labels in val_loader:
                    val_iter += 1
                    num_samples = inputs_rgb.size(0)
                    val_samples += num_samples
                    inputs_rgb = inputs_rgb.permute(1, 0, 2, 3,
                                                    4).to(config.device)
                    labels = labels.to(config.device)
                    map_labels = map_labels.to(config.device).view(
                        num_samples, config.seq_len, 7, 7)
                    output_label, _, output_map, order_labels, order_feats = model(
                        inputs_rgb)
                    map_pixel_samples += num_samples * config.seq_len * 7 * 7
                    val_loss_rgb = loss_fn_rgb(output_label, labels)
                    val_loss_ms = loss_fn_ms(output_map.squeeze(), map_labels)
                    loss_ordering = loss_fn_ordering(
                        order_feats, order_labels.to(config.device))
                    val_loss_epoch_rgb += val_loss_rgb.item()
                    val_loss_epoch_ms += val_loss_ms.item()
                    val_loss_epoch_ordering += loss_ordering.item()
                    _, predicted_rgb = torch.max(output_label.data, 1)
                    num_corrects_rgb += torch.sum(
                        predicted_rgb == labels).data.item()
                    if config.ms_task == "classifier":
                        _, predicted_ms = torch.max(output_map.data, 1)
                        num_corrects_ms += torch.sum(
                            predicted_ms == map_labels).data.item()
            val_accuracy_rgb = (num_corrects_rgb / val_samples)
            avg_val_loss_rgb = val_loss_epoch_rgb / val_iter
            avg_val_loss_ms = val_loss_epoch_ms / val_iter
            avg_val_loss_ordering = val_loss_epoch_ordering / iterPerEpoch

            if config.ms_task == "classifier":
                val_accuracy_ms = (num_corrects_ms / map_pixel_samples)
            print('*****  Val: Epoch = {} | Loss {} | Accuracy = {} *****'.
                  format(epoch + 1, avg_val_loss_rgb, val_accuracy_rgb))

            avg_val_loss_normalized_rgb = avg_val_loss_rgb if avg_val_loss_rgb < max_loss else max_loss
            avg_val_loss_normalized_ms = avg_val_loss_ms if avg_val_loss_ms < max_loss else max_loss
            if config.ms_task == "classifier":
                wandb.log({
                    "valid_loss_rgb": avg_val_loss_normalized_rgb,
                    "valid_loss_ms": avg_val_loss_normalized_ms,
                    "valid_accuracy_rgb": val_accuracy_rgb,
                    "valid_accuracy_ms": val_accuracy_ms,
                    "avg_val_loss_ordering": avg_val_loss_ordering,
                    "eopch": (epoch + 1)
                })
            else:
                wandb.log({
                    "valid_loss_rgb": avg_val_loss_normalized_rgb,
                    "valid_loss_ms": avg_val_loss_normalized_ms,
                    "valid_accuracy_rgb": val_accuracy_rgb,
                    "avg_val_loss_ordering": avg_val_loss_ordering,
                    "eopch": (epoch + 1)
                })

            if val_accuracy_rgb > best_accuracy:
                save_path_model = (config.models_dir +
                                   '/best_model_ms_state_dict.pth')
                torch.save(model.state_dict(), save_path_model)
                best_accuracy = val_accuracy_rgb
                wandb.run.summary["best_valid_accuracy"] = best_accuracy
        else:
            if (epoch + 1) % 10 == 0:
                save_path_model = (config.models_dir +
                                   '/best_model_ms_state_dict' +
                                   str(epoch + 1) + '.pth')
                # torch.save(model.state_dict(), save_path_model)
    wandb.run.summary["best_valid_accuracy"] = best_accuracy
    return
Example #28
def main():
    global best_prec1, evaluate, args
    wandb.init(project='RC2020-att_sweep', name = args.name, group = args.group)
    
    # Map args.version to an attention type; anything else falls back to
    # Triplet attention.
    att_types = {0: 'Vanilla', 1: 'GCT', 2: 'Strip Pool', 3: 'ECA'}
    att = att_types.get(args.version, 'Triplet')
    if args.arch == 0:
        model = resnet18(att=att, num_classes=10)
    elif args.arch == 1:
        model = resnet34(att=att, num_classes=10)
    else:
        model = resnet50(att=att, num_classes=10)
    wandb.watch(model)
    model = model.cuda()

    print('Number of model parameters: {}'.format(
        sum([p.data.nelement() for p in model.parameters()])))
    wandb.config.update({'Parameters':sum([p.data.nelement() for p in model.parameters()]), 'Batch_Size':128})

    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_loader = torch.utils.data.DataLoader(
        datasets.CIFAR10(root='./data', train=True, transform=transforms.Compose([
            transforms.RandomHorizontalFlip(),
            transforms.RandomCrop(32, 4),
            transforms.ToTensor(),
            normalize,
        ]), download=True),
        batch_size=128, shuffle=True,
        num_workers=4, pin_memory=True)

    val_loader = torch.utils.data.DataLoader(
        datasets.CIFAR10(root='./data', train=False, transform=transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=128, shuffle=False,
        num_workers=4, pin_memory=True)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()

    optimizer = torch.optim.SGD(model.parameters(), 0.1,
                                momentum=0.9,
                                weight_decay=5e-4)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                        milestones=[100, 150],
                                                        last_epoch=-1)
    max_epoch = 50
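    # Note: with max_epoch = 50, the MultiStepLR milestones at 100 and 150 are
    # never reached, so the learning rate stays at 0.1 for the whole run.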

    


    for epoch in range(0, max_epoch):
        
        print('current lr {:.5e}'.format(optimizer.param_groups[0]['lr']))
        wandb.log({'lr': optimizer.param_groups[0]['lr']})

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)

        lr_scheduler.step()

        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion, epoch)

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)

        if epoch > 0 and epoch % 20 == 0:
            save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
            }, is_best, filename=os.path.join('./', 'vanilla_checkpoint.th'))

        save_checkpoint({
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
        }, is_best, filename=os.path.join('./', 'vanilla_model.th'))

    wandb.run.finish()
Example #29
def train_cycle(use_wandb=True):
    print("%s: Training the model" % (time.strftime("%Y/%m/%d-%H:%M:%S")))

    # n_iters = 100000
    # print_every = 5000
    # plot_every = 1000
    print_every = 50
    plot_every = 500
    embedding_size = 32
    num_epochs = 30
    margin = 0.05
    train_size = None
    evaluate_size = 100
    save_path = './unif_model.ckpt'

    # Keep track of losses for plotting
    current_print_loss = 0
    current_plot_loss = 0
    all_losses = []

    start = time.time()
    code_snippets_file = './data/parallel_bodies_n1000'
    descriptions_file = './data/parallel_desc_n1000'
    dataset = CodeDescDataset(code_snippets_file, descriptions_file,
                              train_size)
    num_iters = len(dataset)
    # model = UNIF(dataset.code_vocab_size, dataset.desc_vocab_size, embedding_size)
    model = UNIFNoAttention(dataset.code_vocab_size, dataset.desc_vocab_size,
                            embedding_size)
    cosine_similarity_function = nn.CosineSimilarity()

    loss_function = nn.CosineEmbeddingLoss(margin=margin)
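    # CosineEmbeddingLoss drives the cosine similarity of positive pairs
    # toward 1 and pushes negative pairs below the margin.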
    learning_rate = 0.05  # If you set this too high, it might explode. If too low, it might not learn
    optimiser = torch.optim.SGD(model.parameters(), lr=learning_rate)

    if use_wandb:
        wandb.init(project='code-search', name='unif-cosine-pos', reinit=True)
        config = wandb.config
        config.learning_rate = learning_rate
        config.embedding_size = embedding_size
        config.evaluate_size = evaluate_size
        config.margin = margin
        config.num_epochs = num_epochs
        config.train_size = len(dataset)
        wandb.watch(model, log_freq=plot_every)
        metrics = evaluate_top_n(model, evaluate_size)
        wandb.log(metrics)  # already inside the use_wandb branch

    for epoch in range(num_epochs):
        print('Epoch: ', epoch)

        for iter in range(num_iters):
            # print(iter)
            tokenized_code, tokenized_positive_desc, tokenized_negative_desc =\
                dataset[iter]
            code_embedding, desc_embedding, loss = train(
                model, loss_function, optimiser, tokenized_code,
                tokenized_positive_desc)
            current_print_loss += loss
            current_plot_loss += loss

            # Print iter number, loss, name and guess
            if (iter + 1) % print_every == 0:
                print('%d %d%% (%s) %.4f' %
                      (iter + 1, (iter + 1) / num_iters * 100,
                       timeSince(start), current_print_loss / print_every))
                cosine_similarity = cosine_similarity_function(
                    code_embedding, desc_embedding).item()
                print('Cosine similarity:', cosine_similarity)
                # print('Cosine similarity:', cosine_similarity, code_embedding, desc_embedding)
                current_print_loss = 0

            # Add current loss avg to list of losses
            if (iter + 1) % plot_every == 0:
                torch.save(model.state_dict(), save_path)
                metrics = evaluate_top_n(model, evaluate_size)
                metrics.update({'loss': current_plot_loss / plot_every})
                all_losses.append(current_plot_loss / plot_every)
                current_plot_loss = 0
                if use_wandb:
                    wandb.log(metrics)

    return model, current_print_loss, all_losses
Example #30
    def train(
        self,
        train_dataloader,
        output_dir,
        show_running_loss=True,
        eval_dataloader=None,
        verbose=True,
        **kwargs,
    ):
        """
        Trains the model on train_dataset.

        Utility function to be used by the train_model() method. Not intended to be used directly.
        """

        device = self.device
        model = self.model
        args = self.args

        tb_writer = SummaryWriter(logdir=args.tensorboard_dir)

        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs
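        # Total optimization steps = (batches per epoch // accumulation steps)
        # * number of epochs; used below to build the warmup schedule.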

        no_decay = ["bias", "LayerNorm.weight"]

        optimizer_grouped_parameters = []
        custom_parameter_names = set()
        for group in self.args.custom_parameter_groups:
            params = group.pop("params")
            custom_parameter_names.update(params)
            param_group = {**group}
            param_group["params"] = [
                p for n, p in model.named_parameters() if n in params
            ]
            optimizer_grouped_parameters.append(param_group)

        for group in self.args.custom_layer_parameters:
            layer_number = group.pop("layer")
            layer = f"layer.{layer_number}."
            group_d = {**group}
            group_nd = {**group}
            group_nd["weight_decay"] = 0.0
            params_d = []
            params_nd = []
            for n, p in model.named_parameters():
                if n not in custom_parameter_names and layer in n:
                    if any(nd in n for nd in no_decay):
                        params_nd.append(p)
                    else:
                        params_d.append(p)
                    custom_parameter_names.add(n)
            group_d["params"] = params_d
            group_nd["params"] = params_nd

            optimizer_grouped_parameters.append(group_d)
            optimizer_grouped_parameters.append(group_nd)

        if not self.args.train_custom_parameters_only:
            optimizer_grouped_parameters.extend([
                {
                    "params": [
                        p for n, p in model.named_parameters()
                        if n not in custom_parameter_names and not any(
                            nd in n for nd in no_decay)
                    ],
                    "weight_decay":
                    args.weight_decay,
                },
                {
                    "params": [
                        p for n, p in model.named_parameters()
                        if n not in custom_parameter_names and any(
                            nd in n for nd in no_decay)
                    ],
                    "weight_decay":
                    0.0,
                },
            ])

        warmup_steps = math.ceil(t_total * args.warmup_ratio)
        args.warmup_steps = warmup_steps if args.warmup_steps == 0 else args.warmup_steps

        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=args.warmup_steps,
            num_training_steps=t_total)

        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        global_step = 0
        tr_loss, logging_loss = 0.0, 0.0
        model.zero_grad()
        train_iterator = trange(int(args.num_train_epochs),
                                desc="Epoch",
                                disable=args.silent)
        epoch_number = 0
        best_eval_metric = None
        early_stopping_counter = 0

        if args.evaluate_during_training:
            training_progress_scores = self._create_training_progress_scores(
                **kwargs)

        if args.wandb_project:
            wandb.init(project=args.wandb_project,
                       config={**asdict(args)},
                       **args.wandb_kwargs)
            wandb.watch(self.model)

        if args.fp16:
            from torch.cuda import amp

            scaler = amp.GradScaler()
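            # GradScaler scales the fp16 loss to avoid gradient underflow;
            # gradients are unscaled again before clipping further down.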

        model.train()
        for _ in train_iterator:
            train_iterator.set_description(
                f"Epoch {epoch_number + 1} of {args.num_train_epochs}")
            batch_iterator = tqdm(
                train_dataloader,
                desc=f"Running Epoch {epoch_number} of {args.num_train_epochs}",
                disable=args.silent,
                mininterval=0,
            )
            for step, batch in enumerate(batch_iterator):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch

                if args.fp16:
                    with amp.autocast():
                        (lm_loss), (mc_loss), *_ = model(
                            input_ids,
                            token_type_ids=token_type_ids,
                            mc_token_ids=mc_token_ids,
                            mc_labels=mc_labels,
                            lm_labels=lm_labels,
                        )
                        # model outputs are always tuple in pytorch-transformers (see doc)
                        loss = lm_loss * args.lm_coef + mc_loss * args.mc_coef
                else:
                    (lm_loss), (mc_loss), *_ = model(
                        input_ids,
                        token_type_ids=token_type_ids,
                        mc_token_ids=mc_token_ids,
                        mc_labels=mc_labels,
                        lm_labels=lm_labels,
                    )
                    # model outputs are always tuple in pytorch-transformers (see doc)
                    loss = lm_loss * args.lm_coef + mc_loss * args.mc_coef

                if args.n_gpu > 1:
                    loss = loss.mean(
                    )  # mean() to average on multi-gpu parallel training

                current_loss = loss.item()

                if show_running_loss:
                    print("\rRunning loss: %f" % current_loss, end="")

                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    scaler.scale(loss).backward()
                else:
                    loss.backward()

                tr_loss += loss.item()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        scaler.unscale_(optimizer)
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                    if args.fp16:
                        scaler.step(optimizer)
                        scaler.update()
                    else:
                        optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

                    if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                        # Log metrics
                        tb_writer.add_scalar("lr",
                                             scheduler.get_lr()[0],
                                             global_step)
                        tb_writer.add_scalar("loss", (tr_loss - logging_loss) /
                                             args.logging_steps, global_step)
                        logging_loss = tr_loss
                        if args.wandb_project:
                            wandb.log({
                                "Training loss": current_loss,
                                "lr": scheduler.get_lr()[0],
                                "global_step": global_step,
                            })

                    if args.save_steps > 0 and global_step % args.save_steps == 0:
                        # Save model checkpoint
                        output_dir_current = os.path.join(
                            output_dir, "checkpoint-{}".format(global_step))

                        self.save_model(output_dir_current, model=model)

                    if args.evaluate_during_training and (
                            args.evaluate_during_training_steps > 0
                            and global_step %
                            args.evaluate_during_training_steps == 0):
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results, _, _ = self.eval_model(
                            eval_dataloader,
                            verbose=verbose
                            and args.evaluate_during_training_verbose,
                            silent=args.evaluate_during_training_silent,
                            **kwargs,
                        )
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value,
                                                 global_step)

                        output_dir_current = os.path.join(
                            output_dir, "checkpoint-{}".format(global_step))

                        if args.save_eval_checkpoints:
                            self.save_model(output_dir_current,
                                            model=model,
                                            results=results)

                        training_progress_scores["global_step"].append(
                            global_step)
                        training_progress_scores["train_loss"].append(
                            current_loss)
                        for key in results:
                            training_progress_scores[key].append(results[key])
                        report = pd.DataFrame(training_progress_scores)
                        report.to_csv(
                            os.path.join(args.output_dir,
                                         "training_progress_scores.csv"),
                            index=False,
                        )

                        if args.wandb_project:
                            wandb.log(
                                self._get_last_metrics(
                                    training_progress_scores))

                        if not best_eval_metric:
                            best_eval_metric = results[
                                args.early_stopping_metric]
                            self.save_model(args.best_model_dir,
                                            model=model,
                                            results=results)
                        if best_eval_metric and args.early_stopping_metric_minimize:
                            if results[
                                    args.
                                    early_stopping_metric] - best_eval_metric < args.early_stopping_delta:
                                best_eval_metric = results[
                                    args.early_stopping_metric]
                                self.save_model(args.best_model_dir,
                                                model=model,
                                                results=results)
                                early_stopping_counter = 0
                            else:
                                if args.use_early_stopping:
                                    if early_stopping_counter < args.early_stopping_patience:
                                        early_stopping_counter += 1
                                        if verbose:
                                            logger.info(
                                                f" No improvement in {args.early_stopping_metric}"
                                            )
                                            logger.info(
                                                f" Current step: {early_stopping_counter}"
                                            )
                                            logger.info(
                                                f" Early stopping patience: {args.early_stopping_patience}"
                                            )
                                    else:
                                        if verbose:
                                            logger.info(
                                                f" Patience of {args.early_stopping_patience} steps reached"
                                            )
                                            logger.info(
                                                " Training terminated.")
                                            train_iterator.close()
                                        return global_step, tr_loss / global_step
                        else:
                            if results[
                                    args.
                                    early_stopping_metric] - best_eval_metric > args.early_stopping_delta:
                                best_eval_metric = results[
                                    args.early_stopping_metric]
                                self.save_model(args.best_model_dir,
                                                model=model,
                                                results=results)
                                early_stopping_counter = 0
                            else:
                                if args.use_early_stopping:
                                    if early_stopping_counter < args.early_stopping_patience:
                                        early_stopping_counter += 1
                                        if verbose:
                                            logger.info(
                                                f" No improvement in {args.early_stopping_metric}"
                                            )
                                            logger.info(
                                                f" Current step: {early_stopping_counter}"
                                            )
                                            logger.info(
                                                f" Early stopping patience: {args.early_stopping_patience}"
                                            )
                                    else:
                                        if verbose:
                                            logger.info(
                                                f" Patience of {args.early_stopping_patience} steps reached"
                                            )
                                            logger.info(
                                                " Training terminated.")
                                            train_iterator.close()
                                        return global_step, tr_loss / global_step

            epoch_number += 1
            output_dir_current = os.path.join(
                output_dir,
                "checkpoint-{}-epoch-{}".format(global_step, epoch_number))

            if args.save_model_every_epoch or args.evaluate_during_training:
                os.makedirs(output_dir_current, exist_ok=True)

            if args.save_model_every_epoch:
                self.save_model(output_dir_current, model=model)

            if args.evaluate_during_training:
                results, _, _ = self.eval_model(
                    eval_dataloader,
                    verbose=verbose and args.evaluate_during_training_verbose,
                    silent=True,
                    **kwargs,
                )

                self.save_model(output_dir_current, results=results)

                training_progress_scores["global_step"].append(global_step)
                training_progress_scores["train_loss"].append(current_loss)
                for key in results:
                    training_progress_scores[key].append(results[key])
                report = pd.DataFrame(training_progress_scores)
                report.to_csv(os.path.join(args.output_dir,
                                           "training_progress_scores.csv"),
                              index=False)

                if args.wandb_project:
                    wandb.log(self._get_last_metrics(training_progress_scores))

                if not best_eval_metric:
                    best_eval_metric = results[args.early_stopping_metric]
                    self.save_model(args.best_model_dir,
                                    model=model,
                                    results=results)
                if best_eval_metric and args.early_stopping_metric_minimize:
                    if results[
                            args.
                            early_stopping_metric] - best_eval_metric < args.early_stopping_delta:
                        best_eval_metric = results[args.early_stopping_metric]
                        self.save_model(args.best_model_dir,
                                        model=model,
                                        results=results)
                        early_stopping_counter = 0
                else:
                    if results[
                            args.
                            early_stopping_metric] - best_eval_metric > args.early_stopping_delta:
                        best_eval_metric = results[args.early_stopping_metric]
                        self.save_model(args.best_model_dir,
                                        model=model,
                                        results=results)
                        early_stopping_counter = 0

        return global_step, tr_loss / global_step