Example #1
0
 def test_write_scalar(self):
     """Write a decreasing learning-rate scalar for ten consecutive steps.

     The writer is closed in a ``finally`` clause so that its resources are
     released even when one of the ``add_scalar`` calls raises.
     """
     summary_writer = SummaryWriter(self._log_dir)
     try:
         tag_name = "learning_rate"
         learning_rate = torch.tensor(.01)
         for i in range(10):
             summary_writer.add_scalar(tag_name, learning_rate, i)
             # In-place update: the same tensor object is logged on each step.
             learning_rate -= 0.005
     finally:
         summary_writer.close()
Example #2
0
    def train(self, load_model=False, model_path=None):
        """Train ``self.sumbt_model`` and checkpoint the best validation epoch.

        Builds tensor datasets from ``self.train_examples`` and
        ``self.dev_examples``, initialises the model's slot/value lookup, then
        runs one training pass and one validation pass per epoch. The model's
        ``state_dict`` is saved whenever the (rounded) validation loss
        improves, and the loop stops once ``args.patience`` epochs have passed
        since the last improvement.

        Hyper-parameters are read from the module-level ``args``; ``DEVICE``,
        ``USE_CUDA``, ``N_GPU`` and ``SUMBT_PATH`` are read from module scope
        as well.

        Args:
            load_model: When true and ``model_path`` is given, weights are
                restored through ``self.load_weights`` before training.
            model_path: Path handed to ``self.load_weights``; ignored unless
                ``load_model`` is true.
        """
        if load_model and model_path is not None:
            self.load_weights(model_path)

        ## Training utterances
        all_input_ids, all_input_len, all_label_ids = convert_examples_to_features(
            self.train_examples, self.label_list, args.max_seq_length,
            self.tokenizer, args.max_turn_length)

        num_train_batches = all_input_ids.size(0)
        num_train_steps = int(num_train_batches / args.train_batch_size /
                              args.gradient_accumulation_steps *
                              args.num_train_epochs)

        logger.info("***** training *****")
        logger.info("  Num examples = %d", len(self.train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        all_input_ids, all_input_len, all_label_ids = all_input_ids.to(
            DEVICE), all_input_len.to(DEVICE), all_label_ids.to(DEVICE)

        train_data = TensorDataset(all_input_ids, all_input_len, all_label_ids)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        ## Validation utterances
        all_input_ids_dev, all_input_len_dev, all_label_ids_dev = convert_examples_to_features(
            self.dev_examples, self.label_list, args.max_seq_length,
            self.tokenizer, args.max_turn_length)

        logger.info("***** validation *****")
        logger.info("  Num examples = %d", len(self.dev_examples))
        logger.info("  Batch size = %d", args.dev_batch_size)

        all_input_ids_dev, all_input_len_dev, all_label_ids_dev = \
            all_input_ids_dev.to(DEVICE), all_input_len_dev.to(DEVICE), all_label_ids_dev.to(DEVICE)

        dev_data = TensorDataset(all_input_ids_dev, all_input_len_dev,
                                 all_label_ids_dev)
        dev_sampler = SequentialSampler(dev_data)
        dev_dataloader = DataLoader(dev_data,
                                    sampler=dev_sampler,
                                    batch_size=args.dev_batch_size)

        logger.info("Loaded data!")

        if args.fp16:
            self.sumbt_model.half()
        self.sumbt_model.to(DEVICE)

        ## Get domain-slot-type embeddings
        slot_token_ids, slot_len = \
            get_label_embedding(self.processor.target_slot, args.max_label_length, self.tokenizer, DEVICE)

        ## Get slot-value embeddings
        label_token_ids, label_len = [], []
        # zip() keeps the original pairing, so only as many label lists are
        # embedded as there are slot entries.
        for _, labels in zip(slot_token_ids, self.label_list):
            token_ids, lens = get_label_embedding(labels,
                                                  args.max_label_length,
                                                  self.tokenizer, DEVICE)
            label_token_ids.append(token_ids)
            label_len.append(lens)

        logger.info('embeddings prepared')

        # With several GPUs the model is reached through ``.module``.
        if USE_CUDA and N_GPU > 1:
            self.sumbt_model.module.initialize_slot_value_lookup(
                label_token_ids, slot_token_ids)
        else:
            self.sumbt_model.initialize_slot_value_lookup(
                label_token_ids, slot_token_ids)

        def get_optimizer_grouped_parameters(model):
            """Split trainable parameters into decayed and non-decayed groups."""
            param_optimizer = [(n, p) for n, p in model.named_parameters()
                               if p.requires_grad]
            no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
            optimizer_grouped_parameters = [
                {
                    'params': [
                        p for n, p in param_optimizer
                        if not any(nd in n for nd in no_decay)
                    ],
                    'weight_decay':
                    0.01,
                    'lr':
                    args.learning_rate
                },
                {
                    'params': [
                        p for n, p in param_optimizer
                        if any(nd in n for nd in no_decay)
                    ],
                    'weight_decay':
                    0.0,
                    'lr':
                    args.learning_rate
                },
            ]
            return optimizer_grouped_parameters

        if not USE_CUDA or N_GPU == 1:
            optimizer_grouped_parameters = get_optimizer_grouped_parameters(
                self.sumbt_model)
        else:
            optimizer_grouped_parameters = get_optimizer_grouped_parameters(
                self.sumbt_model.module)

        # clip_grad_norm_ needs the parameter tensors themselves, not the
        # optimizer's list of group dicts, so flatten the groups once here.
        clip_params = [
            p for group in optimizer_grouped_parameters
            for p in group['params']
        ]

        t_total = num_train_steps

        scheduler = None
        if args.fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )

            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=args.learning_rate,
                                  bias_correction=False,
                                  max_grad_norm=1.0)
            if args.fp16_loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(
                    optimizer, static_loss_scale=args.fp16_loss_scale)

        else:
            optimizer = AdamW(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              correct_bias=False)
            scheduler = get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=args.warmup_proportion * t_total,
                num_training_steps=t_total)
        logger.info(optimizer)

        # Training code
        ###############################################################################

        logger.info("Training...")

        global_step = 0
        last_update = None
        best_loss = None
        model = self.sumbt_model
        # Follow the flag as it is named: a writer is created unless
        # TensorBoard logging has been switched off.
        if args.do_not_use_tensorboard:
            summary_writer = None
        else:
            summary_writer = SummaryWriter("./tensorboard_summary/logs_1214/")

        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            # Train
            model.train()
            tr_loss = 0
            nb_tr_examples = 0
            nb_tr_steps = 0

            for step, batch in enumerate(tqdm(train_dataloader)):
                batch = tuple(t.to(DEVICE) for t in batch)
                input_ids, input_len, label_ids = batch

                # Forward
                if N_GPU == 1:
                    loss, loss_slot, acc, acc_slot, _ = model(
                        input_ids, input_len, label_ids, N_GPU)
                else:
                    loss, _, acc, acc_slot, _ = model(input_ids, input_len,
                                                      label_ids, N_GPU)

                    # average to multi-gpus
                    loss = loss.mean()
                    acc = acc.mean()
                    acc_slot = acc_slot.mean(0)

                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                # Backward
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                # tensorboard logging
                if summary_writer is not None:
                    summary_writer.add_scalar("Epoch", epoch, global_step)
                    summary_writer.add_scalar("Train/Loss", loss, global_step)
                    summary_writer.add_scalar("Train/JointAcc", acc,
                                              global_step)
                    # Per-slot losses are only unpacked on the single-GPU path.
                    if N_GPU == 1:
                        for i, slot in enumerate(self.processor.target_slot):
                            summary_writer.add_scalar(
                                "Train/Loss_%s" % slot.replace(' ', '_'),
                                loss_slot[i], global_step)
                            summary_writer.add_scalar(
                                "Train/Acc_%s" % slot.replace(' ', '_'),
                                acc_slot[i], global_step)

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # modify learning rate with special warm up BERT uses
                    lr_this_step = args.learning_rate * warmup_linear(
                        global_step / t_total, args.warmup_proportion)
                    if summary_writer is not None:
                        summary_writer.add_scalar("Train/LearningRate",
                                                  lr_this_step, global_step)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                    # Clipping is applied on the non-fp16 path only (the one
                    # that owns a scheduler).
                    if scheduler is not None:
                        torch.nn.utils.clip_grad_norm_(clip_params, 1.0)
                    optimizer.step()
                    if scheduler is not None:
                        scheduler.step()
                    optimizer.zero_grad()
                    global_step += 1

            # Perform evaluation on validation dataset
            model.eval()
            dev_loss = 0
            dev_acc = 0
            dev_loss_slot, dev_acc_slot = None, None
            nb_dev_examples, nb_dev_steps = 0, 0

            for step, batch in enumerate(
                    tqdm(dev_dataloader, desc="Validation")):
                batch = tuple(t.to(DEVICE) for t in batch)
                input_ids, input_len, label_ids = batch
                # Add a leading axis to 2-D batches before the model call.
                if input_ids.dim() == 2:
                    input_ids = input_ids.unsqueeze(0)
                    input_len = input_len.unsqueeze(0)
                    label_ids = label_ids.unsqueeze(0)

                with torch.no_grad():
                    if N_GPU == 1:
                        loss, loss_slot, acc, acc_slot, _ = model(
                            input_ids, input_len, label_ids, N_GPU)
                    else:
                        loss, _, acc, acc_slot, _ = model(
                            input_ids, input_len, label_ids, N_GPU)

                        # average to multi-gpus
                        loss = loss.mean()
                        acc = acc.mean()
                        acc_slot = acc_slot.mean(0)

                # Count entries whose first label is > -1 and weight this
                # batch's metrics by that count.
                num_valid_turn = torch.sum(label_ids[:, :, 0].view(-1) > -1,
                                           0).item()
                dev_loss += loss.item() * num_valid_turn
                dev_acc += acc.item() * num_valid_turn

                if N_GPU == 1:
                    if dev_loss_slot is None:
                        dev_loss_slot = [l * num_valid_turn for l in loss_slot]
                        dev_acc_slot = acc_slot * num_valid_turn
                    else:
                        for i, l in enumerate(loss_slot):
                            dev_loss_slot[
                                i] = dev_loss_slot[i] + l * num_valid_turn
                        dev_acc_slot += acc_slot * num_valid_turn

                nb_dev_examples += num_valid_turn

            dev_loss = dev_loss / nb_dev_examples
            dev_acc = dev_acc / nb_dev_examples

            if N_GPU == 1:
                dev_acc_slot = dev_acc_slot / nb_dev_examples

            # tensorboard logging
            if summary_writer is not None:
                summary_writer.add_scalar("Validate/Loss", dev_loss,
                                          global_step)
                summary_writer.add_scalar("Validate/Acc", dev_acc, global_step)
                if N_GPU == 1:
                    for i, slot in enumerate(self.processor.target_slot):
                        summary_writer.add_scalar(
                            "Validate/Loss_%s" % slot.replace(' ', '_'),
                            dev_loss_slot[i] / nb_dev_examples, global_step)
                        summary_writer.add_scalar(
                            "Validate/Acc_%s" % slot.replace(' ', '_'),
                            dev_acc_slot[i], global_step)

            dev_loss = round(dev_loss, 6)

            output_model_file = os.path.join(
                os.path.join(SUMBT_PATH, args.output_dir), "pytorch_model.bin")

            # The first epoch always counts as an improvement.
            if last_update is None or dev_loss < best_loss:

                if not USE_CUDA or N_GPU == 1:
                    torch.save(model.state_dict(), output_model_file)
                else:
                    torch.save(model.module.state_dict(), output_model_file)

                last_update = epoch
                best_loss = dev_loss
                best_acc = dev_acc

                logger.info(
                    "*** Model Updated: Epoch=%d, Validation Loss=%.6f, Validation Acc=%.6f, global_step=%d ***"
                    % (last_update, best_loss, best_acc, global_step))
            else:
                logger.info(
                    "*** Model NOT Updated: Epoch=%d, Validation Loss=%.6f, Validation Acc=%.6f, global_step=%d  ***"
                    % (epoch, dev_loss, dev_acc, global_step))

            # Early stopping: give up after `patience` epochs without a new best.
            if last_update + args.patience <= epoch:
                break

        # Release the event writer once training has finished.
        if summary_writer is not None:
            summary_writer.close()
def train(agents, params, num_processes):
    """Run the episode loop for the given agents and log progress.

    Args:
        agents: Agent container. This loop calls its ``act``, ``step``,
            ``get_lr``, ``step_lr``, ``save_agent`` and ``load_agent``
            methods and reads ``agents.memory.memory``.
        params (dict): Run configuration. Keys read here: ``episodes``,
            ``maxlen``, ``agent_params``, ``brain_name``, ``environment``,
            ``pretrain``, ``pretrain_length``, ``log_dir``, ``load_agent``,
            ``hack_rewards``, ``alternative_reward_scalar``,
            ``random_fill_every``, ``save_every``, ``achievement`` and
            ``achievement_length``.
        num_processes (int): Number of agents; sizes the per-episode score
            vector.

    Returns:
        numpy.ndarray: Per-agent scores of the last episode that was run
        (all zeros when no episode ran).
    """
    n_episodes = params['episodes']
    maxlen = params['maxlen']
    name = params['agent_params']['name']
    brain_name = params['brain_name']
    env = params['environment']
    add_noise = params['agent_params']['add_noise']
    pretrain = params['pretrain']
    pretrain_length = params['pretrain_length']
    num_agents = num_processes
    scores = np.zeros(num_agents)  # per-agent scores of the current episode
    scores_window = deque(maxlen=maxlen)  # last N episode scores
    scores_episode = []
    writer = SummaryWriter(log_dir=params['log_dir'] + name)

    env_info = env.reset(train_mode=True)[brain_name]
    # Wall-clock start of the whole run; the per-episode timer is `tic`.
    train_start = time.time()
    timesteps = 0
    achievement_length = 0

    episode_start = 1
    if params['load_agent']:
        episode_start, timesteps = agents.load_agent()

    try:
        for i_episode in range(episode_start, n_episodes + 1):
            tic = time.time()
            # Start every episode from the observation the reset returns,
            # instead of the one left over from the previous step.
            env_info = env.reset(train_mode=True)[brain_name]
            states = env_info.vector_observations
            scores = np.zeros(num_agents)

            while True:
                states = torch.tensor(states)

                # Leave the random-action phase once the replay memory holds
                # more than `pretrain_length` entries.
                if pretrain and pretrain_length < len(agents.memory.memory):
                    pretrain = False

                actions, noise_epsilon = agents.act(states,
                                                    add_noise,
                                                    pretrain=pretrain)

                # send the action to the environment
                env_info = env.step(actions)[brain_name]
                next_states = env_info.vector_observations  # get the next state
                rewards = env_info.rewards  # get the reward
                dones = env_info.local_done  # see if episode has finished
                adjusted_rewards = np.array(env_info.rewards)

                if params['hack_rewards']:
                    # Copy a non-zero reward of one agent to the other one,
                    # scaled by `alternative_reward_scalar`.
                    if adjusted_rewards[0] != 0:
                        adjusted_rewards[1] = adjusted_rewards[0] * params[
                            'alternative_reward_scalar']
                    elif adjusted_rewards[1] != 0:
                        adjusted_rewards[0] = adjusted_rewards[1] * params[
                            'alternative_reward_scalar']

                actor_loss, critic_loss = agents.step(states,
                                                      actions,
                                                      adjusted_rewards,
                                                      next_states,
                                                      dones,
                                                      pretrain=pretrain)
                # Losses are only logged when both were returned.
                if actor_loss is not None and critic_loss is not None:

                    if params['agent_params']['schedule_lr']:
                        actor_lr, critic_lr = agents.get_lr()
                    else:
                        actor_lr = params['agent_params']['actor_params']['lr']
                        critic_lr = params['agent_params']['critic_params'][
                            'lr']

                    writer.add_scalar('noise_epsilon', noise_epsilon,
                                      timesteps)
                    writer.add_scalar('actor_loss', actor_loss, timesteps)
                    writer.add_scalar('critic_loss', critic_loss, timesteps)
                    writer.add_scalar('actor_lr', actor_lr, timesteps)
                    writer.add_scalar('critic_lr', critic_lr, timesteps)

                print('\rTimestep {}\tMax: {:.2f}'.format(
                    timesteps, np.max(scores)),
                      end="")

                scores += rewards  # update the scores
                states = next_states  # roll over the state to next time step
                if np.any(dones):  # exit loop if episode finished
                    break

                timesteps += 1

                # Fills the buffer with experiences resulting from random
                # actions to encourage exploration. `pretrain` stays a
                # boolean; the threshold is re-read from the configuration.
                if timesteps % params['random_fill_every'] == 0:
                    pretrain = True
                    pretrain_length = params['pretrain_length']

            score = np.mean(scores)
            scores_episode.append(score)
            scores_window.append(score)  # save most recent score

            print('\rEpisode {}\tMax: {:.2f} \t Time: {:.2f}'.format(
                i_episode, np.max(scores),
                time.time() - tic),
                  end="\n")

            # The agent is saved every episode; history only on the
            # configured interval.
            agents.save_agent(
                np.mean(scores_window),
                i_episode,
                timesteps,
                save_history=(i_episode % params['save_every'] == 0))

            writer.add_scalars('scores', {
                'mean': score,
                'min': np.min(scores),
                'max': np.max(scores)
            }, timesteps)

            update_csv(name, i_episode, score, score)

            agents.step_lr(score)

            if score > params['achievement']:
                achievement_length += 1
                if achievement_length > params['achievement_length']:
                    toc = time.time()
                    print(
                        "\n\n Congratulations! The agent has managed to solve the environment in {} episodes with {} training time\n\n"
                        .format(i_episode, toc - train_start))
                    return scores
            else:
                # The streak of above-threshold episodes is broken.
                achievement_length = 0
    finally:
        # Runs on normal exit, early return and on errors alike.
        writer.close()

    return scores
Example #4
0
# Remove any existing 'logs' path before a new writer is pointed at it.
sh.rm('-rf', 'logs')

import logging
# Send log records of level INFO and above to stdout.
logging.basicConfig(level=logging.INFO, stream=sys.stdout)

from tensorboardX.writer import SummaryWriter
# Module-wide event writer targeting the 'logs' directory.
swriter = SummaryWriter('logs')
# Keep a reference to the writer's own add_scalar so that a replacement
# installed on `swriter` can still call through to it.
add_scalar_old = swriter.add_scalar


def add_scalar_and_log(key, value, global_step=0):
    """Log a scalar as ``step:key: value`` and forward it to the saved writer method."""
    message = '{}:{}: {}'.format(global_step, key, value)
    logging.info(message)
    add_scalar_old(key, value, global_step)


# From here on, every ``swriter.add_scalar`` call goes through the wrapper.
swriter.add_scalar = add_scalar_and_log


def str2bool(x):
    """Return True only when ``x`` spells "true", ignoring letter case."""
    lowered = x.lower()
    return lowered == 'true'


def new_inception_conv2d_forward(self, x):
    """Apply ``self.conv`` then ``self.bn`` and finish with an out-of-place ReLU."""
    normalized = self.bn(self.conv(x))
    return F.relu(normalized, inplace=False)


# Monkey-patch: replace BasicConv2d.forward with the variant defined above,
# which calls F.relu with inplace=False.
tv.models.inception.BasicConv2d.forward = new_inception_conv2d_forward

import argparse
Example #5
0
class TensorBoard(Callback):
    """Callback that streams training results to a TensorBoard events folder.

    Every logged value goes through ``_handle_value`` first: strings and
    scalars are forwarded unchanged, while other iterables (for example a 1D
    ``np.ndarray``) are rendered as a quoted ``"[a, b, ...]"`` string.
    NOTE(review): string values are handed to ``add_scalar`` as they are;
    confirm that the writer in use accepts them.

    Example:

        ```python
        tensorboard_logger = TensorBoard('runs')
        model.fit(X_train, Y_train, callbacks=[tensorboard_logger])
        ```
    """

    def __init__(self,
                 logdir: Optional[str] = None,
                 update_freq: Union[str, int] = "epoch",
                 **kwargs) -> None:
        """
        Arguments:
            logdir: Save directory location, passed to `SummaryWriter`.
                Use a hierarchical folder structure to compare
                between runs easily, e.g. pass in 'runs/exp1', 'runs/exp2',
                etc. for each new experiment to compare across them.
            update_freq: `'batch'` or `'epoch'` or integer. `'batch'` is
                treated as `1`. With an integer `n`, batch-level values are
                written whenever `batch % n == 0`. With `'epoch'`, only the
                epoch-level values are written.
            **kwargs: Options to pass to `SummaryWriter` object
        """
        self.logdir = logdir
        self.writer = None
        # Sorted keys of the most recently written `logs` dict.
        self.keys = None
        self.update_freq = 1 if update_freq == "batch" else update_freq
        self._open_args = kwargs if kwargs else {}
        super(TensorBoard, self).__init__()

    @staticmethod
    def _handle_value(k):
        """Return `k` unchanged, or a quoted list string for non-string iterables."""
        is_zero_dim_ndarray = isinstance(k, np.ndarray) and k.ndim == 0
        if isinstance(k, six.string_types):
            return k
        if isinstance(k, tp.Iterable) and not is_zero_dim_ndarray:
            return '"[%s]"' % (", ".join(map(str, k)))
        return k

    def _write_row(self, step_name, step, logs, suffix=""):
        """Write the step counter and every entry of `logs` at `step`."""
        # Keys are recomputed on every call so that metrics which only show
        # up later (or only in epoch-level logs) are not dropped.
        self.keys = sorted(logs.keys())
        row_dict = collections.OrderedDict({step_name: step})
        row_dict.update((key + suffix, self._handle_value(logs[key]))
                        for key in self.keys)

        for key, value in row_dict.items():
            self.writer.add_scalar(key, value, step)

    def on_train_begin(self, logs=None):
        """Open the `SummaryWriter` for this training run."""
        self.writer = SummaryWriter(self.logdir, **self._open_args)

    def on_train_batch_end(self, batch: int, logs=None):
        """Write batch-level values when `update_freq` selects this batch."""
        if self.update_freq == "epoch":
            return
        if batch % self.update_freq != 0:
            return
        # Batch-level tags carry a "batch" suffix.
        self._write_row("batch", batch, logs or {}, suffix="batch")

    def on_epoch_end(self, epoch, logs=None):
        """Write the epoch counter and all epoch-level values."""
        self._write_row("epoch", epoch, logs or {})

    def on_train_end(self, logs=None):
        """Close the writer if one was opened."""
        # The writer is still None when on_train_begin was never called.
        if self.writer is not None:
            self.writer.close()
Example #6
0
class Trainer:
    experiment_name = None

    def __init__(
        self,
        net,
        criterion=None,
        metric=cal_accuracy,
        train_dataloader=None,
        val_dataloader=None,
        test_dataloader=None,
        optimizer=None,
        lr_scheduler=None,
        tensorboard_dir="./pinkblack_tb/",
        ckpt="./ckpt/ckpt.pth",
        experiment_id=None,
        clip_gradient_norm=False,
        is_data_dict=False,
    ):
        """
        Store the training components and initialise the run configuration.

        :param net: nn.Module network
        :param criterion: loss function, called as criterion(prediction, *batch_y);
                        defaults to nn.CrossEntropyLoss()
        :param metric: metric function, called as metric(prediction, *batch_y).
                        *note*: bigger is better (early stopping keeps the larger value)

        :param train_dataloader: loader registered under the "train" phase
        :param val_dataloader: loader registered under the "val" phase
        :param test_dataloader: loader registered under the "test" phase

        :param optimizer: torch.optim optimizer; defaults to Adam over the
                        parameters of net that require gradients
        :param lr_scheduler: optional learning-rate scheduler
        :param tensorboard_dir: tensorboard log directory; None disables the writer
        :param ckpt: weight path
        :param experiment_id: shown on tensorboard; defaults to the creation timestamp
        :param clip_gradient_norm: False or a scalar (a number enables gradient clipping)
        :param is_data_dict: whether the dataloaders yield dicts
                        (otherwise they yield (x, y) pair tuples)
        """

        self.net = net
        if criterion is None:
            criterion = nn.CrossEntropyLoss()
        self.criterion = criterion
        self.metric = metric

        # Only the loaders that were actually supplied get a phase entry.
        phase_loaders = (
            ("train", train_dataloader),
            ("val", val_dataloader),
            ("test", test_dataloader),
        )
        self.dataloader = {
            phase: loader
            for phase, loader in phase_loaders if loader is not None
        }

        if any(loader is None
               for loader in (train_dataloader, val_dataloader)):
            logging.warning("Init Trainer :: Two dataloaders are needed!")

        if optimizer is None:
            trainable = (p for p in self.net.parameters() if p.requires_grad)
            optimizer = Adam(trainable)
        self.optimizer = optimizer
        self.lr_scheduler = lr_scheduler

        self.ckpt = ckpt

        # Missing keys read as 0.0 because of the float default factory.
        self.config = defaultdict(float)
        self.config.update({
            "max_train_metric": -1e8,
            "max_val_metric": -1e8,
            "max_test_metric": -1e8,
            "tensorboard_dir": tensorboard_dir,
            "timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
            "clip_gradient_norm": clip_gradient_norm,
            "is_data_dict": is_data_dict,
        })
        self.config["experiment_id"] = (self.config["timestamp"]
                                        if experiment_id is None else
                                        experiment_id)

        self.dataframe = pd.DataFrame()

        self.device = Trainer.get_model_device(self.net)
        if self.device == torch.device("cpu"):
            logging.warning(
                "Init Trainer :: Do you really want to train the network on CPU instead of GPU?"
            )

        board_dir = self.config["tensorboard_dir"]
        self.tensorboard = (SummaryWriter(board_dir)
                            if board_dir is not None else None)

        self.callbacks = defaultdict(list)

    def register_callback(self, func, phase="val"):
        """Add ``func`` to the list of callables stored for ``phase``."""
        handlers = self.callbacks[phase]
        handlers.append(func)

    def save(self, f=None):
        """Write the checkpoint and its sidecar files.

        Files written: ``f`` (network weights), ``f + ".optimizer"``,
        ``f + ".scheduler"`` (only when a scheduler is set), ``f + ".config"``
        (JSON dump of ``self.config``) and ``f + ".csv"`` (``self.dataframe``).

        :param f: checkpoint path; defaults to ``self.ckpt``
        """
        if f is None:
            f = self.ckpt
        # dirname is '' for a bare filename, and os.makedirs('') raises, so
        # the directory is only created when the path actually has one.
        target_dir = os.path.dirname(f)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        # Save the wrapped module's weights so the file does not depend on
        # the DataParallel wrapper.
        if isinstance(self.net, nn.DataParallel):
            state_dict = self.net.module.state_dict()
        else:
            state_dict = self.net.state_dict()
        torch.save(state_dict, f)
        torch.save(self.optimizer.state_dict(), f + ".optimizer")

        if self.lr_scheduler is not None:
            torch.save(self.lr_scheduler.state_dict(), f + ".scheduler")

        with open(f + ".config", "w") as fp:
            json.dump(self.config, fp)

        self.dataframe.to_csv(f + ".csv", float_format="%.6f", index=False)

    def load(self, f=None):
        """Restore the network and whichever sidecar files exist next to it.

        The weights at ``f`` are always loaded. ``f + ".config"``,
        ``f + ".optimizer"``, ``f + ".scheduler"`` and ``f + ".csv"`` are
        loaded only when present. The tensorboard writer is then re-created
        from the ``tensorboard_dir`` entry of the resulting config.

        :param f: checkpoint path; defaults to ``self.ckpt``
        """
        if f is None:
            f = self.ckpt

        if isinstance(self.net, nn.DataParallel):
            self.net.module.load_state_dict(
                torch.load(f, map_location=self.device))
        else:
            self.net.load_state_dict(torch.load(f, map_location=self.device))

        if os.path.exists(f + ".config"):
            with open(f + ".config", "r") as fp:
                dic = json.loads(fp.read())
            self.config = defaultdict(float, dic)
            print("Loaded,", self.config)

        # Map optimizer/scheduler state the same way as the weights so that
        # all parts of a checkpoint are deserialized onto self.device.
        if os.path.exists(f + ".optimizer"):
            self.optimizer.load_state_dict(
                torch.load(f + ".optimizer", map_location=self.device))

        if os.path.exists(f + ".scheduler") and self.lr_scheduler is not None:
            self.lr_scheduler.load_state_dict(
                torch.load(f + ".scheduler", map_location=self.device))

        if os.path.exists(f + ".csv"):
            self.dataframe = pd.read_csv(f + ".csv")

        # Close the writer being replaced so it is not left open.
        previous_writer = getattr(self, "tensorboard", None)
        if previous_writer is not None:
            previous_writer.close()

        if self.config["tensorboard_dir"] is not None:
            self.tensorboard = SummaryWriter(self.config["tensorboard_dir"])
        else:
            self.tensorboard = None

    def train(self,
              epoch=None,
              phases=None,
              step=None,
              validation_interval=1,
              save_every_validation=False):
        """
        :param epoch: train dataloader를 순회할 횟수
        :param phases: ['train', 'val', 'test'] 중 필요하지 않은 phase를 뺄 수 있다.
        >> trainer.train(1, phases=['val'])

        :param step: epoch이 아닌 step을 훈련단위로 할 때의 총 step 수.
        :param validation_interval: validation 간격
        :param save_every_validation: True이면, validation마다 checkpoint를 저장한다.
        :return: None
        """
        if phases is None:
            phases = list(self.dataloader.keys())

        if epoch is None and step is None:
            raise ValueError(
                "PinkBlack.trainer :: epoch or step should be specified.")

        train_unit = "epoch" if step is None else "step"
        self.config[train_unit] = int(self.config[train_unit])

        num_unit = epoch if step is None else step
        validation_interval = 1 if validation_interval <= 0 else validation_interval

        kwarg_list = [train_unit]
        for phase in phases:
            kwarg_list += [f"{phase}_loss", f"{phase}_metric"]
        kwarg_list += ["lr", "time"]

        print_row(kwarg_list=[""] * len(kwarg_list), pad="-")
        print_row(kwarg_list=kwarg_list, pad=" ")
        print_row(kwarg_list=[""] * len(kwarg_list), pad="-")

        start = self.config[train_unit]

        for i in range(start, start + num_unit, validation_interval):
            start_time = time()
            if train_unit == "epoch":
                for phase in phases:
                    self.config[f"{phase}_loss"], self.config[
                        f"{phase}_metric"] = self._train(
                            phase, num_steps=len(self.dataloader[phase]))
                    for func in self.callbacks[phase]:
                        func()
                self.config[train_unit] += 1
            elif train_unit == "step":
                for phase in phases:
                    if phase == "train":
                        # num_unit 이 validation interval로 나눠떨어지지 않는 경우
                        num_steps = min((start + num_unit - i),
                                        validation_interval)
                        self.config[train_unit] += num_steps
                    else:
                        num_steps = len(self.dataloader[phase])
                    self.config[f"{phase}_loss"], self.config[
                        f"{phase}_metric"] = self._train(phase,
                                                         num_steps=num_steps)
                    for func in self.callbacks[phase]:
                        func()
            else:
                raise NotImplementedError

            if self.lr_scheduler is not None:
                if isinstance(self.lr_scheduler, ReduceLROnPlateau):
                    self.lr_scheduler.step(self.config["val_metric"])
                else:
                    self.lr_scheduler.step()

            i_str = str(self.config[train_unit])
            is_best = self.config["max_val_metric"] < self.config["val_metric"]
            if is_best:
                for phase in phases:
                    self.config[f"max_{phase}_metric"] = max(
                        self.config[f"max_{phase}_metric"],
                        self.config[f"{phase}_metric"])
                i_str = (str(self.config[train_unit])) + "-best"

            elapsed_time = time() - start_time
            if self.tensorboard is not None:
                _loss, _metric = {}, {}
                for phase in phases:
                    _loss[phase] = self.config[f"{phase}_loss"]
                    _metric[phase] = self.config[f"{phase}_metric"]

                self.tensorboard.add_scalars(
                    f"{self.config['experiment_id']}/loss", _loss,
                    self.config[train_unit])
                self.tensorboard.add_scalars(
                    f"{self.config['experiment_id']}/metric", _metric,
                    self.config[train_unit])
                self.tensorboard.add_scalar(
                    f"{self.config['experiment_id']}/time", elapsed_time,
                    self.config[train_unit])
                self.tensorboard.add_scalar(
                    f"{self.config['experiment_id']}/lr",
                    self.optimizer.param_groups[0]["lr"],
                    self.config[train_unit],
                )

            print_kwarg = [i_str]
            for phase in phases:
                print_kwarg += [
                    self.config[f"{phase}_loss"],
                    self.config[f"{phase}_metric"]
                ]
            print_kwarg += [self.optimizer.param_groups[0]["lr"], elapsed_time]

            print_row(kwarg_list=print_kwarg, pad=" ")
            print_row(kwarg_list=[""] * len(kwarg_list), pad="-")
            self.dataframe = self.dataframe.append(dict(
                zip(kwarg_list, print_kwarg)),
                                                   ignore_index=True)

            if is_best:
                self.save(self.ckpt)
                if Trainer.experiment_name is not None:
                    self.update_experiment()

            if save_every_validation:
                self.save(self.ckpt + f"-{self.config[train_unit]}")

    def _step(self, phase, iterator, only_inference=False):

        if self.config["is_data_dict"]:
            batch_dict = next(iterator)
            batch_size = batch_dict[list(batch_dict.keys())[0]].size(0)
            for k, v in batch_dict.items():
                batch_dict[k] = v.to(self.device)
        else:
            batch_x, batch_y = next(iterator)
            if isinstance(batch_x, list):
                batch_x = [x.to(self.device) for x in batch_x]
            else:
                batch_x = [batch_x.to(self.device)]

            if isinstance(batch_y, list):
                batch_y = [y.to(self.device) for y in batch_y]
            else:
                batch_y = [batch_y.to(self.device)]

            batch_size = batch_x[0].size(0)

        self.optimizer.zero_grad()
        with torch.set_grad_enabled(phase == "train"):
            if self.config["is_data_dict"]:
                outputs = self.net(batch_dict)
                if not only_inference:
                    loss = self.criterion(outputs, batch_dict)
            else:
                outputs = self.net(*batch_x)
                if not only_inference:
                    loss = self.criterion(outputs, *batch_y)

            if only_inference:
                return outputs

            if phase == "train":
                loss.backward()
                if self.config["clip_gradient_norm"]:
                    clip_grad_norm_(self.net.parameters(),
                                    self.config["clip_gradient_norm"])
                self.optimizer.step()

        with torch.no_grad():
            if self.config["is_data_dict"]:
                metric = self.metric(outputs, batch_dict)
            else:
                metric = self.metric(outputs, *batch_y)

        return {
            "loss": loss.item(),
            "batch_size": batch_size,
            "metric": metric.item()
        }

    def _train(self, phase, num_steps=0):
        """Run ``num_steps`` batches of ``phase`` and return the averaged results.

        The network is put in train mode when ``phase`` is ``"train"`` and in
        eval mode otherwise. Batches are drawn from ``self.dataloader[phase]``;
        when ``num_steps`` exceeds the length of the dataloader, a new pass is
        started each time the previous one has been fully consumed.

        Args:
            phase: Key into ``self.dataloader``.
            num_steps: Number of batches to process.

        Returns:
            Tuple ``(loss, metric)`` holding the ``avg`` of the two running
            meters (presumably batch-size weighted; see ``AverageMeter``).
        """
        running_loss = AverageMeter()
        running_metric = AverageMeter()

        if phase == "train":
            self.net.train()
        else:
            self.net.eval()

        dataloader = self.dataloader[phase]
        steps_per_pass = len(dataloader)
        step_iterator = iter(dataloader)
        tq = tqdm(range(num_steps), leave=False)
        for st in tq:
            # Restart only once a full pass has been consumed. Testing
            # ``(st + 1) % len`` instead would reset the iterator one step
            # early, so the last batch of every pass would never be used.
            if st > 0 and st % steps_per_pass == 0:
                step_iterator = iter(dataloader)
            results = self._step(phase=phase, iterator=step_iterator)
            tq.set_description(
                f"Loss:{results['loss']:.4f}, Metric:{results['metric']:.4f}")
            running_loss.update(results["loss"], results["batch_size"])
            running_metric.update(results["metric"], results["batch_size"])

        return running_loss.avg, running_metric.avg

    def eval(self, dataloader=None):
        """Run inference over a whole dataloader and concatenate the outputs.

        Args:
            dataloader: Loader to iterate over. ``self.dataloader["val"]`` is
                used when this is None.

        Returns:
            The per-batch network outputs joined with ``torch.cat``.
        """
        self.net.eval()
        loader = self.dataloader["val"] if dataloader is None else dataloader

        batches = iter(loader)
        collected = [
            self._step(phase="val", iterator=batches, only_inference=True)
            for _ in tqdm(range(len(loader)), leave=False)
        ]
        return torch.cat(collected)

    def add_external_config(self, args):
        """
        Merge external settings into ``self.config``.

        args : a dict-like object which contains key-value configurations.
            Each key ``k`` is stored under ``config_{k}``; an existing entry
            with that name is overwritten.
        """
        prefixed = {f"config_{key}": value for key, value in args.items()}
        self.config.update(prefixed)

    def update_experiment(self):
        """
        Update experiment statistics by its name (csv file).
        """
        assert Trainer.experiment_name is not None
        df_config = pd.DataFrame(pd.Series(
            self.config)).T.set_index("experiment_id")
        if os.path.exists(Trainer.experiment_name + ".csv"):
            df_ex = pd.read_csv(Trainer.experiment_name + ".csv", index_col=0)
            if self.config["experiment_id"] in df_ex.index:
                df_ex = df_ex.drop(self.config["experiment_id"])
            df_ex = df_ex.append(df_config, sort=False)
        else:
            df_ex = df_config
        df_ex.to_csv(Trainer.experiment_name + ".csv")
        return df_ex

    def swa_apply(self, bn_update=True):
        """Call ``swap_swa_sgd`` on the optimizer, then optionally refresh BN.

        The optimizer must expose ``swap_swa_sgd`` (presumably a torchcontrib
        SWA wrapper - verify against the caller); an ``AssertionError`` is
        raised otherwise.

        Args:
            bn_update: When True, ``swa_bn_update`` is run after the swap.
        """
        optimizer = self.optimizer
        assert hasattr(optimizer, "swap_swa_sgd")
        optimizer.swap_swa_sgd()
        if not bn_update:
            return
        self.swa_bn_update()

    def swa_bn_update(self):
        r"""Updates BatchNorm running_mean, running_var buffers in the model.

        It performs one pass over ``self.dataloader['train']`` to estimate the
        activation statistics for BatchNorm layers in the model.
        original source is from : torchcontrib

        A batch may be a list/tuple whose first element is the model input, a
        dict (when ``self.config['is_data_dict']`` is set) or a bare tensor.
        As in ``_step``, an input that is itself a list of tensors is unpacked
        into positional arguments of the network.

        Nothing is done when ``check_bn`` reports no BatchNorm layer. The
        modules' momenta and the train/eval mode of the network are restored
        afterwards, even if the forward pass raises.
        """
        if not check_bn(self.net):
            return
        was_training = self.net.training
        self.net.train()
        momenta = {}
        self.net.apply(reset_bn)
        self.net.apply(lambda module: get_momenta(module, momenta))
        n = 0
        try:
            # Only the BatchNorm buffers are of interest, so no autograd
            # graph needs to be recorded for these forward passes.
            with torch.no_grad():
                for batch in self.dataloader['train']:
                    if isinstance(batch, (list, tuple)):
                        batch_x = batch[0]
                        if isinstance(batch_x, (list, tuple)):
                            # Multi-input model: same convention as _step.
                            net_args = [x.to(self.device) for x in batch_x]
                        else:
                            net_args = [batch_x.to(self.device)]
                        b = net_args[0].size(0)
                    elif self.config['is_data_dict']:
                        b = batch[list(batch.keys())[0]].size(0)
                        for k, v in batch.items():
                            batch[k] = v.to(self.device)
                        net_args = [batch]
                    else:
                        b = batch.size(0)
                        net_args = [batch.to(self.device)]

                    # Weight of this batch relative to all samples seen so
                    # far, this batch included.
                    momentum = b / float(n + b)
                    for module in momenta.keys():
                        module.momentum = momentum

                    self.net(*net_args)
                    n += b
        finally:
            self.net.apply(lambda module: set_momenta(module, momenta))
            self.net.train(was_training)

    @staticmethod
    def get_model_device(net):
        device = torch.device("cpu")
        for param in net.parameters():
            device = param.device
            break
        return device

    @staticmethod
    def set_experiment_name(name):
        """Store ``name`` on the class as ``Trainer.experiment_name``.

        The value is a class attribute and therefore shared by all trainers;
        ``update_experiment`` uses it as the base name of the csv file.
        """
        Trainer.experiment_name = name
Example #7
0
class TensorBoard(Callback):
    """
    Callback that streams training results to a TensorBoard events folder.

    Two ``SummaryWriter`` instances are used: one writes to
    ``<logdir>/train``, the other to ``<logdir>/val``. Entries of ``logs``
    whose key starts with ``val_`` are sent to the validation writer with
    that prefix removed; all other entries are sent to the training writer
    under their own name.

    ```python
    tensorboard_logger = TensorBoard('runs')
    model.fit(X_train, Y_train, callbacks=[tensorboard_logger])
    ```
    """

    # Prefix that marks an entry of ``logs`` as a validation value.
    _VAL_PREFIX = "val_"

    def __init__(
        self,
        logdir: Optional[str] = None,
        *,
        update_freq: Union[str, int] = "epoch",
        purge_step: Optional[int] = None,
        comment: str = "",
    ) -> None:
        """
        Arguments:
            logdir: Save directory location. Default is
                runs/**CURRENT_DATETIME_HOSTNAME**/{train, val}, which changes after each run.
                Use hierarchical folder structure to compare
                between runs easily. e.g. pass in 'runs/exp1', 'runs/exp2', etc.
                for each new experiment to compare across them.
            update_freq: `'batch'` or `'epoch'` or integer. When using `'batch'`,
                writes the losses and metrics to TensorBoard after each batch. The same
                applies for `'epoch'`. If using an integer, let's say `1000`, the
                callback will write the metrics and losses to TensorBoard every 1000
                batches. Note that writing too frequently to TensorBoard can slow down
                your training.
            purge_step (int):
                When logging crashes at step :math:`T+X` and restarts at step :math:`T`,
                any events whose global_step larger or equal to :math:`T` will be
                purged and hidden from TensorBoard.
                Note that crashed and resumed experiments should have the same ``logdir``.
            comment (string): Comment logdir suffix appended to the default
                ``logdir``. If ``logdir`` is assigned, this argument has no effect.

        Raises:
            ValueError: If ``update_freq`` is a string other than ``'batch'``
                or ``'epoch'`` that cannot be converted to an integer, or if
                it is ``0``.
        """
        if not logdir:
            import socket
            from datetime import datetime

            current_time = datetime.now().strftime("%b%d_%H-%M-%S")
            self.logdir = os.path.join(
                "runs", current_time + "_" + socket.gethostname() + comment
            )
        else:
            self.logdir = logdir
        self.train_writer = None
        self.val_writer = None
        self.keys = None
        # Defined up front so that a batch callback arriving before
        # ``on_epoch_begin`` does not fail on a missing attribute.
        self.current_epoch = 0
        self.global_step = 0
        self.write_per_batch = True
        try:
            self.update_freq = int(update_freq)
        except ValueError:
            self.update_freq = 1
            if update_freq == "batch":
                self.write_per_batch = True
            elif update_freq == "epoch":
                self.write_per_batch = False
            else:
                raise
        if self.update_freq == 0:
            # A zero frequency would only surface later as a
            # ZeroDivisionError in the modulo checks of the callbacks.
            raise ValueError("update_freq must not be 0")
        self.purge_step = purge_step

        super(TensorBoard, self).__init__()

    @classmethod
    def _val_tag(cls, key):
        """Return ``key`` without its ``val_`` prefix, or None if it has none."""
        if key.startswith(cls._VAL_PREFIX):
            return key[len(cls._VAL_PREFIX):]
        return None

    def on_train_begin(self, logs=None):
        """Open both writers and reset the global step counter."""
        self.train_writer = SummaryWriter(
            os.path.join(self.logdir, "train"), purge_step=self.purge_step
        )
        self.val_writer = SummaryWriter(
            os.path.join(self.logdir, "val"), purge_step=self.purge_step
        )
        self.steps = self.params["steps"]
        self.global_step = 0

    def on_train_batch_end(self, batch: int, logs=None):
        """Write the batch logs every ``update_freq`` steps (batch mode only)."""
        if not self.write_per_batch:
            return
        logs = logs or {}
        self.global_step = batch + self.current_epoch * (self.steps)
        if self.global_step % self.update_freq == 0:
            if self.keys is None:
                self.keys = logs.keys()
            for key in self.keys:
                self.train_writer.add_scalar(key, logs[key], self.global_step)

    def on_epoch_begin(self, epoch: int, logs=None):
        """Remember the running epoch for the global step computation."""
        self.current_epoch = epoch

    def on_epoch_end(self, epoch, logs=None):
        """Write epoch-level values.

        In batch mode only the validation entries are written, at the current
        global step. In epoch mode every tracked key is written every
        ``update_freq`` epochs, at step ``epoch``.
        """
        logs = logs or {}

        if self.keys is None:
            self.keys = logs.keys()

        # logs on on_{train, test}_batch_end do not have val metrics
        if self.write_per_batch:
            for key in logs:
                tag = self._val_tag(key)
                if tag is not None:
                    self.val_writer.add_scalar(tag, logs[key], self.global_step)
            return

        if epoch % self.update_freq == 0:
            for key in self.keys:
                tag = self._val_tag(key)
                if tag is not None:
                    self.val_writer.add_scalar(tag, logs[key], epoch)
                else:
                    self.train_writer.add_scalar(key, logs[key], epoch)

    def on_train_end(self, logs=None):
        """Close the writers that were opened in ``on_train_begin``."""
        for writer in (self.train_writer, self.val_writer):
            if writer is not None:
                writer.close()