Example #1
import os
import pickle
from datetime import datetime

import torch
from torch.utils.tensorboard import SummaryWriter


class Analyser:
    def __init__(self, **kwargs):
        self.options = kwargs
        # note: this timestamp contains spaces and ':' characters, which are not valid in Windows paths
        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        self.options["logdir"] = os.path.join(
            self.options.get("logdir", "results"),
            timestamp + self.options.get("comment", ""))
        os.makedirs(self.options["logdir"], exist_ok=True)
        self.writer = SummaryWriter(
            log_dir=self.options["logdir"],
            flush_secs=self.options.get("flush_secs", 120))
        self.scalars = {}

    def refresh(self):
        self.scalars = {}

    def add_to_scalars(self, **kwargs):
        for k, v in kwargs.items():
            # accumulate values logged under the same key (e.g. across batches)
            self.scalars[k] = self.scalars.get(k, 0) + v

    def write_scalars(self, epoch):
        for k in self.scalars.keys():
            self.writer.add_scalar(k, self.scalars[k], epoch)

    def save_weights(self, model, epoch):
        torch.save(
            model.state_dict(),
            os.path.join(self.options["logdir"],
                         "model_weights_{}.pk".format(epoch)))

    def update_str(self, epoch, epoch_time=None):
        losses_str = (": {:4.5f}\t\t".join(self.scalars.keys()) +
                      ": {:4.5f}").format(*[x for x in self.scalars.values()])
        epoch_str = "Epoch:{:3d}\t[time={:3.2f}s]\t\t".format(
            epoch, epoch_time
        ) if epoch_time is not None else "Epoch:{:3d}\t\t".format(epoch)
        return epoch_str + losses_str

    def write_scalars_to_file(self, epoch, filename=None):
        filename = "results" if filename is None else filename
        txt_path = os.path.join(self.writer.get_logdir(), filename + ".txt")
        pk_path = os.path.join(self.writer.get_logdir(), filename + ".pk")
        with open(txt_path, 'a') as f:
            print(self.update_str(epoch), file=f)

        try:
            with open(pk_path, 'rb') as f:
                current_dict: dict = pickle.load(f)
            current_dict.update({epoch: self.scalars})
        except Exception:
            # no previous (or readable) pickle yet: start a fresh dict
            current_dict = {epoch: self.scalars}
        finally:
            with open(pk_path, 'wb') as f:
                pickle.dump(current_dict, f)
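A minimal usage sketch of the Analyser above; the option values, loss names, and loop sizes are illustrative assumptions rather than part of the original snippet.

# Hypothetical driver for the Analyser class defined above.
analyser = Analyser(logdir="results", comment="_baseline", flush_secs=60)
for epoch in range(3):
    analyser.refresh()                     # reset the accumulated scalars
    for batch in range(10):
        # placeholder losses; in a real run these come from the training step
        analyser.add_to_scalars(train_loss=0.5, val_loss=0.4)
    analyser.write_scalars(epoch)          # write summed scalars to TensorBoard
    analyser.write_scalars_to_file(epoch)  # append to results.txt / results.pk
    print(analyser.update_str(epoch))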
Example #2
def main(opt):
    writer = SummaryWriter()
    log_dir = writer.get_logdir()
    os.makedirs(os.path.join(log_dir, "images"), exist_ok=True)
    os.makedirs(os.path.join(log_dir, "test"), exist_ok=True)

    device = torch.device(
        "cuda") if torch.cuda.is_available() else torch.device("cpu")

    # Initialize generator and discriminator
    generator = UNet(opt.sample_num, opt.channels, opt.batch_size, opt.alpha)
    discriminator = Discriminator(opt.batch_size, opt.alpha)

    generator.to(device=device)
    discriminator.to(device=device)

    # Optimizers
    optimizer_G = torch.optim.Adam(generator.parameters(),
                                   lr=opt.lr_g,
                                   betas=(opt.b1, opt.b2))
    optimizer_D = torch.optim.Adam(discriminator.parameters(),
                                   lr=opt.lr_d,
                                   betas=(opt.b1, opt.b2))

    if opt.mode == 'train':
        generator = train(writer, log_dir, device, generator, discriminator,
                          optimizer_G, optimizer_D, opt)
        test(opt, log_dir, generator=generator)
    if opt.mode == 'test':
        test(opt, log_dir)
        test_moving(opt, log_dir)
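A sketch of how the opt namespace consumed by main() above might be built; the flag names mirror the attributes accessed in the function (mode, sample_num, channels, batch_size, alpha, lr_g, lr_d, b1, b2), while the defaults are placeholder assumptions.

import argparse

# Hypothetical CLI wrapper around main(); defaults are illustrative only.
parser = argparse.ArgumentParser()
parser.add_argument("--mode", choices=["train", "test"], default="train")
parser.add_argument("--sample_num", type=int, default=64)
parser.add_argument("--channels", type=int, default=1)
parser.add_argument("--batch_size", type=int, default=8)
parser.add_argument("--alpha", type=float, default=0.2)
parser.add_argument("--lr_g", type=float, default=2e-4)
parser.add_argument("--lr_d", type=float, default=2e-4)
parser.add_argument("--b1", type=float, default=0.5)
parser.add_argument("--b2", type=float, default=0.999)
opt = parser.parse_args()
main(opt)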
Example #3
def vedo_data(writer: SummaryWriter,
              image_densities,
              image_samples,
              image_warps,
              epoch,
              image_idx,
              max_number_saved_points=1000):
    logdir = os.path.join(writer.get_logdir(), "vedo_data")
    if not os.path.exists(logdir):
        os.makedirs(logdir)
    if len(image_densities) < max_number_saved_points:
        max_number_saved_points = len(image_densities)
    if image_densities.sum() == 0:
        indices_densities = np.arange(len(image_densities))
    else:
        densities_distribution = image_densities / image_densities.sum()
        indices_densities = np.random.choice(np.arange(len(image_densities)),
                                             max_number_saved_points,
                                             p=densities_distribution)

    image_densities = image_densities[indices_densities]
    samples_densities = image_samples[indices_densities]
    if image_warps is not None:
        warp_magnitude = np.linalg.norm(image_warps, axis=-1)
        if warp_magnitude.sum() == 0:
            indices_warps = np.arange(max_number_saved_points)
        else:
            warp_magnitude_exp = np.exp(10 * warp_magnitude)
            warp_distribution = warp_magnitude_exp / (warp_magnitude_exp.sum(
                axis=-1))
            indices_warps = np.random.choice(np.arange(len(image_warps)),
                                             max_number_saved_points,
                                             p=warp_distribution)
        image_warps = image_warps[indices_warps]

        samples_warps = image_samples[indices_warps]
    else:
        image_warps = []
        samples_warps = []
    np.savez(os.path.join(
        logdir, "densities_samples_warps_epoch_{}_image_{}".format(
            epoch, image_idx)) + '.npz',
             densities=image_densities,
             samples_density=samples_densities,
             samples_warp=samples_warps,
             warps=image_warps)
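A small sketch of calling vedo_data with synthetic NumPy arrays, assuming a SummaryWriter already exists; the array shapes are assumptions inferred from how the arrays are indexed above (per-point densities, per-point sample coordinates, per-point warp vectors).

import numpy as np
from torch.utils.tensorboard import SummaryWriter

# Hypothetical call with random data, just to illustrate the expected shapes.
writer = SummaryWriter()
n_points = 5000
densities = np.random.rand(n_points)           # one density per sample point
samples = np.random.rand(n_points, 3)          # sample coordinates
warps = 0.01 * np.random.randn(n_points, 3)    # per-point warp vectors
vedo_data(writer, densities, samples, warps, epoch=0, image_idx=0)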
Example #4
def main(args):
    utils.init_distributed_mode(args)
    print(args)

    # Opening YAML cfg config file
    with open(args.cfg_file, 'r') as stream:
        try:
            cfg_file = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(exc)
            sys.exit(1)

    # Retrieving cfg
    train_cfg = cfg_file['training']
    model_cfg = cfg_file['model']
    data_cfg = cfg_file['dataset']

    # Setting device
    device = torch.device(model_cfg['device'])

    # It is not possible to set a checkpoint and a pre-trained model at the same time
    if train_cfg['checkpoint'] and train_cfg['pretrained_model']:
        print("You can't set checkpoint and pretrained-model at the same time")
        exit(1)

    # Creating tensorboard writer
    if train_cfg['checkpoint']:
        checkpoint = torch.load(train_cfg['checkpoint'])
        writer = SummaryWriter(log_dir=checkpoint['tensorboard_working_dir'])
    else:
        writer = SummaryWriter(comment="_" + train_cfg['tensorboard_filename'])

    # Saving cfg file in the same folder
    copyfile(
        args.cfg_file,
        os.path.join(writer.get_logdir(), os.path.basename(args.cfg_file)))

    #######################
    # Creating model
    #######################
    print("Creating model")
    load_custom_model = False
    if train_cfg['checkpoint'] or train_cfg['pretrained_model']:
        load_custom_model = True
    model, backbone = get_model_detection(num_classes=1,
                                          cfg=model_cfg,
                                          load_custom_model=load_custom_model)

    # Putting model on the device and setting train mode
    model.to(device)
    model.train()

    # Freeze the backbone parameters, if needed
    if backbone is not None and model_cfg['freeze_backbone']:
        for param in backbone.parameters():
            param.requires_grad = False
        print('Backbone is frozen!')

    #####################################
    # Creating datasets and dataloaders
    #####################################
    data_root = data_cfg['root']

    ################################
    # Creating training datasets and dataloaders
    print("Loading training data")
    train_datasets_names = data_cfg['train']

    if train_cfg['mixed_batch']:
        assert train_cfg['tgt_images_in_batch'] > 0, \
            "Using mixed training. You need to specify the tgt_images_in_batch parameter!"
        assert len(train_datasets_names) == 2, "Using mixed training, you need to specify two datasets, " \
                                               "the first one as the source while the second as the target"
        source_dataset = CustomYoloAnnotatedDataset(
            data_root, {
                list(train_datasets_names.keys())[0]:
                list(train_datasets_names.values())[0]
            },
            transforms=get_transform(train=True),
            phase='train')
        target_dataset = CustomYoloAnnotatedDataset(
            data_root, {
                list(train_datasets_names.keys())[1]:
                list(train_datasets_names.values())[1]
            },
            transforms=get_transform(train=True),
            phase='train')
        train_dataset = DatasetsEnsemble(source_dataset=source_dataset,
                                         target_dataset=target_dataset)
        train_dataloader = DataLoader(
            train_dataset,
            collate_fn=train_dataset.source_dataset.standard_collate_fn,
            num_workers=train_cfg['num_workers'],
            batch_sampler=EnsembleBatchSampler(
                train_dataset,
                batch_size=train_cfg['batch_size'],
                shuffle=True,
                tgt_imgs_in_batch=train_cfg['tgt_images_in_batch']))
        print(
            'Using mixed training datasets. Source: {}, Target: {}. In every batch, {}/{} are from {}'
            .format(
                list(train_datasets_names.keys())[0],
                list(train_datasets_names.keys())[1],
                train_cfg['tgt_images_in_batch'], train_cfg['batch_size'],
                list(train_datasets_names.keys())[1]))
    else:
        train_dataset = CustomYoloAnnotatedDataset(
            data_root,
            train_datasets_names,
            transforms=get_transform(train=True),
            phase='train')
        train_dataloader = DataLoader(
            train_dataset,
            batch_size=train_cfg['batch_size'],
            shuffle=False,
            num_workers=train_cfg['num_workers'],
            collate_fn=train_dataset.standard_collate_fn)

    ###############################
    # Creating validation datasets
    print("Loading validation data")
    val_datasets_names = data_cfg['val']

    # Creating dataset(s) and dataloader(s)
    val_dataloaders = dict()
    best_validation_ap = defaultdict(float)
    for dataset_name, dataset_cfg in val_datasets_names.items():
        val_dataset = CustomYoloAnnotatedDataset(
            data_root, {dataset_name: dataset_cfg},
            transforms=get_transform(),
            phase="val",
            percentage=train_cfg["percentage_val"])
        val_dataloader = DataLoader(val_dataset,
                                    batch_size=train_cfg['batch_size'],
                                    shuffle=False,
                                    num_workers=train_cfg['num_workers'],
                                    collate_fn=val_dataset.standard_collate_fn)
        # Adding created dataloader
        val_dataloaders[dataset_name] = val_dataloader
        # Initializing best validation ap value
        best_validation_ap[dataset_name] = 0.0

    #######################################
    # Defining optimizer and LR scheduler
    #######################################
    ##########################
    # Constructing an optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(
        params,
        lr=train_cfg['lr'],
        momentum=train_cfg['momentum'],
        weight_decay=train_cfg['weight_decay'],
    )

    # and a learning rate scheduler
    if model_cfg['coco_model_pretrained']:
        lr_step_size = min(25000, len(train_dataset))
    else:
        lr_step_size = min(40000, 2 * len(train_dataset))
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=lr_step_size,
                                                   gamma=train_cfg['lr_gamma'])

    # Defining a warm-up lr scheduler
    warmup_iters = min(1000, len(train_dataloader) - 1)
    warmup_factor = 1. / 1000
    warmup_lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters,
                                                    warmup_factor)

    #############################
    # Resuming a model
    #############################
    start_epoch = 0
    train_step = -1
    # Optionally resuming from a pre-trained model
    if train_cfg['pretrained_model']:
        print("Resuming pre-trained model")
        if train_cfg['pretrained_model'].startswith('http://') or train_cfg[
                'pretrained_model'].startswith('https://'):
            pre_trained_model = torch.hub.load_state_dict_from_url(
                train_cfg['pretrained_model'],
                map_location='cpu',
                model_dir=model_cfg["cache_folder"])
        else:
            pre_trained_model = torch.load(train_cfg['pretrained_model'],
                                           map_location='cpu')
        model.load_state_dict(pre_trained_model['model'])

    # Optionally resuming from a saved checkpoint
    if train_cfg['checkpoint']:
        print("Resuming from a checkpoint")
        checkpoint = torch.load(train_cfg['checkpoint'])
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        warmup_lr_scheduler.load_state_dict(checkpoint['warmup_lr_scheduler'])
        start_epoch = checkpoint['epoch']
        train_step = checkpoint['iteration']
        for elem_name, elem in checkpoint.items():
            if elem_name.startswith("best_") and elem_name.endswith("_ap"):
                # checkpoint keys have the form "best_<dataset_name>_ap"
                d_name = elem_name[len("best_"):-len("_ap")]
                if d_name in best_validation_ap:
                    best_validation_ap[d_name] = elem
                else:
                    warnings.warn(
                        "The dataset {} was not used in the previous training".
                        format(d_name))
                    best_validation_ap[d_name] = 0.0

    ################
    ################
    # Training
    print("Start training")
    for epoch in range(start_epoch, train_cfg['epochs']):
        model.train()
        metric_logger = utils.MetricLogger(delimiter="  ")
        metric_logger.add_meter(
            'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
        header = 'Epoch: [{}]'.format(epoch)

        for images, targets in metric_logger.log_every(
                train_dataloader,
                print_freq=train_cfg['print_freq'],
                header=header):
            train_step += 1
            images = list(image.to(device) for image in images)
            targets = [{k: v.to(device)
                        for k, v in t.items()} for t in targets]

            loss_dict = model(images, targets)

            losses = sum(loss for loss in loss_dict.values())

            # reduce losses over all GPUs for logging purposes
            loss_dict_reduced = utils.reduce_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())

            loss_value = losses_reduced.item()

            if not math.isfinite(loss_value):
                for target in targets:
                    image_id = target['image_id'].item()
                    print(train_dataset.images[image_id])
                print("Loss is {}, stopping training".format(loss_value))
                print(loss_dict_reduced)
                sys.exit(1)

            optimizer.zero_grad()
            losses.backward()
            # clip norm
            torch.nn.utils.clip_grad_norm_(model.parameters(), 50)
            optimizer.step()

            if epoch == 0 and train_step < warmup_iters:
                warmup_lr_scheduler.step()

            metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
            metric_logger.update(lr=optimizer.param_groups[0]["lr"])

            if train_step % train_cfg['log_loss'] == 0:
                writer.add_scalar('Training/Learning Rate',
                                  optimizer.param_groups[0]["lr"], train_step)
                writer.add_scalar('Training/Reduced Sum Losses',
                                  losses_reduced, train_step)
                writer.add_scalars('Training/All Losses', loss_dict,
                                   train_step)

            if (train_step % train_cfg['save_freq'] == 0 and train_step != 0) \
                    or ((train_cfg['pretrained_model'] or model_cfg['coco_model_pretrained']) and
                        train_step < 6 * train_cfg['save_freq'] and train_step % 200 == 0 and train_step != 0):
                # Validation
                for val_name, val_dataloader in val_dataloaders.items():
                    print("Validation on {}".format(val_name))
                    coco_evaluator = evaluate(
                        model,
                        val_dataloader,
                        device=device,
                        max_dets=model_cfg["max_dets_per_image"])
                    ap = None
                    for iou_type, coco_eval in coco_evaluator.coco_eval.items(
                    ):
                        ap = coco_eval.stats[1]
                    writer.add_scalar(
                        'COCO mAP Validation/{}'.format(val_name), ap,
                        train_step)

                    # Saving the best model, if improved
                    if ap > best_validation_ap[val_name]:
                        best_validation_ap[val_name] = ap
                        save_checkpoint(
                            {
                                'model':
                                model.state_dict(),
                                'optimizer':
                                optimizer.state_dict(),
                                'lr_scheduler':
                                lr_scheduler.state_dict(),
                                'warmup_lr_scheduler':
                                warmup_lr_scheduler.state_dict()
                                if warmup_lr_scheduler is not None else None,
                                'epoch':
                                epoch,
                                'iteration':
                                train_step,
                                'best_{}_ap'.format(val_name):
                                best_validation_ap[val_name],
                            },
                            writer.get_logdir(),
                            best_model=val_name)

                # Saving last model
                checkpoint_dict = {
                    'model':
                    model.state_dict(),
                    'optimizer':
                    optimizer.state_dict(),
                    'lr_scheduler':
                    lr_scheduler.state_dict(),
                    'warmup_lr_scheduler':
                    warmup_lr_scheduler.state_dict()
                    if warmup_lr_scheduler is not None else None,
                    'epoch':
                    epoch,
                    'iteration':
                    train_step,
                    'tensorboard_working_dir':
                    writer.get_logdir(),
                }
                for d_name, _ in val_dataloaders.items():
                    checkpoint_dict["best_{}_ap".format(
                        d_name)] = best_validation_ap[d_name]
                save_checkpoint(checkpoint_dict, writer.get_logdir())

                # Setting again to train mode
                model.train()

            # Updating lr scheduler
            lr_scheduler.step()
Example #5
def q1_1_cnn():
    # prepare tensorboard
    writer = SummaryWriter(comment="-q1.1-CNN")
    output_path = writer.get_logdir()
    net = CNNModel(3, len(CLASS_LABELS))

    # define loss and optim
    criterion = nn.CrossEntropyLoss()

    # get one batch of training data
    train_loader = torch.utils.data.DataLoader(
        dataset_train,
        batch_size=64,
        # keep the first batch of data consistent between executions
        shuffle=False,
        num_workers=2)
    train_data, train_label = next(iter(train_loader))

    # use gpu if available
    if torch.cuda.is_available():
        print("using gpu")
        net = net.cuda()
        criterion = criterion.cuda()
        train_data = train_data.cuda()
        train_label = train_label.cuda()
    optimizer = optim.Adam(net.parameters())

    # training code
    epoch_num = 40
    train_loss = []
    val_loss = []
    for epoch_idx in range(epoch_num):
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(train_data)
        loss = criterion(outputs, train_label)
        loss.backward()
        optimizer.step()

        train_loss.append(loss.cpu().detach().numpy().item())
        writer.add_scalar("Loss/train", loss.cpu().detach(), epoch_idx)

        # validation (eval mode so dropout/batch-norm layers behave deterministically)
        net.eval()
        with torch.no_grad():
            tmp = 0
            for i, (val_data, val_label) in enumerate(valid_loader):
                val_data = val_data.to(train_data.device)
                val_label = val_label.to(train_label.device)
                outputs = net(val_data)
                loss = criterion(outputs, val_label)
                tmp += loss.cpu().detach()
            writer.add_scalar("Loss/val", tmp / len(valid_loader), epoch_idx)
            val_loss.append(tmp.numpy().item() / len(valid_loader))
        net.train()
        print(
            f"Train Epoch({epoch_idx + 1} / {epoch_num}), train_loss: {train_loss[-1]}, val_loss: {val_loss[-1]}"
        )

    print("save loss")
    torch.save(val_loss, os.path.join(output_path, "cnn_val_loss"))
    torch.save(train_loss, os.path.join(output_path, "cnn_train_loss"))
    writer.close()
Example #6
import argparse
import logging
import os
import sys

import torch
from torch.utils.tensorboard import SummaryWriter

from sgan.data.loader import data_loader
from sgan.losses import gan_g_loss, gan_d_loss, l2_loss
from sgan.losses import displacement_error, final_displacement_error

from sgan.models import TrajectoryGenerator, TrajectoryDiscriminator
from sgan.utils import int_tuple, bool_flag, get_total_norm
from sgan.utils import relative_to_abs, get_dset_path


torch.backends.cudnn.benchmark = True
writer = SummaryWriter()

time_str = "_".join(writer.get_logdir().split("/")[1].split("_")[:2])
# output_dir="/media/felicia/Data/sgan_results/{}".format(time_str)

output_dir = "/scratch/sz2257/sgan/sgan_results/{}".format(time_str)
if not os.path.exists(output_dir):
    os.mkdir(output_dir)

# data_dir='/media/felicia/Data/basketball-partial'
data_dir = '/scratch/sz2257/sgan/basketball-partial'

parser = argparse.ArgumentParser()
FORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s'
logging.basicConfig(level=logging.INFO, format=FORMAT, stream=sys.stdout)
logger = logging.getLogger(__name__)

# Dataset options
Example #7
class Logger(object):
    def __init__(self, log_dir, comment=''):
        self.writer = SummaryWriter(log_dir=log_dir, comment=comment)
        self.imgs_dict = {}

    def scalar_summary(self, tag, value, step):
        self.writer.add_scalar(tag, value, global_step=step)
        self.writer.flush()

    def combined_scalars_summary(self, main_tag, tag_scalar_dict, step):
        self.writer.add_scalars(main_tag, tag_scalar_dict, step)
        self.writer.flush()

    def log(self, tag, text_string, step=0):
        self.writer.add_text(tag, text_string, step)
        self.writer.flush()

    def log_model(self, model, inputs):
        self.writer.add_graph(model, inputs)
        self.writer.flush()

    def get_dir(self):
        return self.writer.get_logdir()

    def log_model_state(self, model, name='tmp'):
        path = os.path.join(self.writer.get_logdir(),
                            type(model).__name__ + '_%s.pt' % name)
        torch.save(model.state_dict(), path)

    def log_video(self,
                  tag,
                  global_step=None,
                  img_tns=None,
                  finished_video=False,
                  video_tns=None,
                  debug=False):
        '''
            Logs a video to TensorBoard.
            If image tensors are passed via img_tns, they are accumulated under `tag`;
            when finished_video=True, the accumulated frames are written as a single video.
            If video_tns is given, it is logged directly as the video and the other arguments are ignored.
        '''
        if debug:
            import pdb
            pdb.set_trace()
        if img_tns is None and video_tns is None:
            if not finished_video or tag not in self.imgs_dict.keys():
                return None
            lst_img_tns = self.imgs_dict[tag]
            self.writer.add_video(tag,
                                  torch.tensor(lst_img_tns),
                                  global_step=global_step,
                                  fps=4)
            self.writer.flush()
            self.imgs_dict[tag] = []
            return None
        elif video_tns is not None:
            self.writer.add_video(tag,
                                  video_tns,
                                  global_step=global_step,
                                  fps=4)
            self.writer.flush()
            return None

        if tag in self.imgs_dict.keys():
            lst_img_tns = self.imgs_dict[tag]
        else:
            lst_img_tns = []
            self.imgs_dict[tag] = lst_img_tns

        lst_img_tns.append(img_tns)

        if finished_video:
            self.writer.add_video(tag,
                                  torch.tensor(lst_img_tns),
                                  global_step=global_step,
                                  fps=4)
            self.writer.flush()
            self.imgs_dict[tag].clear()

    def close(self):
        self.writer.close()
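A short usage sketch of the Logger above, assuming a small placeholder model; it writes a few scalars, a text note, and a model checkpoint into the same TensorBoard run directory.

import torch.nn as nn

# Hypothetical usage of the Logger class above.
logger = Logger(log_dir="runs/demo")
model = nn.Linear(4, 2)
for step in range(3):
    logger.scalar_summary("loss/train", 1.0 / (step + 1), step)
logger.combined_scalars_summary("loss", {"train": 0.3, "val": 0.4}, step=2)
logger.log("notes", "baseline run", step=0)
logger.log_model_state(model, name="step3")  # saves Linear_step3.pt in the run dir
logger.close()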
Example #8
# from sgan.models import TrajectoryGenerator as GeneratorBaseline, TrajectoryDiscriminator as DiscriminatorBaseline
# from sgan.models_teampos import TrajectoryGenerator as TeamPosGenerator, TrajectoryDiscriminator as TeamPosDiscriminator
from sgan.models_old import TrajectoryGenerator, TrajectoryDiscriminator
# MODELS = {
#     "baseline": (GeneratorBaseline, DiscriminatorBaseline),
#     "team_pos": (TeamPosGenerator, TeamPosDiscriminator)
# }

from sgan.utils import int_tuple, bool_flag, get_total_norm
from sgan.utils import relative_to_abs, get_dset_path

torch.backends.cudnn.benchmark = True
writer = SummaryWriter()

time_str = "_".join(writer.get_logdir().split("/")[1].split("_")[:2])

# output_dir="/media/felicia/Data/sgan_results/{}".format(time_str)
output_dir = "/scratch/sz2257/sgan_results/{}".format(time_str)

# data_dir='/media/felicia/Data/basketball-partial'
data_dir = '/scratch/sz2257/basketball-partial'

parser = argparse.ArgumentParser()
FORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s'
logging.basicConfig(level=logging.INFO, format=FORMAT, stream=sys.stdout)
logger = logging.getLogger(__name__)

# Dataset options
parser.add_argument('--dataset_name',
                    default='01.02.2016.PHX.at.SAC.new',
Example #9
class Logger:
    def __init__(self, exp_ID, log_dir):
        """Log the training process of Deepymod.
        Args:
            exp_ID (str): name or ID of this experiment
            log_dir (str): directory to save the log files to disk.

        """
        self.writer = SummaryWriter(comment=exp_ID,
                                    log_dir=log_dir,
                                    max_queue=5,
                                    flush_secs=10)
        self.log_dir = self.writer.get_logdir()

    def __call__(
        self,
        iteration,
        loss,
        MSE,
        Reg,
        constraint_coeffs,
        unscaled_constraint_coeffs,
        estimator_coeffs,
        **kwargs,
    ):
        l1_norm = torch.sum(torch.abs(torch.cat(constraint_coeffs, dim=1)),
                            dim=0)

        self.update_tensorboard(
            iteration,
            loss,
            MSE,
            Reg,
            l1_norm,
            constraint_coeffs,
            unscaled_constraint_coeffs,
            estimator_coeffs,
            **kwargs,
        )
        self.update_terminal(iteration, MSE, Reg, l1_norm)

    def update_tensorboard(
        self,
        iteration,
        loss,
        loss_mse,
        loss_reg,
        loss_l1,
        constraint_coeff_vectors,
        unscaled_constraint_coeff_vectors,
        estimator_coeff_vectors,
        **kwargs,
    ):
        """Write the current state of training to Tensorboard
        Args:
            iteration (int): iteration number
            loss (float): loss value
            loss_mse (float): loss of the Mean Squared Error term
            loss_reg (float): loss of the regularization term
            loss_l1 (float): loss of the L1 penalty term
            constraint_coeff_vectors (np.array): vector with constraint coefficients
            unscaled_constraint_coeff_vectors (np.array): unscaled vector with constraint coefficients
            estimator_coeff_vectors (np.array): coefficients as computed by the estimator.
        """
        # Costs and coeff vectors
        self.writer.add_scalar("loss/loss", loss, iteration)
        self.writer.add_scalars(
            "loss/mse",
            {f"output_{idx}": val
             for idx, val in enumerate(loss_mse)},
            iteration,
        )
        self.writer.add_scalars(
            "loss/reg",
            {f"output_{idx}": val
             for idx, val in enumerate(loss_reg)},
            iteration,
        )
        self.writer.add_scalars(
            "loss/l1",
            {f"output_{idx}": val
             for idx, val in enumerate(loss_l1)},
            iteration,
        )

        for output_idx, (coeffs, unscaled_coeffs,
                         estimator_coeffs) in enumerate(
                             zip(
                                 constraint_coeff_vectors,
                                 unscaled_constraint_coeff_vectors,
                                 estimator_coeff_vectors,
                             )):
            self.writer.add_scalars(
                f"coeffs/output_{output_idx}",
                {
                    f"coeff_{idx}": val
                    for idx, val in enumerate(coeffs.squeeze())
                },
                iteration,
            )
            self.writer.add_scalars(
                f"unscaled_coeffs/output_{output_idx}",
                {
                    f"coeff_{idx}": val
                    for idx, val in enumerate(unscaled_coeffs.squeeze())
                },
                iteration,
            )
            self.writer.add_scalars(
                f"estimator_coeffs/output_{output_idx}",
                {
                    f"coeff_{idx}": val
                    for idx, val in enumerate(estimator_coeffs.squeeze())
                },
                iteration,
            )

        # Writing remaining kwargs
        for key, value in kwargs.items():
            if value.numel() == 1:
                self.writer.add_scalar(f"remaining/{key}", value, iteration)
            else:
                self.writer.add_scalars(
                    f"remaining/{key}",
                    {
                        f"val_{idx}": val.squeeze()
                        for idx, val in enumerate(value.squeeze())
                    },
                    iteration,
                )

    def update_terminal(self, iteration, MSE, Reg, L1):
        """Prints and updates progress of training cycle in command line."""
        sys.stdout.write(
            f"\r{iteration:>6}  MSE: {torch.sum(MSE).item():>8.2e}  Reg: {torch.sum(Reg).item():>8.2e}  L1: {torch.sum(L1).item():>8.2e} "
        )
        sys.stdout.flush()

    def close(self, model):
        """Close the Tensorboard writer"""
        print("Algorithm converged. Writing model to disk.")
        self.writer.flush()  # flush remaining stuff to disk
        self.writer.close()  # close writer

        # Save model
        model_path = os.path.join(self.log_dir, "model.pt")
        torch.save(model.state_dict(), model_path)
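A minimal sketch of driving the Logger above for a single-output problem; the tensor shapes and values are assumptions chosen only to match how __call__ and update_tensorboard index their arguments.

import torch

# Hypothetical call with dummy values (12 candidate library terms, one output).
logger = Logger(exp_ID="demo", log_dir="runs/deepymod_demo")
mse = torch.tensor([1e-3])            # one MSE term per output
reg = torch.tensor([5e-4])            # one regularisation term per output
coeffs = [torch.rand(12, 1)]          # constraint coefficients per output
unscaled = [torch.rand(12, 1)]
estimator = [torch.rand(12, 1)]
logger(iteration=100,
       loss=mse.sum() + reg.sum(),
       MSE=mse,
       Reg=reg,
       constraint_coeffs=coeffs,
       unscaled_constraint_coeffs=unscaled,
       estimator_coeffs=estimator)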
Example #10
def train(opt, config, val_fold=0):
    # torch.cuda.set_enabled_lms(True)
    # if (torch.cuda.get_enabled_lms()):
    #     torch.cuda.set_limit_lms(11000 * 1024 * 1024)
    #     print('[LMS=On limit=' + str(torch.cuda.get_limit_lms()) + ']')

    if 'task' not in config['dataset']:
        config['dataset']['task'] = 3  # for backward compatibility
        print('Manually assigning: task 3')

    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    tb_logger = SummaryWriter(log_dir=opt.logger_name, comment='')
    experiment_path = tb_logger.get_logdir()

    # Dump configuration to experiment path
    copyfile(opt.config, os.path.join(experiment_path, 'config.json'))

    # Load Vocabulary Wrapper

    # Load data loaders
    test_transforms = T.Compose([T.Resize(256),
                    T.CenterCrop(224),
                    T.ToTensor(),
                    T.Normalize(mean=[0.485, 0.456, 0.406],
                                std=[0.229, 0.224, 0.225])])
    train_transforms = T.Compose([T.Resize(256),
                    T.RandomCrop(224),
                    T.ToTensor(),
                    T.Normalize(mean=[0.485, 0.456, 0.406],
                                std=[0.229, 0.224, 0.225])])

    train_dataset = SemEvalDataset(config, split='train', transforms=train_transforms, val_fold=val_fold)
    val_dataset = SemEvalDataset(config, split='val', transforms=test_transforms, val_fold=val_fold)

    id_intersection = set([x['id'] for x in train_dataset.targets]).intersection([x['id'] for x in val_dataset.targets])
    assert len(id_intersection) == 0

    if config['dataset']['task'] == 3:
        classes = read_classes('techniques_list_task3.txt')
    elif config['dataset']['task'] == 1:
        classes = read_classes('techniques_list_task1-2.txt')

    collate_fn = Collate(config, classes)
    if 'balanced-sampling' in config['training'] and config['training']['balanced-sampling']:
        classes_ids = [[train_dataset.class_list.index(x) for x in info['labels']] for info in train_dataset.targets]
        labels = np.zeros((len(classes_ids), len(train_dataset.class_list)))
        for l, c in zip(labels, classes_ids):
            l[c] = 1
        sampler = MultilabelBalancedRandomSampler(labels)
    else:
        sampler = None

    train_dataloader = DataLoader(train_dataset, batch_size=config['training']['bs'], shuffle=sampler is None, num_workers=opt.workers, collate_fn=collate_fn, sampler=sampler)
    val_dataloader = DataLoader(val_dataset, batch_size=config['training']['bs'], shuffle=False,
                                  num_workers=opt.workers, collate_fn=collate_fn)

    # Construct the model
    model = MemeMultiLabelClassifier(config, labels=classes)
    if torch.cuda.is_available() and not (opt.resume or opt.load_model):
        model.cuda()

    # Construct the optimizer
    if not config['text-model']['fine-tune'] and not config['image-model']['fine-tune']:
        optimizer = torch.optim.Adam([p for n, p in model.named_parameters() if 'textual_module' not in n and 'visual_module' not in n], lr=config['training']['lr'])
    else:
        if config['dataset']['task'] == 3:
            optimizer = torch.optim.Adam([
                {'params': [p for n, p in model.named_parameters() if 'textual_module' not in n and 'visual_module' not in n]},
                {'params': model.textual_module.parameters(), 'lr': config['training']['pretrained-modules-lr']},
                {'params': model.visual_module.parameters(), 'lr': config['training']['pretrained-modules-lr']}]
                , lr=config['training']['lr'])
        elif config['dataset']['task'] == 1:
            optimizer = torch.optim.Adam([
                {'params': [p for n, p in model.named_parameters() if
                            'textual_module' not in n and 'visual_module' not in n]},
                {'params': model.textual_module.parameters(), 'lr': config['training']['pretrained-modules-lr']}]
                , lr=config['training']['lr'])
    # LR scheduler
    scheduler_name = config['training']['scheduler']
    if scheduler_name == 'steplr':
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, gamma=config['training']['gamma'], milestones=config['training']['milestones'])
    elif scheduler_name is None:
        scheduler = None
    else:
        raise ValueError('{} scheduler is not available'.format(scheduler_name))


    # # optionally resume from a checkpoint
    start_epoch = 0
    # if opt.resume or opt.load_model:
    #     filename = opt.resume if opt.resume else opt.load_model
    #     if os.path.isfile(filename):
    #         print("=> loading checkpoint '{}'".format(filename))
    #         checkpoint = torch.load(filename, map_location='cpu')
    #         model.load_state_dict(checkpoint['model'], strict=False)
    #         if torch.cuda.is_available():
    #             model.cuda()
    #         if opt.resume:
    #             start_epoch = checkpoint['epoch']
    #             # best_rsum = checkpoint['best_rsum']
    #             optimizer.load_state_dict(checkpoint['optimizer'])
    #             if checkpoint['scheduler'] is not None and not opt.reinitialize_scheduler:
    #                 scheduler.load_state_dict(checkpoint['scheduler'])
    #             # Eiters is used to show logs as the continuation of another
    #             # training
    #             model.Eiters = checkpoint['Eiters']
    #             print("=> loaded checkpoint '{}' (epoch {})"
    #                   .format(opt.resume, start_epoch))
    #         else:
    #             print("=> loaded only model from checkpoint '{}'"
    #                   .format(opt.load_model))
    #     else:
    #         print("=> no checkpoint found at '{}'".format(opt.resume))

    model.train()

    # Train loop
    mean_loss = 0
    progress_bar = tqdm.trange(start_epoch, opt.num_epochs)
    progress_bar.set_description('Train')
    best_f1 = 0.0
    for epoch in progress_bar:
        for it, (image, text, text_len, labels, ids) in enumerate(train_dataloader):
            global_iteration = epoch * len(train_dataloader) + it

            if torch.cuda.is_available():
                image = image.cuda() if image is not None else None
                text = text.cuda()
                labels = labels.cuda()

            # forward the model
            optimizer.zero_grad()

            loss = model(image, text, text_len, labels)
            loss.backward()
            optimizer.step()
            mean_loss += loss.item()

            if global_iteration % opt.log_step == 0:
                mean_loss /= opt.log_step
                progress_bar.set_postfix(dict(loss='{:.2}'.format(mean_loss)))
                mean_loss = 0

            tb_logger.add_scalar("Training/Epoch", epoch, global_iteration)
            tb_logger.add_scalar("Training/Loss", loss.item(), global_iteration)
            tb_logger.add_scalar("Training/Learning_Rate", optimizer.param_groups[0]['lr'], global_iteration)

            if global_iteration % opt.val_step == 0:
                # validate (using different thresholds)
                metrics = validate(val_dataloader, model, classes, thresholds=[0.3, 0.5, 0.8])
                tb_logger.add_scalars("Validation/F1", metrics, global_iteration)
                print(metrics)
                # progress_bar.set_postfix(dict(macroF1='{:.2}'.format(metrics['macroF1_thr=0.5']), microF1='{:.2}'.format(metrics['microF1_thr=0.5'])))

                # save best model
                if metrics['macroF1_thr=0.3'] + metrics['microF1_thr=0.3'] > best_f1:
                    print('Saving best model...')
                    checkpoint = {
                        'cfg': config,
                        'epoch': epoch,
                        'model': model.joint_processing_module.state_dict() if not config['text-model']['fine-tune'] and not config['image-model']['fine-tune'] else model.state_dict()}
                        # 'optimizer': optimizer.state_dict(),
                        # 'scheduler': scheduler.state_dict()}
                    latest = os.path.join(experiment_path, 'model_best_fold{}.pt'.format(val_fold))
                    torch.save(checkpoint, latest)
                    best_f1 = metrics['macroF1_thr=0.3'] + metrics['microF1_thr=0.3']

        if scheduler is not None:
            scheduler.step()
Example #11
def main(args):
    utils.init_distributed_mode(args)
    print(args)

    device = torch.device(args.device)

    # Creating tensorboard writer
    if not args.resume:
        writer = SummaryWriter(comment=TENSORBOARD_RESULT_FILE_NAME)
    else:
        writer = SummaryWriter("")

    ######################
    # Creating test data #
    ######################
    print("Loading test data")

    viped_dataset_test = get_dataset("viped",
                                     get_transform(train=False, aug=args.aug),
                                     percentage=5,
                                     val=True)
    mot19_dataset_test = get_dataset("mot19",
                                     get_transform(train=False),
                                     val=True)
    mot17_dataset_test = get_dataset("mot17",
                                     get_transform(train=False),
                                     val=True)
    crowd_human_dataset_test = get_dataset("crowd_human",
                                           get_transform(train=False),
                                           val=True)
    city_persons_dataset_test = get_dataset("city_persons",
                                            get_transform(train=False),
                                            val=True)
    coco_persons_dataset_test = get_dataset("COCO_persons",
                                            get_transform(train=False),
                                            val=True)

    ##########################
    # Creating training data #
    ##########################
    print("Loading training data")
    train_datasets_dict = {
        'viped':
        lambda: get_dataset("viped", get_transform(train=True, aug=args.aug)),
        'mot19':
        lambda: get_dataset("mot19", get_transform(train=True)),
        'mot17':
        lambda: get_dataset("mot17", get_transform(train=True)),
        'crowd_human':
        lambda: get_dataset("crowd_human", get_transform(train=True)),
        'city_persons':
        lambda: get_dataset("city_persons", get_transform(train=True)),
        'COCO_persons':
        lambda: get_dataset("COCO_persons", get_transform(train=True)),
    }

    #################################
    # Preparing training dataloader #
    #################################
    if args.train_on in train_datasets_dict:
        # the train dataset is a normal single dataset
        train_dataset = train_datasets_dict[args.train_on]()
        train_dataloader = DataLoader(
            train_dataset,
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.workers,
            collate_fn=train_dataset.standard_collate_fn)
        print('Using training dataset: {}'.format(args.train_on))
    elif ',' in args.train_on:
        assert args.tgt_images_in_batch > 0, "Using mixed training. " \
                                             "You need to specify the args.tgt_images_in_batch parameter!"
        # the train dataset is an ensamble of datasets
        source_dataset_name, target_dataset_name = args.train_on.split(',')
        train_dataset = DatasetsEnsemble(
            train_datasets_dict[source_dataset_name](),
            train_datasets_dict[target_dataset_name]())
        train_dataloader = DataLoader(
            train_dataset,
            collate_fn=train_dataset.source_dataset.standard_collate_fn,
            num_workers=args.workers,
            batch_sampler=EnsembleBatchSampler(
                train_dataset,
                batch_size=args.batch_size,
                shuffle=True,
                tgt_imgs_in_batch=args.tgt_images_in_batch))
        print(
            'Using mixed training datasets. Source: {}, Target: {}. In every batch, {}/{} are from {}'
            .format(source_dataset_name, target_dataset_name,
                    args.tgt_images_in_batch, args.batch_size,
                    target_dataset_name))
    else:
        raise ValueError('Dataset not known!')

    ##############################
    # Preparing test dataloaders #
    ##############################

    data_loader_viped_test = DataLoader(
        viped_dataset_test,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        collate_fn=viped_dataset_test.standard_collate_fn)

    data_loader_mot19_test = DataLoader(
        mot19_dataset_test,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        collate_fn=mot19_dataset_test.standard_collate_fn)

    data_loader_mot17_test = DataLoader(
        mot17_dataset_test,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        collate_fn=mot17_dataset_test.standard_collate_fn)

    data_loader_crowd_human_test = DataLoader(
        crowd_human_dataset_test,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        collate_fn=crowd_human_dataset_test.standard_collate_fn)

    data_loader_city_persons_test = DataLoader(
        city_persons_dataset_test,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        collate_fn=city_persons_dataset_test.standard_collate_fn)

    data_loader_coco_persons_test = DataLoader(
        coco_persons_dataset_test,
        shuffle=False,
        num_workers=args.workers,
        collate_fn=coco_persons_dataset_test.standard_collate_fn)

    # Creating model
    print("Creating model")
    model, backbone = get_model_detection(num_classes=1,
                                          model=args.model,
                                          pretrained=args.pretrained)

    # Putting model on the device and setting train mode
    model.to(device)
    model.train()

    # freeze the backbone parameters, if needed
    if backbone is not None and args.freeze_backbone:
        for param in backbone.parameters():
            param.requires_grad = False
        print('Backbone is frozen!')

    # construct an optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    if args.optimizer == "sgd":
        optimizer = torch.optim.SGD(params,
                                    lr=args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)
    elif args.optimizer == "adam":
        optimizer = torch.optim.Adam(
            params=params,
            lr=args.lr,
        )
    else:
        print("Optimizer not available")
        exit(1)

    # and a learning rate scheduler
    if args.lr_scheduler == "step_lr":
        lr_scheduler = torch.optim.lr_scheduler.StepLR(
            optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma)
    elif args.lr_scheduler == "plateau":
        lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='max', patience=args.lr_patience, verbose=True)
    else:
        print("LR scheduler not available")
        exit(1)

    # Defining a warm-up lr scheduler
    warmup_iters = min(1000, len(train_dataloader) - 1)
    warmup_factor = 1. / 1000
    warmup_lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters,
                                                    warmup_factor)

    # Loading checkpoint
    start_epoch = 0
    train_step = -1
    best_viped_ap, best_mot19_ap, best_mot17_ap, best_crowdhuman_ap, best_citypersons_ap, best_cocopersons_ap \
        = 0, 0, 0, 0, 0, 0
    if args.resume:
        print("Resuming from checkpoint")
        checkpoint = torch.load(args.resume)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        warmup_lr_scheduler.load_state_dict(checkpoint['warmup_lr_scheduler'])
        start_epoch = checkpoint['epoch']
        train_step = checkpoint['iteration']
        best_viped_ap = checkpoint['best_viped_ap']
        best_mot19_ap = checkpoint['best_mot19_ap']
        best_mot17_ap = checkpoint['best_mot17_ap']
        best_crowdhuman_ap = checkpoint['best_crowdhuman_ap']
        best_citypersons_ap = checkpoint['best_citypersons_ap']
        best_cocopersons_ap = checkpoint['best_cocopersons_ap']

    # Cross-check that the backbone has really been frozen
    if backbone is not None and args.freeze_backbone:
        for param in backbone.parameters():
            assert not param.requires_grad, "Backbone does not seem to be frozen correctly!"

    # Train
    print("Start training")
    for epoch in range(start_epoch, args.epochs):
        model.train()
        metric_logger = utils.MetricLogger(delimiter="  ")
        metric_logger.add_meter(
            'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
        header = 'Epoch: [{}]'.format(epoch)

        for images, targets in metric_logger.log_every(
                train_dataloader, print_freq=args.print_freq, header=header):
            train_step += 1
            images = list(image.to(device) for image in images)
            targets = [{k: v.to(device)
                        for k, v in t.items()} for t in targets]

            loss_dict = model(images, targets)

            losses = sum(loss for loss in loss_dict.values())

            # reduce losses over all GPUs for logging purposes
            loss_dict_reduced = utils.reduce_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())

            loss_value = losses_reduced.item()

            if not math.isfinite(loss_value):
                print("Loss is {}, stopping training".format(loss_value))
                print(loss_dict_reduced)
                sys.exit(1)

            optimizer.zero_grad()
            losses.backward()
            # clip norm
            torch.nn.utils.clip_grad_norm_(model.parameters(), 50)
            optimizer.step()

            if epoch == 0 and train_step < warmup_iters:
                warmup_lr_scheduler.step()

            metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
            metric_logger.update(lr=optimizer.param_groups[0]["lr"])

            if train_step % args.log_loss == 0:
                writer.add_scalar('Training/Learning Rate',
                                  optimizer.param_groups[0]["lr"], train_step)
                writer.add_scalar('Training/Reduced Sum Losses',
                                  losses_reduced, train_step)
                writer.add_scalars('Training/All Losses', loss_dict,
                                   train_step)

            if (train_step % args.save_freq == 0 and train_step != 0) or \
               (args.pretrained and train_step < 5*args.save_freq and train_step % 200 == 0 and train_step != 0) \
                    or train_step == 100:
                # evaluate on the test datasets
                print("Validation viped Dataset")
                viped_coco_evaluator = evaluate(model,
                                                data_loader_viped_test,
                                                device=device,
                                                max_dets=args.max_dets)
                print("Validation mot19 Dataset")
                mot19_coco_evaluator = evaluate(model,
                                                data_loader_mot19_test,
                                                device=device,
                                                max_dets=args.max_dets)
                print("Validation mot17 Dataset")
                mot17_coco_evaluator = evaluate(model,
                                                data_loader_mot17_test,
                                                device=device,
                                                max_dets=args.max_dets)
                print("Validation crowdhuman Dataset")
                crowdhuman_coco_evaluator = evaluate(
                    model,
                    data_loader_crowd_human_test,
                    device=device,
                    max_dets=args.max_dets)
                print("Validation citypersons Dataset")
                citypersons_coco_evaluator = evaluate(
                    model,
                    data_loader_city_persons_test,
                    device=device,
                    max_dets=args.max_dets)
                print("Validation COCO Persons Dataset")
                cocopersons_coco_evaluator = evaluate(
                    model,
                    data_loader_coco_persons_test,
                    device=device,
                    max_dets=args.max_dets)

                # save using tensorboard
                viped_ap, mot19_ap, mot17_ap, crowdhuman_ap, citypersons_ap, cocopersons_ap = \
                    None, None, None, None, None, None
                for iou_type, coco_eval in viped_coco_evaluator.coco_eval.items(
                ):
                    viped_ap = coco_eval.stats[1]
                for iou_type, coco_eval in mot19_coco_evaluator.coco_eval.items(
                ):
                    mot19_ap = coco_eval.stats[1]
                for iou_type, coco_eval in mot17_coco_evaluator.coco_eval.items(
                ):
                    mot17_ap = coco_eval.stats[1]
                for iou_type, coco_eval in crowdhuman_coco_evaluator.coco_eval.items(
                ):
                    crowdhuman_ap = coco_eval.stats[1]
                for iou_type, coco_eval in citypersons_coco_evaluator.coco_eval.items(
                ):
                    citypersons_ap = coco_eval.stats[1]
                for iou_type, coco_eval in cocopersons_coco_evaluator.coco_eval.items(
                ):
                    cocopersons_ap = coco_eval.stats[1]
                writer.add_scalar('COCO mAP Validation/ViPeD', viped_ap,
                                  train_step)
                writer.add_scalar('COCO mAP Validation/MOT19', mot19_ap,
                                  train_step)
                writer.add_scalar('COCO mAP Validation/MOT17', mot17_ap,
                                  train_step)
                writer.add_scalar('COCO mAP Validation/CrowdHuman',
                                  crowdhuman_ap, train_step)
                writer.add_scalar('COCO mAP Validation/CityPersons',
                                  citypersons_ap, train_step)
                writer.add_scalar('COCO mAP Validation/COCOPersons',
                                  cocopersons_ap, train_step)

                # Saving the best models, if improved
                if viped_ap > best_viped_ap:
                    best_viped_ap = viped_ap
                    save_checkpoint(
                        {
                            'model':
                            model.state_dict(),
                            'optimizer':
                            optimizer.state_dict(),
                            'lr_scheduler':
                            lr_scheduler.state_dict(),
                            'warmup_lr_scheduler':
                            warmup_lr_scheduler.state_dict()
                            if warmup_lr_scheduler is not None else None,
                            'epoch':
                            epoch,
                            'iteration':
                            train_step,
                            'best_viped_ap':
                            best_viped_ap,
                            'best_mot19_ap':
                            best_mot19_ap,
                            'best_mot17_ap':
                            best_mot17_ap,
                            'best_crowdhuman_ap':
                            best_crowdhuman_ap,
                            'best_citypersons_ap':
                            best_citypersons_ap,
                            'best_cocopersons_ap':
                            best_cocopersons_ap,
                        },
                        writer.get_logdir(),
                        best_model="viped")
                if mot19_ap > best_mot19_ap:
                    best_mot19_ap = mot19_ap
                    save_checkpoint(
                        {
                            'model':
                            model.state_dict(),
                            'optimizer':
                            optimizer.state_dict(),
                            'lr_scheduler':
                            lr_scheduler.state_dict(),
                            'warmup_lr_scheduler':
                            warmup_lr_scheduler.state_dict()
                            if warmup_lr_scheduler is not None else None,
                            'epoch':
                            epoch,
                            'iteration':
                            train_step,
                            'best_viped_ap':
                            best_viped_ap,
                            'best_mot19_ap':
                            best_mot19_ap,
                            'best_mot17_ap':
                            best_mot17_ap,
                            'best_crowdhuman_ap':
                            best_crowdhuman_ap,
                            'best_citypersons_ap':
                            best_citypersons_ap,
                            'best_cocopersons_ap':
                            best_cocopersons_ap,
                        },
                        writer.get_logdir(),
                        best_model="mot19")
                if mot17_ap > best_mot17_ap:
                    best_mot17_ap = mot17_ap
                    save_checkpoint(
                        {
                            'model':
                            model.state_dict(),
                            'optimizer':
                            optimizer.state_dict(),
                            'lr_scheduler':
                            lr_scheduler.state_dict(),
                            'warmup_lr_scheduler':
                            warmup_lr_scheduler.state_dict()
                            if warmup_lr_scheduler is not None else None,
                            'epoch':
                            epoch,
                            'iteration':
                            train_step,
                            'best_viped_ap':
                            best_viped_ap,
                            'best_mot19_ap':
                            best_mot19_ap,
                            'best_mot17_ap':
                            best_mot17_ap,
                            'best_crowdhuman_ap':
                            best_crowdhuman_ap,
                            'best_citypersons_ap':
                            best_citypersons_ap,
                            'best_cocopersons_ap':
                            best_cocopersons_ap,
                        },
                        writer.get_logdir(),
                        best_model="mot17")
                if crowdhuman_ap > best_crowdhuman_ap:
                    best_crowdhuman_ap = crowdhuman_ap
                    save_checkpoint(
                        {
                            'model':
                            model.state_dict(),
                            'optimizer':
                            optimizer.state_dict(),
                            'lr_scheduler':
                            lr_scheduler.state_dict(),
                            'warmup_lr_scheduler':
                            warmup_lr_scheduler.state_dict()
                            if warmup_lr_scheduler is not None else None,
                            'epoch':
                            epoch,
                            'iteration':
                            train_step,
                            'best_viped_ap':
                            best_viped_ap,
                            'best_mot19_ap':
                            best_mot19_ap,
                            'best_mot17_ap':
                            best_mot17_ap,
                            'best_crowdhuman_ap':
                            best_crowdhuman_ap,
                            'best_citypersons_ap':
                            best_citypersons_ap,
                            'best_cocopersons_ap':
                            best_cocopersons_ap,
                        },
                        writer.get_logdir(),
                        best_model="crowdhuman")
                if citypersons_ap > best_citypersons_ap:
                    best_citypersons_ap = citypersons_ap
                    save_checkpoint(
                        {
                            'model':
                            model.state_dict(),
                            'optimizer':
                            optimizer.state_dict(),
                            'lr_scheduler':
                            lr_scheduler.state_dict(),
                            'warmup_lr_scheduler':
                            warmup_lr_scheduler.state_dict()
                            if warmup_lr_scheduler is not None else None,
                            'epoch':
                            epoch,
                            'iteration':
                            train_step,
                            'best_viped_ap':
                            best_viped_ap,
                            'best_mot19_ap':
                            best_mot19_ap,
                            'best_mot17_ap':
                            best_mot17_ap,
                            'best_crowdhuman_ap':
                            best_crowdhuman_ap,
                            'best_citypersons_ap':
                            best_citypersons_ap,
                            'best_cocopersons_ap':
                            best_cocopersons_ap,
                        },
                        writer.get_logdir(),
                        best_model="citypersons")

                # Saving model
                save_checkpoint(
                    {
                        'model':
                        model.state_dict(),
                        'optimizer':
                        optimizer.state_dict(),
                        'lr_scheduler':
                        lr_scheduler.state_dict(),
                        'warmup_lr_scheduler':
                        warmup_lr_scheduler.state_dict()
                        if warmup_lr_scheduler is not None else None,
                        'epoch':
                        epoch,
                        'iteration':
                        train_step,
                        'best_viped_ap':
                        best_viped_ap,
                        'best_mot19_ap':
                        best_mot19_ap,
                        'best_mot17_ap':
                        best_mot17_ap,
                        'best_crowdhuman_ap':
                        best_crowdhuman_ap,
                        'best_citypersons_ap':
                        best_citypersons_ap,
                        'best_cocopersons_ap':
                        best_cocopersons_ap,
                    }, writer.get_logdir())

                # Setting again to train mode
                model.train()

            lr_scheduler.step()
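All six save_checkpoint calls above serialize the same payload. A possible refactoring sketch, where the make_checkpoint helper and its argument names are hypothetical and not part of the original code:

# Hypothetical helper (sketch only, not from the original repository):
# build the checkpoint payload once per save, so each branch stays a single call.
def make_checkpoint(model, optimizer, lr_scheduler, warmup_lr_scheduler,
                    epoch, train_step, best_aps):
    ckpt = {
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'lr_scheduler': lr_scheduler.state_dict(),
        'warmup_lr_scheduler': (warmup_lr_scheduler.state_dict()
                                if warmup_lr_scheduler is not None else None),
        'epoch': epoch,
        'iteration': train_step,
    }
    ckpt.update(best_aps)  # e.g. {'best_viped_ap': ..., 'best_mot19_ap': ..., ...}
    return ckpt

# Usage sketch inside the loop above:
# if viped_ap > best_viped_ap:
#     best_viped_ap = viped_ap
#     save_checkpoint(make_checkpoint(...), writer.get_logdir(), best_model="viped")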
Example #12
0
def main():
    args = config_parser(base_parser()).parse_args()
    if args.detect_anomalies:
        torch.autograd.set_detect_anomaly(True)
    torch.manual_seed(42)
    torch.set_default_dtype(torch.float32)

    # load graph pdists
    g_pdists, g = load_pdists(args)
    g_pdists = torch.Tensor(g_pdists)
    n = g_pdists.shape[0]
    d = args.manifold_dim

    # masks used to get the upper-triangular part of (i) the pairwise distance
    # matrices, and (ii) the embedding matrices themselves
    mask = torch.triu_indices(n, n, 1)
    e_mask = torch.triu_indices(d, d)

    # we only use the upper-triangular part
    g_pdists = g_pdists[mask[0], mask[1]]
    # scale if needed
    if args.min_max_scale:
        g_pdists = min_max_scale(g_pdists, 1, 10)
    g_sq_pdists = g_pdists.pow(2)
    # keep a numpy copy for computing metrics
    g_pdists_np = g_pdists.cpu().numpy()

    # embedding initializations
    X_init = sample_init_points(n, d)
    # put them on GPU if available
    if torch.cuda.is_available():
        with Timer('copying data to GPU'):
            X_init = X_init.pin_memory().cuda()
            g_sq_pdists = g_sq_pdists.pin_memory().cuda()

    # the embedding parameters we optimize for
    spd = geoopt.SymmetricPositiveDefinite(wmin=1e-8, wmax=1e8)
    X = geoopt.ManifoldParameter(X_init, manifold=spd)

    # the distance function
    dist_fn = manifold_sq_pdists_stein if args.stein_div else manifold_sq_pdists

    # setup the optimizer
    # TODO(ccruceru): Investigate the momentum issues.
    optim = geoopt.optim.RiemannianSGD([X], lr=0.5)
    lr_scheduler = ReduceLROnPlateau(optim,
                                     patience=20,
                                     factor=0.5,
                                     min_lr=1e-8,
                                     verbose=args.verbose)

    # training settings
    writer = SummaryWriter()
    n_epochs = 1500
    save_every_epochs = 10

    def criterion(epoch):
        mdists = dist_fn(X)
        l1 = (mdists / g_sq_pdists - 1.0).abs().sum()
        eps = 1.0 / (epoch + 1)
        l2 = (g_sq_pdists / (mdists + eps) - 1.0).abs().sum()
        return (l1 + l2) / n

    def run_epoch(epoch):
        optim.zero_grad()
        loss = criterion(epoch)
        loss.backward()
        optim.step()
        lr_scheduler.step(loss)

        return loss

    def compute_metrics():
        with torch.no_grad():
            man_pdists_np = dist_fn(X).sqrt().cpu().numpy()
        ad = average_distortion(g_pdists_np, man_pdists_np)
        if g is None:
            return ad, None

        # TODO(ccruceru): Make sure this is correct. Try to reproduce the
        # result from the ref. paper on 10D Euclidean manifold.
        man_pdists_sym = pdists_vec_to_sym(man_pdists_np, n)
        mean_ap = mean_average_precision(g, man_pdists_sym)
        return ad, mean_ap

    with Timer('training'):
        for epoch in range(n_epochs):
            # early break if we reached the minimum learning rate
            if optim.param_groups[0]['lr'] <= 2 * lr_scheduler.min_lrs[0]:
                break
            start = time.time()
            loss = run_epoch(epoch)
            stop = time.time()
            if epoch % save_every_epochs != 0:
                continue

            # show it
            if args.verbose:
                print('epoch {:5}, loss {:.10f}, time {}'.format(
                    epoch, loss.item(), stop - start))

            # monitoring
            with torch.no_grad():
                logw = eig(X).log()
                ks = logw.max(1).values - logw.min(1).values
                ad, mean_ap = compute_metrics()
            writer.add_scalar('loss', loss, epoch)
            writer.add_histogram('log_lambda', logw.flatten(), epoch)
            writer.add_histogram('log_k_X', ks, epoch)
            writer.add_embedding(X[:, e_mask[0], e_mask[1]], global_step=epoch)
            # metrics
            writer.add_scalar('avg_distortion', ad, epoch)
            if mean_ap:
                writer.add_scalar('mAP', mean_ap, epoch)

    torch.save(X, os.path.join(writer.get_logdir(), 'x_opt.pt'))

    # final metrics
    ad, mean_ap = compute_metrics()
    print('Average distortion: ', ad)
    if mean_ap:
        print('mAP: ', mean_ap)
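The pdists_vec_to_sym helper used for the mAP computation is not shown in the snippet. A minimal sketch of such a helper, assuming the distance vector follows the row-major upper-triangular order produced by torch.triu_indices(n, n, 1):

import numpy as np

def pdists_vec_to_sym(pdists_vec, n):
    # Sketch only: rebuild a symmetric (n, n) distance matrix from the
    # upper-triangular entries (diagonal excluded).
    mat = np.zeros((n, n), dtype=pdists_vec.dtype)
    iu = np.triu_indices(n, k=1)   # same row-major order as torch.triu_indices(n, n, 1)
    mat[iu] = pdists_vec
    mat += mat.T                   # mirror to the lower triangle; diagonal stays zero
    return mat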
Example #13
0
def main():

    from config import config_enhanced
    writer = SummaryWriter(os.path.join('runs', name_dir(config_enhanced)))

    torch.multiprocessing.freeze_support()

    print("Current config_enhanced is:")
    pprint(config_enhanced)
    writer.add_text("config", str(config_enhanced))

    save_path = str(writer.get_logdir())
    try:
        os.makedirs(save_path)
    except OSError:
        pass

    # with open(os.path.join(save_path, "config.json"), 'w') as outfile:
    #     json.dump(config_enhanced, outfile)

    torch.manual_seed(config_enhanced['seed'])
    torch.cuda.manual_seed_all(config_enhanced['seed'])

    use_cuda = torch.cuda.is_available()
    if torch.cuda.is_available() and config_enhanced['cuda_deterministic']:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    # torch.set_num_threads(1)
    if use_cuda:
        device = torch.device('cuda')
        print("using GPU")
    else:
        device = torch.device('cpu')
        print("using CPU")

    if config_enhanced['num_processes'] == "num_cpu":
        num_processes = multiprocessing.cpu_count() - 1
    else:
        num_processes = config_enhanced['num_processes']

    # if torch.cuda.device_count() > 1:
    #     print("Let's use", torch.cuda.device_count(), "GPUs!")
    #     # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
    #     model = torch.nn.DataParallel(model)

    env = CholeskyTaskGraph(**config_enhanced['env_settings'])
    envs = VectorEnv(env, num_processes)
    envs.reset()

    model = SimpleNet(**config_enhanced["network_parameters"])
    if config_enhanced["model_path"]:
        model.load_state_dict(torch.load(config_enhanced['model_path']))

    actor_critic = Policy(model, envs.action_space, config_enhanced)
    actor_critic = actor_critic.to(device)

    if config_enhanced['agent'] == 'PPO':
        print("using PPO")
        agent_settings = config_enhanced['PPO_settings']
        agent = PPO(
            actor_critic,
            **agent_settings)

    elif config_enhanced['agent'] == 'A2C':
        print("using A2C")
        agent_settings = config_enhanced['A2C_settings']
        agent = A2C_ACKTR(
            actor_critic,
            **agent_settings)

    rollouts = RolloutStorage(config_enhanced['trajectory_length'], num_processes,
                              env.observation_space.shape, env.action_space)

    obs = envs.reset()
    obs = torch.tensor(obs, device=device)
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        config_enhanced['num_env_steps']) // config_enhanced['trajectory_length'] // num_processes
    for j in range(num_updates):

        if config_enhanced['use_linear_lr_decay']:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates, config_enhanced['network']['lr'])

        for step in tqdm(range(config_enhanced['trajectory_length'])):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob = actor_critic.act(
                    rollouts.obs[step])
            actions = action.squeeze(-1).detach().cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(actions)
            obs = torch.tensor(obs, device=device)
            reward = torch.tensor(reward, device=device).unsqueeze(-1)
            done = torch.tensor(done, device=device)

            n_step = (j * config_enhanced['trajectory_length'] + step) * num_processes
            for info in infos:
                if 'episode' in info.keys():
                    reward_episode = info['episode']['r']
                    episode_rewards.append(reward_episode)
                    writer.add_scalar('reward', reward_episode, n_step)
                    writer.add_scalar('solved',
                                      int(info['episode']['length'] == envs.envs[0].max_steps),
                                      n_step)

            # If done then clean the history of observations.
            masks = torch.FloatTensor(
                [[0.0] if done_ else [1.0] for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1]).detach()

        rollouts.compute_returns(next_value, config_enhanced["use_gae"], config_enhanced["gamma"],
                                 config_enhanced['gae_lambda'], config_enhanced['use_proper_time_limits'])

        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        writer.add_scalar('value loss', value_loss, n_step)
        writer.add_scalar('action loss', action_loss, n_step)
        writer.add_scalar('dist_entropy', dist_entropy, n_step)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % config_enhanced['save_interval'] == 0
                or j == num_updates - 1):
            save_path = str(writer.get_logdir())
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save(actor_critic, os.path.join(save_path, "model.pth"))

        if j % config_enhanced['log_interval'] == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                    .format(j, n_step,
                            int(n_step / (end - start)),
                            len(episode_rewards), np.mean(episode_rewards),
                            np.median(episode_rewards), np.min(episode_rewards),
                            np.max(episode_rewards), dist_entropy, value_loss,
                            action_loss))

        if (config_enhanced['evaluate_every'] is not None and len(episode_rewards) > 1
                and j % config_enhanced['evaluate_every'] == 0):
            eval_reward = evaluate(actor_critic, env, config_enhanced, device)
            writer.add_scalar("eval reward", eval_reward, n_step)
Example #14
0
class MainLogger:
    def __init__(self, log_dir=None,comment='', name='pkgpl', level=logging.DEBUG):
        self.start=timer()
        # pytorch tensorboard writer - master only
        self.writer = SummaryWriter(log_dir=log_dir,comment=comment)
        self.get_logdir = self.writer.get_logdir
        self.add_hparams = self.writer.add_hparams
        self.add_text = self.writer.add_text
        self.add_scalar = self.writer.add_scalar
        self.add_scalars = self.writer.add_scalars
        self.add_figure = self.writer.add_figure
        self.flush = self.writer.flush
        self.loss0 = dict()

        self.add_text('Name', name)

        # python logger
        self.logger = logging.getLogger(name)
        self.stream_logger = logging.getLogger('tty')

        # file logger
        fileHandler = logging.FileHandler("%s/log.txt"%(self.get_logdir()))
        fileHandler.setFormatter(formatter)
        self.logger.addHandler(fileHandler)
        self.logger.setLevel(level)

        self.write = self.logger.debug
        self.debug = self.logger.debug
        self.info  = self.logger.info
        self.warning = self.logger.warning
        self.error = self.logger.error
        self.critical = self.logger.critical

        # stream logger
        streamHandler = logging.StreamHandler()
        streamHandler.setFormatter(formatter)
        self.stream_logger.addHandler(streamHandler)
        self.stream_logger.setLevel(level)

    def print(self, msg):
        self.logger.debug(msg)
        self.stream_logger.debug(msg)

    def get_logdir(self):
        return self.writer.get_logdir()

    def log_hparams(self, name, hparams_dict):
        self.print("%s:\n%s"%(name,yaml.dump(hparams_dict)))
        self.add_text(name, str(hparams_dict))

    def log_loss(self, loss, epoch, name='loss',filename=None, add_figure=True,log_norm=True):
        if filename is None:
            filename = name+'.txt'
        # file output
        strftime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        with open("%s/%s"%(self.get_logdir(),filename),'a') as fl:
            fl.write("%d %9.3e [%s]\n"%(epoch,loss,strftime))
        # tensorboard
        if add_figure:
            self.add_scalar(name, loss, epoch)
            if log_norm:
                if self.loss0.get(name) is None:
                    self.loss0[name] = loss
                self.add_scalar('normalized_%s'%name, loss/self.loss0[name], epoch)

    def log_gradient(self, grad, epoch, h, filename='grad.', add_figure=True,figurename='gradient',perc=99.,figsize=[15,4]):
        # file output
        g = grad.to('cpu').numpy()
        g.tofile("%s/%s%04d"%(self.get_logdir(),filename,epoch))
        # tensorboard
        if add_figure:
            fig=plot_mig(perc_clip(g,perc),h,figsize=figsize)
            self.add_figure(figurename,fig,epoch)
            plt.close(fig)

    def log_velocity(self, vel, epoch, h, filename='vel.', add_figure=True,figurename='velocity',vmin=None,vmax=None,figsize=[15,4]):
        # file output
        v = vel.to('cpu').detach().numpy()
        v.tofile("%s/%s%04d"%(self.get_logdir(),filename,epoch))
        # tensorboard
        if add_figure:
            fig = plot_vel(v,h,vmin,vmax,figsize=figsize)
            self.add_figure(figurename,fig,epoch)
            plt.close(fig)

    def output(self,model,epoch,loss,log_gradient=True):
        self.log_loss(loss,epoch)

        if log_gradient:
            grad = model.gradient()
            grad_norm = grad.norm(float('inf')).item()
            self.write("epoch %d, loss %9.3e, gnorm %9.3e"%(epoch,loss,grad_norm))
        else:
            self.write("epoch %d, loss %9.3e"%(epoch,loss))

        if epoch < model.hparams.skip_output or epoch % model.hparams.skip_output == 0:
            if log_gradient:
                self.log_gradient(grad,epoch,model.h)
            self.log_velocity(model.velocity(),epoch,model.h, vmin=model.hparams.vmin, vmax=model.hparams.vmax)
            self.flush()

#    def final(self,args,loss):
#        #hparam_dict={'lr':args.lr,'grad_norm':args.grad_norm,
#        #        'optimizer':args.optimizer,'momentum':args.momentum,
#        #        'max_epochs':args.max_epochs}
#        hparam_dict = vars(args)
#        metric_dict={'final_loss':loss}
#        self.add_hparams(hparam_dict,metric_dict)

    def progress_bar(self,count,total,status=''):
        # from https://gist.github.com/vladignatyev/06860ec2040cb497f0f3
        tavg = (timer()-self.start)/(count+1)

        bar_len=60
        frac = count/(total-1)
        filled_len = int(round(bar_len*frac))
        percents = round(100*frac,1)
        bar = '='*filled_len + '-'*(bar_len-filled_len)

        sys.stdout.write('[%s] %s%% (%s/%s,%7.2fs/it) %s\r'%(bar,percents,count,total,tavg,status))
        sys.stdout.flush()
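MainLogger relies on a module-level formatter (a logging.Formatter) that is not shown in the snippet. A minimal usage sketch under that assumption, with made-up log values:

import logging

# assumed module-level formatter used inside MainLogger (not shown above)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

logger = MainLogger(comment='_demo', name='pkgpl')
for epoch in range(3):
    logger.log_loss(1.0 / (epoch + 1), epoch)   # appends to loss.txt and logs to TensorBoard
logger.print('done, logdir = %s' % logger.get_logdir())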
Example #15
0
    def train_net(self,
                  train_loader,
                  n_epoc,
                  checkpoint_factor,
                  arg,
                  if_continue,
                  checkpoint_epoc=None,
                  log_dir=None):  # TODO arg is not good to pass here
        if log_dir is None:
            if self.if_condition:
                writer = SummaryWriter(
                    comment=f'_CWGAN_GP_{self.data_name}')  # TODO: add hyperparameters
            else:
                writer = SummaryWriter(comment=f'_WGAN_GP_{self.data_name}')
        else:
            writer = SummaryWriter(log_dir=log_dir)

        if self.if_condition:
            test_lbl = torch.arange(self.n_class, device=DEVICE).reshape(-1, 1)
            test_lbl = test_lbl.repeat(1, 8)
            test_lbl = test_lbl.reshape(-1)
            test_noise = self.generate_noise(test_lbl.shape[0])
        else:
            test_noise = self.generate_noise(64)
        n_sample = len(train_loader.dataset)

        start_epoch = checkpoint_epoc + 1 if if_continue else 1
        for i in range(start_epoch, n_epoc + 1):
            epoc_l_d, epoc_l_g, epoc_score_p, epoc_score_f1, epoc_score_f2 = 0., 0., 0., 0., 0.
            self.conv_gen.train(), self.conv_dis.train()
            with tqdm(total=len(train_loader), desc=f"epoc: {i}") as pbar:
                for k, (real_img, real_lbl) in enumerate(train_loader):
                    if IF_CUDA:
                        real_img = real_img.cuda()
                        real_lbl = real_lbl.cuda()
                    if self.if_condition:
                        d_loss, p_score, f_score1 = self.train_d_step(
                            real_img, real_lbl)
                        g_loss, f_score2 = self.train_g_step(
                            real_img.shape[0], real_lbl)
                    else:
                        d_loss, p_score, f_score1 = self.train_d_step(
                            real_img, None)
                        g_loss, f_score2 = self.train_g_step(
                            real_img.shape[0], None)

                    batch_size = real_img.shape[0]
                    epoc_l_d += d_loss * batch_size
                    epoc_l_g += g_loss * batch_size
                    epoc_score_p += p_score * batch_size
                    epoc_score_f1 += f_score1 * batch_size
                    epoc_score_f2 += f_score2 * batch_size

                    pbar.set_postfix({
                        "d_loss": d_loss,
                        "g_loss": g_loss,
                        "p_score": p_score,
                        "f_score D": f_score1,
                        'G': f_score2
                    })
                    pbar.update()

            epoc_l_d /= n_sample
            epoc_l_g /= n_sample
            epoc_score_p /= n_sample
            epoc_score_f1 /= n_sample
            epoc_score_f2 /= n_sample
            pbar.set_postfix({
                "epoch:  d_loss": epoc_l_d,
                "g_loss": epoc_l_g,
                "p_score": epoc_score_p,
                "f_score D": epoc_score_f1,
                'G': epoc_score_f2
            })

            writer.add_scalar('loss/generator', epoc_l_g, i)
            writer.add_scalar('loss/discriminator', epoc_l_d, i)
            writer.add_scalar('score/real', epoc_score_p, i)
            writer.add_scalar('score/fake_D', epoc_score_f1, i)
            writer.add_scalar('score/fake_G', epoc_score_f2, i)

            self.conv_gen.eval(), self.conv_dis.eval()
            if self.if_condition:
                test_img = self.conv_gen(test_noise, test_lbl)
            else:
                test_img = self.conv_gen(test_noise, None)
            # map generator output from [-1, 1] back to [0, 1] for visualization
            test_img = (test_img + 1.0) / 2.0
            test_img = test_img.reshape(test_noise.shape[0], *self.img_shape)
            writer.add_images('img', test_img, i)
            writer.flush()

            if i % checkpoint_factor == 0:
                checkpoint_dict = {
                    'arg': arg.__dict__,
                    'G': self.conv_gen.state_dict(),
                    'D': self.conv_dis.state_dict(),
                    'epoch': i,
                    'torch_seed': torch.initial_seed(),
                    'log_dir': writer.get_logdir(),
                    'opt_D': self.opt_D.state_dict(),
                    'opt_G': self.opt_G.state_dict()
                }
                save_path = os.path.join(writer.get_logdir(), f'ckpt{i}.pth')
                torch.save(checkpoint_dict, save_path)
        writer.close()
        return
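One way to resume training from a checkpoint written by the loop above; here gan, arg and train_loader stand for the trained object and its inputs, and the checkpoint path is a placeholder, all assumptions rather than code from the original project:

import torch

ckpt = torch.load('runs/<logdir>/ckpt100.pth', map_location='cpu')
gan.conv_gen.load_state_dict(ckpt['G'])
gan.conv_dis.load_state_dict(ckpt['D'])
gan.opt_G.load_state_dict(ckpt['opt_G'])
gan.opt_D.load_state_dict(ckpt['opt_D'])
# continue logging into the same TensorBoard run and epoch counter
gan.train_net(train_loader, n_epoc=200, checkpoint_factor=10, arg=arg,
              if_continue=True, checkpoint_epoc=ckpt['epoch'],
              log_dir=ckpt['log_dir'])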
Example #16
0
    """

    # Extract all feature maps
    # Hint: use conv_layer_indices to access
    with torch.no_grad():
        feature_maps = []
        x = input
        for layer in model.features:
            x = layer(x)
            if isinstance(layer, torch.nn.ReLU):
                feature_maps.append(x)
    return feature_maps

from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter()
output_path = writer.get_logdir()
import os, math, random
from torchvision import utils

with torch.no_grad():
    # visualize weight
    selected_channel = [-1, -1, -1, -1, -1]
    for layer_idx, channel in zip(conv_layer_indices, selected_channel):
        weight = extract_filter(layer_idx, model)
        n, c, h, w = weight.shape
        if channel == -1:
            channel = random.randint(0, c - 1)
        weight = weight[:, channel, :, :].unsqueeze(1)
        nrow = int(math.sqrt(n))
        print(h, w)
        grid = utils.make_grid(weight, nrow=nrow, normalize=True, scale_each=True)
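        # Sketch (not part of the original snippet): the grid built above is not
        # logged anywhere in this excerpt; one way to write it into the run
        # directory returned by writer.get_logdir() would be:
        writer.add_image('filters/layer_{}'.format(layer_idx), grid, global_step=0)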
Example #17
0
import logging

import torch
from torch.utils.tensorboard import SummaryWriter


def setup_logger(level=logging.DEBUG):
    formatter = logging.Formatter(
        fmt='%(asctime)s - %(levelname)s - %(module)s - %(message)s')

    handler = logging.StreamHandler()
    handler.setFormatter(formatter)

    _logger = logging.getLogger(__name__)
    _logger.setLevel(level)
    _logger.addHandler(handler)
    return _logger


if __name__ == '__main__':
    writer = SummaryWriter("/tmp/runs/test_logger")
    print(writer.get_logdir())

    video = torch.rand((4, 20, 3, 100, 100), dtype=torch.float)  # N,T,C,H,W
    video = torch.clamp(video, 0., 1.)

    x = range(100)
    for i in x:
        writer.add_scalar('y=2x', i * 2, i)

    writer.add_video('videos', video, fps=20)
    writer.close()
Example #18
0
class DefaultWriter:
    """
    Default writer to be used by the agents, optionally wraps an instance of tensorboard.SummaryWriter.

    Can be used in the fit() method of the agents, so
    that training data can be handled by AgentManager and RemoteAgentManager.

    Parameters
    ----------
    name : str
        Name of the writer.
    log_interval : int
        Minimum number of seconds between consecutive logs (with logging module).
    tensorboard_kwargs : Optional[dict]
        Parameters for tensorboard SummaryWriter. If provided, DefaultWriter
        will behave as tensorboard.SummaryWriter, and will keep utilities to handle
        data added with the add_scalar method.
    execution_metadata : metadata_utils.ExecutionMetadata
        Execution metadata about the object that is using the writer.
    maxlen : Optional[int], default: None
        If given, data stored by self._data (accessed through the property self.data) is limited
        to `maxlen` entries.
    """

    def __init__(
        self,
        name: str,
        log_interval: int = 3,
        tensorboard_kwargs: Optional[dict] = None,
        execution_metadata: Optional[metadata_utils.ExecutionMetadata] = None,
        maxlen: Optional[int] = None,
    ):
        self._name = name
        self._log_interval = log_interval
        self._execution_metadata = execution_metadata
        self._data = None
        self._time_last_log = None
        self._log_time = True
        self._maxlen = maxlen
        self.reset()

        # initialize tensorboard
        if (tensorboard_kwargs is not None) and (
            not check_packages.TENSORBOARD_INSTALLED
        ):
            logger.warning(
                "[DefaultWriter]: received tensorboard_kwargs, but tensorboard is not installed."
            )
        self._tensorboard_kwargs = tensorboard_kwargs
        self._tensorboard_logdir = None
        self._summary_writer = None
        if (tensorboard_kwargs is not None) and check_packages.TENSORBOARD_INSTALLED:
            self._summary_writer = SummaryWriter(**self._tensorboard_kwargs)
            self._tensorboard_logdir = self._summary_writer.get_logdir()

    def reset(self):
        """Clear data."""
        self._data = dict()
        self._initial_time = timer()
        self._time_last_log = timer()

    @property
    def summary_writer(self):
        return self._summary_writer

    @property
    def data(self):
        df = pd.DataFrame(columns=("name", "tag", "value", "global_step"))
        for tag in self._data:
            df = pd.concat([df, pd.DataFrame(self._data[tag])], ignore_index=True)
        return df

    def add_scalar(
        self,
        tag: str,
        scalar_value: float,
        global_step: Optional[int] = None,
        walltime=None,
        new_style=False,
    ):
        """
        Behaves as SummaryWriter.add_scalar().

        Note: the tag 'dw_time_elapsed' is reserved and updated internally.
        It automatically logs the number of seconds elapsed since the last reset().

        Parameters
        ----------
        tag : str
            Tag for the scalar.
        scalar_value : float
            Value of the scalar.
        global_step : int
            Step where scalar was added. If None, global steps will no longer be stored for the current tag.
        walltime : float
            Optional override default walltime (time.time()) with seconds after epoch of event
        new_style : bool
            Whether to use new style (tensor field) or old
            style (simple_value field). New style could lead to faster data loading.
        """
        if self._summary_writer:
            self._summary_writer.add_scalar(
                tag, scalar_value, global_step, walltime, new_style
            )
        self._add_scalar(tag, scalar_value, global_step)

    def _add_scalar(
        self, tag: str, scalar_value: float, global_step: Optional[int] = None
    ):
        """
        Store scalar value in self._data.
        """
        # Update data structures
        if tag not in self._data:
            self._data[tag] = dict()
            self._data[tag]["name"] = deque(maxlen=self._maxlen)
            self._data[tag]["tag"] = deque(maxlen=self._maxlen)
            self._data[tag]["value"] = deque(maxlen=self._maxlen)
            self._data[tag]["global_step"] = deque(maxlen=self._maxlen)

        self._data[tag]["name"].append(
            self._name
        )  # used in plots, when aggregating several writers
        self._data[tag]["tag"].append(
            tag
        )  # useful to convert all data to a single DataFrame
        self._data[tag]["value"].append(scalar_value)
        if global_step is None:
            self._data[tag]["global_step"].append(np.nan)
        else:
            self._data[tag]["global_step"].append(global_step)

        # Append time interval corresponding to global_step
        if global_step is not None and self._log_time:
            assert tag != "dw_time_elapsed", "The tag dw_time_elapsed is reserved."
            self._log_time = False
            self._add_scalar(
                tag="dw_time_elapsed",
                scalar_value=timer() - self._initial_time,
                global_step=global_step,
            )
            self._log_time = True

        # Log
        if not self._log_time:
            self._log()

    def _log(self):
        # time since last log
        t_now = timer()
        time_elapsed = t_now - self._time_last_log
        # log if enough time has passed since the last log
        max_global_step = 0
        if time_elapsed > self._log_interval:
            self._time_last_log = t_now
            message = ""
            for tag in self._data:
                val = self._data[tag]["value"][-1]
                gstep = self._data[tag]["global_step"][-1]
                message += f"{tag} = {val} | "
                if not np.isnan(gstep):
                    max_global_step = max(max_global_step, gstep)

            header = self._name
            if self._execution_metadata:
                header += f"[worker: {self._execution_metadata.obj_worker_id}]"
            message = f"[{header}] | max_global_step = {max_global_step} | " + message
            logger.info(message)

    def __getattr__(self, attr):
        """
        Calls SummaryWriter methods, if self._summary_writer is not None.
        Otherwise, does nothing.
        """
        if attr[:2] == "__":
            raise AttributeError(attr)
        if attr in self.__dict__:
            return getattr(self, attr)
        if self._summary_writer:
            return getattr(self._summary_writer, attr)

        def method(*args, **kwargs):
            pass

        return method

    #
    # For pickle
    #
    def __getstate__(self):
        if self._summary_writer:
            self._summary_writer.close()
        state = self.__dict__.copy()
        return state

    def __setstate__(self, newstate):
        # Re-create summary writer with the same logdir
        if newstate["_summary_writer"]:
            newstate["_tensorboard_kwargs"].update(
                dict(log_dir=newstate["_tensorboard_logdir"])
            )
            newstate["_summary_writer"] = SummaryWriter(
                **newstate["_tensorboard_kwargs"]
            )
        self.__dict__.update(newstate)
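A short usage sketch for the writer above; the tag name, log_dir and the loop are illustrative only:

writer = DefaultWriter(
    name='my_agent',
    log_interval=3,
    tensorboard_kwargs=dict(log_dir='runs/my_agent'),  # optional; enables SummaryWriter
)
for step in range(100):
    writer.add_scalar('episode_rewards', float(step), global_step=step)

print(writer.data.tail())                # DataFrame with name/tag/value/global_step columns
if writer.summary_writer is not None:
    print(writer.summary_writer.get_logdir())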
Example #19
0
def run_train(args):
    # show tensorboard graphs with following command: tensorboard --logdir=src/runs
    writer = SummaryWriter(comment=args.comment)

    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.Resize((160, 160)), np.float32,
        transforms.ToTensor(), fixed_image_standardization
    ])

    # transform = transforms.Compose([
    #     transforms.Resize((224, 224)),
    #     transforms.ToTensor(),
    #     transforms.Normalize((0.485, 0.456, 0.406),
    #                          (0.229, 0.224, 0.225))
    # ])

    train_dataset, val_dataset, test_dataset = read_training_dataset(
        args, transform)

    tqdm.write('train data size: {}, validation data size: {}'.format(
        len(train_dataset), len(val_dataset)))

    # Build data loader
    train_loader = get_loader(train_dataset,
                              args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers)
    val_loader = get_loader(val_dataset,
                            args.batch_size,
                            shuffle=False,
                            num_workers=args.num_workers)

    # Device configuration
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('training on', device)

    # Build the models
    model = FaceRecognitionCNN().to(device)
    if args.freeze_first_epoch:
        for m in model.resnet.parameters():
            m.requires_grad_(False)

    input_shape = next(iter(train_loader))[2].shape
    print('input shape', input_shape)
    # need to call this before summary!!!
    model.eval()
    # summary(model, input_shape[1:], batch_size=input_shape[0], device=device)
    print('model params (trainable, total):', count_parameters(model))

    # Loss and optimizer
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.learning_rate,
                                 weight_decay=args.regularization)

    # decrease learning rate if validation accuracy has not increased
    lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode='max',
        factor=1 / 4,
        patience=args.patience,
        verbose=True,
    )

    writer.add_hparams(args.__dict__, {})
    writer.add_text('model', str(model))

    # Train the models
    total_step = len(train_loader)
    step = 1
    best_val_acc = 0.5
    for epoch in range(args.num_epochs):
        for i, (video_ids, frame_ids, images, targets) in \
                tqdm(enumerate(train_loader), desc=f'training epoch {epoch}', total=len(train_loader)):
            model.train()
            # Set mini-batch dataset
            images = images.to(device)
            targets = targets.to(device)

            # Forward, backward and optimize
            outputs = model(images)
            loss = criterion(outputs, targets)
            model.zero_grad()
            loss.backward()
            optimizer.step()

            batch_accuracy = float(
                (outputs > 0.0).eq(targets).sum()) / len(targets)

            # Print log info
            step += 1

            if (i + 1) % args.log_step == 0:
                print_training_info(batch_accuracy, loss, step, writer)

            if (i + 1) % args.val_step == 0:
                val_acc, pr_acc, tmp_acc = print_validation_info(
                    args, criterion, device, model, val_loader, writer, epoch,
                    step)
                if val_acc > best_val_acc:
                    save_model_checkpoint(args, epoch, model,
                                          (val_acc, pr_acc, tmp_acc),
                                          writer.get_logdir())
                    best_val_acc = val_acc

        # validation step after full epoch
        val_acc, pr_acc, tmp_acc = print_validation_info(
            args, criterion, device, model, val_loader, writer, epoch, step)
        lr_scheduler.step(val_acc)
        if val_acc > best_val_acc:
            save_model_checkpoint(args, epoch, model,
                                  (val_acc, pr_acc, tmp_acc),
                                  writer.get_logdir())
            best_val_acc = val_acc

        if args.freeze_first_epoch and epoch == 0:
            for m in model.resnet.parameters():
                m.requires_grad_(True)
            tqdm.write('Fine tuning on')

    writer.close()
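save_model_checkpoint is not shown in this excerpt; a minimal sketch of what such a helper might write into the TensorBoard run directory, where the file name and payload are assumptions:

import os
import torch

def save_model_checkpoint(args, epoch, model, accuracies, log_dir):
    # Sketch only: persist weights plus a little metadata next to the event
    # files in the directory returned by writer.get_logdir().
    val_acc, pr_acc, tmp_acc = accuracies
    path = os.path.join(log_dir,
                        'model_epoch{:02d}_val{:.3f}.pth'.format(epoch, val_acc))
    torch.save({'epoch': epoch,
                'model_state': model.state_dict(),
                'val_acc': val_acc,
                'pr_acc': pr_acc,
                'tmp_acc': tmp_acc,
                'args': vars(args)}, path)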