Example #1
    def load_state(self, state):
        # Load the state on rank 0:
        if state is not None and hvd.rank() == 0:
            self.restore_state(state)

        # Broadcast the global step:
        self._global_step = hvd.broadcast_object(self._global_step,
                                                 root_rank=0)

        # Broadcast the state of the model:
        hvd.broadcast_parameters(self._net.state_dict(), root_rank=0)

        # Broadcast the optimizer state:
        hvd.broadcast_optimizer_state(self._opt, root_rank=0)

        # Horovod doesn't actually move the optimizer onto a GPU:
        if self.args.compute_mode == "GPU":
            for state in self._opt.state.values():
                for k, v in state.items():
                    if torch.is_tensor(v):
                        state[k] = v.cuda()

        # Broadcast the LR Schedule state:
        state_dict = hvd.broadcast_object(self.lr_scheduler.state_dict(),
                                          root_rank=0)
        self.lr_scheduler.load_state_dict(state_dict)
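Example #1 above (and Examples #2 and #3 below) follow the same restore-then-broadcast pattern: rank 0 reads the checkpoint, tensors travel through broadcast_parameters / broadcast_optimizer_state, and everything else (step counters, scheduler state) goes through broadcast_object. The following is a minimal, self-contained sketch of that pattern, assuming hvd.init() has already been called; load_and_broadcast, ckpt_path, and the checkpoint keys 'model' and 'optimizer' are illustrative names, not part of the example above.

import torch
import horovod.torch as hvd


def load_and_broadcast(model, optimizer, lr_scheduler, ckpt_path):
    # Rank 0 reads the checkpoint from disk; every other rank receives the
    # restored state through Horovod collectives.
    if hvd.rank() == 0:
        state = torch.load(ckpt_path, map_location="cpu")
        model.load_state_dict(state["model"])          # illustrative key
        optimizer.load_state_dict(state["optimizer"])  # illustrative key

    # Tensors (parameters and optimizer slots) use the dedicated helpers:
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # Arbitrary picklable objects, such as the LR scheduler state, go
    # through broadcast_object and are then loaded on every rank:
    sched_state = hvd.broadcast_object(lr_scheduler.state_dict(), root_rank=0)
    lr_scheduler.load_state_dict(sched_state)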
Example #2
    def load_checkpoint(self, net, opt, lr_sched, ctrls, load):
        if self.is_master:

            self.i_epoch = 0

            # look for a checkpoint
            ckpt_list = os.listdir(self.dir_saves)
            if len(ckpt_list) != 0:
                if load == 'best':  # used when evaluating performances
                    ckpt_file = os.path.join(self.dir_saves, 'best.ckpt')
                elif load == 'last':  # used when restoring crashed experiments/resuming interrupted experiments
                    ckpt_file = max([os.path.join(self.dir_saves, f) for f in ckpt_list], key=os.path.getctime)
                else:
                    ckpt_file = None
                # else:  # used in 'WHAT IF' experiments as an initial condition
                #     ckpt_id = str(load).rjust(__ALIGN_EPOCHS__, '0')
                #     ckpt_name = 'epoch' + ckpt_id + '.ckpt'
                #     ckpt_file = os.path.join(self.dir_saves, ckpt_name)

                # load checkpoint from file
                if self.verbose:
                    print('Loading checkpoint {} ...'.format(ckpt_file), end='')
                ckpt = torch.load(ckpt_file)
                if self.verbose:
                    print('done!')

                # restore experiment status
                self.i_epoch = ckpt['fold']['i_epoch']
                self.metrics.update(ckpt['fold']['metrics'])
                net.load_state_dict(ckpt['network'])
                opt.load_state_dict(ckpt['training']['optimizer'])
                lr_sched.load_state_dict(ckpt['training']['lr_scheduler'])
                for c, sd in zip(ctrls, ckpt['training']['quantize']):
                    c.load_state_dict(sd)

        # broadcast experiment status to worker processes
        self.i_epoch = hvd.broadcast_object(self.i_epoch, root_rank=__MASTER_PROC_RANK__, name='i_epoch')

        self.metrics = hvd.broadcast_object(self.metrics, root_rank=__MASTER_PROC_RANK__, name='metrics')

        hvd.broadcast_parameters(net.state_dict(), root_rank=__MASTER_PROC_RANK__)
        hvd.broadcast_optimizer_state(opt, root_rank=__MASTER_PROC_RANK__)
        lr_sched_state_dict = hvd.broadcast_object(lr_sched.state_dict(), root_rank=__MASTER_PROC_RANK__, name='lr_sched_state_dict')
        if not self.is_master:
            lr_sched.load_state_dict(lr_sched_state_dict)
        for i, c in enumerate(ctrls):
            csd = hvd.broadcast_object(c.state_dict(), root_rank=__MASTER_PROC_RANK__, name='controller{}'.format(i))
            if not self.is_master:
                c.load_state_dict(csd)
Example #3
    def restore_model(self):

        if self._rank == 0:
            state = self.load_state_from_file()
        else:
            state = None

        if state is not None and self._rank == 0:
            self.restore_state(state)

        if self.args.framework.distributed_mode == "horovod":

            # Broadcast the global step:
            self._global_step = hvd.broadcast_object(self._global_step, root_rank=0)

            # Broadcast the state of the model:
            hvd.broadcast_parameters(self._net.state_dict(), root_rank=0)

            # Broadcast the optimizer state:
            hvd.broadcast_optimizer_state(self._opt, root_rank=0)

            # Horovod doesn't actually move the optimizer onto a GPU:
            if self.args.run.compute_mode == "GPU":
                for state in self._opt.state.values():
                    for k, v in state.items():
                        if torch.is_tensor(v):
                            state[k] = v.cuda()

            # Broadcast the LR Schedule state:
            state_dict = hvd.broadcast_object(self.lr_scheduler.state_dict(), root_rank=0)

        elif self.args.framework.distributed_mode == "DDP":

            if self.args.run.compute_mode == "GPU":
                self._net.cuda()

            self._net = torch.nn.parallel.DistributedDataParallel(self._net)

            self._global_step = MPI.COMM_WORLD.bcast(self._global_step, root=0)
            state_dict = MPI.COMM_WORLD.bcast(self.lr_scheduler.state_dict(), root=0)

        # Load the state dict:
        self.lr_scheduler.load_state_dict(state_dict)
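Example #3 falls back to plain mpi4py when running under DDP instead of Horovod. The following is a minimal sketch of that fallback under an MPI launch; the helper name bcast_scheduler_state is illustrative.

from mpi4py import MPI


def bcast_scheduler_state(lr_scheduler):
    # comm.bcast pickles the object on rank 0 and unpickles it everywhere
    # else, playing the same role as hvd.broadcast_object in the Horovod path.
    state = MPI.COMM_WORLD.bcast(lr_scheduler.state_dict(), root=0)
    lr_scheduler.load_state_dict(state)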
Example #4
def run_api_experiment(input_features, output_features, dataset, **kwargs):
    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "fc_size": 14},
        "training": {"epochs": 2},
    }

    model = LudwigModel(config)
    output_dir = None

    try:
        # Training with csv
        _, _, output_dir = model.train(dataset=dataset, **kwargs)

        model.predict(dataset=dataset)

        # Attempt loading saved model, should broadcast successfully
        model_dir = os.path.join(output_dir, "model") if output_dir else None
        loaded_model = LudwigModel.load(model_dir)

        # Model loading should broadcast weights from coordinator
        loaded_state = loaded_model.model.state_dict()
        bcast_state = hvd.broadcast_object(loaded_state)
        for loaded, bcast in zip(loaded_state.values(), bcast_state.values()):
            assert np.allclose(loaded, bcast)
    finally:
        if output_dir:
            shutil.rmtree(output_dir, ignore_errors=True)
Example #5
    def restore_experiment_from_checkpoint(self, checkpoint, loadModelOnly=False):
        """
        Restore experiment stats, optimizer, and model from a checkpoint.
        The experiment is restored to the CPU.
        When restoring a model, look at the status flag 'flagFusedQuantized':
        if the model has been quantized but the experiment config does not
        require quantization, then error out.
        :param checkpoint: Checkpoint path
        :param loadModelOnly: If True, restore only the model and reset the training progress counters
        :return: None
        """

        map_device = torch.device('cpu')
        #Broadcast the experiment status to all workers
        if self.multiprocessing is True:
            if hvd.rank() == 0:
                state_dict = torch.load(checkpoint, map_location=map_device)
            else:
                state_dict = None
            state_dict = hvd.broadcast_object(state_dict, root_rank=0, name='state_dict')
        else:
            state_dict = torch.load(checkpoint, map_location=map_device)

        experimentStatus = state_dict['experimentStatus']
        #self.experimentStatus = experimentStatus
        for selfKey in self.experimentStatus.keys():
            if selfKey in experimentStatus.keys():
                self.experimentStatus[selfKey] = experimentStatus[selfKey]
        # self.experimentStatus.numEpochTrained = 0
        if loadModelOnly is False:
            config = state_dict['experimentConfig']
            # self.config = config
            for selfKey in self.config.keys():
                if selfKey in config.keys():
                    self.config[selfKey] = config[selfKey]
            # Save the optimizer state
            self.optimizerStateDict = state_dict['optimizer']
        else:
            self.experimentStatus.numEpochTrained = 0
            self.experimentStatus.numPhaseTrained = 0

        # Load the model
        # If pruning and quantization are both required,
        # then prune before quantize.
        # Otherwise the model might be pruned twice
        if experimentStatus.flagPruned is True:
            # If the network has been sparsified, then it no longer has
            # 'weight' as a registered buffer.
            # Instead, it has weight_orig and weight_mask
            # We need to allocate a sparsified network before loading the model's state dict
            # The target sparsity used to allocate the "sparsified network" can be random
            self.prune_network(sparsityTarget=0.0)

        # Check for whether it makes sense to load a quantized experiment
        # If so, quantize the model before proceed with loading
        if experimentStatus.flagFusedQuantized is True:
            assert self.config.quantize is True, \
                'Loaded experiment contains quantized model, but the experiment config does not require quantization'
            self.quantize_model()

        self.restore_model_from_state_dict(state_dict['model'])
Example #6
    def close_fold(self):
        if self.is_master:
            self.writer.close()
            if self.verbose:
                print('Fold [{}/{}] completed'.format(self.i_fold + 1, self.config['experiment']['n_folds']))
            self.i_fold += 1

        # communicate current fold to worker processes
        self.i_fold = hvd.broadcast_object(self.i_fold, root_rank=__MASTER_PROC_RANK__, name='i_fold_new')
Example #7
    def get_training_status(self):
        if self.is_master:
            # which fold should be resumed (i.e., the last)?
            folds_list = os.listdir(self.dir_exp)
            try:
                i_fold = max([int(f.replace('fold', '')) for f in folds_list])
            except ValueError:
                i_fold = 0
            self.i_fold = i_fold

        # communicate current fold to worker processes
        self.i_fold = hvd.broadcast_object(self.i_fold, root_rank=__MASTER_PROC_RANK__, name='i_fold')
Example #8
    def get_speed(self, index):
        if index in self.map_sec:
            return self.map_sec[index]
        self.grace.memory.clean()
        self.grace.compressor.clean()
        self.grace.memory.partition([index])

        # obtain the mean speed of multiple iterations
        comm_time_per_iter = self.get_avg_comm(self.benchmark_step)
        # negotiate the training speed with other workers
        comm_time_per_iter = hvd.broadcast_object(comm_time_per_iter,
                                                  root_rank=0)

        self.map_sec[index] = comm_time_per_iter
        print("benchmark time: {:.3f} ms\tindex: {}".format(
            comm_time_per_iter * 1000, index))
        return comm_time_per_iter
Example #9
def synchronize():
    global _USE_HVD
    if _USE_HVD:
        hvd.broadcast_object(0)
        return
    return comm.synchronize()
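synchronize() above uses broadcast_object purely as a synchronization point: broadcasting a throwaway value forces the worker ranks to wait for rank 0. Below is a minimal sketch of that idiom (the barrier() name is illustrative); Example #12 further down relies on the same trick so that no rank builds the model before the download on rank 0 has finished.

import horovod.torch as hvd


def barrier():
    # broadcast_object is a collective call, so no non-root rank can return
    # from it before rank 0 has reached this line; broadcasting a dummy
    # value therefore acts as a lightweight barrier.
    hvd.broadcast_object(0, root_rank=0)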
Example #10
    # trainer args
    parser.add_argument('--skip_output',type=int,default=10,help='epoch skip for grad/vel output')
    parser.add_argument('--max_epochs',type=int,default=501,help='total number of epochs')
    args = parser.parse_args()

    if args.device=='cuda' and torch.cuda.is_available():
        torch.cuda.set_device(hvd.local_rank())

    # logging
    logdir=''
    if hvd.rank()==0:
        logger= MainLogger(name=args.name)
        logdir = logger.get_logdir()
        logger.print("Inversion start: %s (log_dir=%s)"%(args.name, logdir))
        logger.print("hyper parameters: %s"%args)
    logdir = hvd.broadcast_object(logdir, 0)
    joblogger=JobLogger(hvd.rank(),logdir)

    model = TimeInv(args)
    model.to(args.device)

    nshot,sxy = rsf.fromfile(args.fshot,"n2 data")
    dataloader= time_distributed_dataloader(args.ftrue, torch.from_numpy(sxy))
    optimizer = model.configure_optimizers()

    model.train()
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    for epoch in range(args.max_epochs+1):
Example #11
    def broadcast(self, obj, src=0):
        self.barrier()
        obj = hvd.broadcast_object(obj, src)
        return obj
Example #12
    config.hvd = hvd

    hvd.init()
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    torch.backends.cudnn.deterministic = True

    torch.cuda.set_device(hvd.local_rank())

    config.rank = hvd.rank()
    config.world = hvd.size()

    if hvd.local_rank() == 0:
        utils.download_model(config)
    # broadcast_object doubles as a barrier here: the other ranks block until
    # the download on rank 0 has completed before building the model.
    hvd.broadcast_object(0, root_rank=0)
    model = x.Model(config)

    start_time = time.time()
    print('Loading dataset')
    train_data, dev_data, test_data = utils.build_dataset(config)

    train_iter = utils.build_dataloader(train_data, config)
    dev_iter = utils.build_dataloader(dev_data, config)
    test_iter = utils.build_dataloader(test_data, config)

    time_dif = utils.get_time_dif(start_time)
    print("Prepare data time: ", time_dif)

    # Train, eval, test
    model = model.to(config.device)
Example #13
def distribute_optimizer_state(optimizer: torch.optim.Optimizer):
    """Distributes the optimizer state if horovod is available"""
    if HAVE_HOROVOD:
        state_dict = hvd.broadcast_object(optimizer.state_dict(), root_rank=0)
        if hvd.rank() > 0:
            optimizer.load_state_dict(state_dict)
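A possible call site for distribute_optimizer_state above, assuming hvd.init() has already run; the toy model, the checkpoint path, and the 'optimizer' key are placeholders for illustration.

import torch
import horovod.torch as hvd

model = torch.nn.Linear(8, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# Only rank 0 reads the checkpoint; the helper above mirrors its optimizer
# state onto every other rank.
if hvd.rank() == 0:
    checkpoint = torch.load("checkpoint.pt", map_location="cpu")  # placeholder path
    optimizer.load_state_dict(checkpoint["optimizer"])            # placeholder key
distribute_optimizer_state(optimizer)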
Example #14
def startup(gpu, args, config):
    hvd.init()
    torch.cuda.set_device(hvd.local_rank())
    torch.manual_seed(7)
    args.rank = hvd.rank()
    # args.rank = args.nr * args.gpus + gpu
    # print('rank', args.rank,  'gpu', gpu, 'worldsize', args.world_size)
    # distributed.init_process_group(
    #     backend='nccl', init_method='env://', world_size=args.world_size, rank=args.rank)
    torch.set_num_threads(1)

    experiment = setup_comet_ml(args, args.rank)

    # model
    model = Transducer(config)

    if config.model.random_init:
        for param in model.parameters():
            torch.nn.init.uniform_(param, -0.1, 0.1)

    model.preload(config.model.preload_from)
    model.preload_lm(config.model.dec.pretrain_file)

    model.cuda()

    # torch.cuda.set_device(gpu)

    # model = parallel.DistributedDataParallel(model, device_ids=[gpu])

    # data
    d_params = Data.parameters
    d_params['freq_mask'] = config.data.freq_mask
    d_params['time_mask'] = config.data.time_mask

    train_dataset = Data(
        mean=config.data.mean, std=config.data.std,
        json_path=config.data.train_json,
        order_time_feature=True,
        tokenizer=config.model.tokenizer,
        bpe_size=config.model.bpe_size,
        cache_dir=config.model.bpe_cache_dir,
        adaptive_specaug=config.data.adaptive_specaug,
        time_repeats=config.data.time_repeats,
        **d_params
    )

    test_dataset = Data(
        mean=config.data.mean, std=config.data.std,
        json_path=config.data.valid_json,
        order_time_feature=True,
        tokenizer=config.model.tokenizer,
        bpe_size=config.model.bpe_size,
        cache_dir=config.model.bpe_cache_dir,
        adaptive_specaug=config.data.adaptive_specaug,
        time_repeats=config.data.time_repeats,
        **d_params, valid=True
    )

    train_sampler = data.distributed.DistributedSampler(train_dataset,
                                                        num_replicas=hvd.size(),
                                                        rank=hvd.rank())
    test_sampler = data.distributed.DistributedSampler(test_dataset,
                                                       num_replicas=hvd.size(),
                                                       rank=hvd.rank())

    train_loader = data.DataLoader(dataset=train_dataset,
                                   batch_size=config.dist_train.batch_size,
                                   shuffle=train_sampler is None,
                                   num_workers=args.data_workers,
                                   pin_memory=True,
                                   collate_fn=collate_fn_padd if config.model.type == "BasicRNNT" else collate_fn_padd_order,
                                   drop_last=True,
                                   sampler=train_sampler)

    test_loader = data.DataLoader(dataset=test_dataset,
                                  batch_size=config.dist_train.batch_size,
                                  shuffle=False,
                                  num_workers=args.data_workers,
                                  collate_fn=collate_fn_padd if config.model.type == "BasicRNNT" else collate_fn_padd_order,
                                  drop_last=True,
                                  pin_memory=True,
                                  sampler=test_sampler)

    # freeze encoder if specified
    if config.model.enc.freeze:
        for param in model.encoder.parameters():
            param.requires_grad = False


    # define optimizer and loss
    optimizer = optim.AdamW(
        filter(lambda p: p.requires_grad, model.parameters()),
        lr=config.dist_train.learning_rate,
        weight_decay=config.dist_train.weight_decay
    )

    scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=config.dist_train.learning_rate,
                                              steps_per_epoch=int(len(train_loader) / config.dist_train.grad_acc_steps),
                                              epochs=config.dist_train.epochs,
                                              div_factor=config.dist_train.div_factor,
                                              pct_start=config.dist_train.pct_start,
                                              anneal_strategy='linear')

    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters(),
        backward_passes_per_step=config.dist_train.grad_acc_steps,
        op=hvd.Adasum if args.use_adasum else hvd.Average)

    if args.load_model_from and args.rank == 0:
        print("LOADING MODEL FROM", args.load_model_from)
        checkpoint = torch.load(args.load_model_from)
        model.load_state_dict(checkpoint['model_state_dict'])
        # optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # logging in terminal and comet
    h_params = {}

    h_params.update({
        "batch_size": config.dist_train.batch_size,
        "grad_acc": config.dist_train.grad_acc_steps,
        "virtual_batch_size": config.dist_train.batch_size * config.dist_train.grad_acc_steps,
        "learning_rate": config.dist_train.learning_rate,
        "optimizer": optimizer.__class__.__name__,
        "scheduler": scheduler.__class__.__name__,
    })
    num_params = sum([param.nelement() for param in model.parameters()])
    if args.rank == 0:
        print(model)
        print(h_params)
        print(d_params)
        print('number of model parameters: ', num_params)
        print("\n train dataset summary \n", train_dataset.describe())
        print("\n test dataset summary \n", test_dataset.describe())
        print("\n data transforms \n", train_dataset.audio_transforms, test_dataset.audio_transforms)
    if args.rank == 0:
        experiment.log_parameters(h_params)
        experiment.log_parameters(d_params)
        experiment.set_name(config.comet_info.exp_name)  # experiment name
        experiment.log_others(vars(args))
        experiment.log_other('train_summary', str(train_dataset.describe()))
        experiment.log_other('test_summary', str(test_dataset.describe()))
        experiment.log_other('train_data_transforms', str(train_dataset.audio_transforms))
        experiment.log_other('valid_data_transforms', str(test_dataset.audio_transforms))
        experiment.log_other('n_model_params', num_params)

    # save args to file
    if args.rank == 0:
        ckpt_dir = os.path.join(config.dist_train.checkpoint_dir, config.comet_info.exp_name)
        if not os.path.isdir(ckpt_dir):
            os.makedirs(ckpt_dir)
        params_file = os.path.join(ckpt_dir, "args.txt")
        pretty_json = json.dumps(vars(args), sort_keys=True, indent=4)
        with open(params_file, 'w+') as f:
            f.write(pretty_json)
        print(pretty_json)

    # # resume from checkpoint
    # distributed.barrier()  # block processes until enter loading
    args.start_epoch = 1
    args.start_step = 1
    args.total_iter = 0
    if args.resume_from:
        if args.rank == 0:
            print("LOADING FROM CHECKPOINT...", args.resume_from)
            checkpoint = torch.load(args.resume_from, map_location=lambda storage, loc: storage)
            model.load_state_dict(checkpoint['model_state_dict'])
            model.cuda()
            # optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
            args.start_epoch = checkpoint['epoch']
            if args.resume_step > 0:
                args.start_step = args.resume_step
            else:
                args.start_step = checkpoint['step']
            args.total_iter = checkpoint['total_iter']
        # distributed.barrier()  # block process until finish loading
        print('broadcasting model state')
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        print('broadcasting optimizer state..')
        # new_optimizer_state = hvd.broadcast_object(optimizer.state_dict(), root_rank=0)
        # optimizer.load_state_dict(new_optimizer_state)
        print('broadcasting scheduler state..')
        new_scheduler_state = hvd.broadcast_object(scheduler.state_dict(), root_rank=0)
        scheduler.load_state_dict(new_scheduler_state)
        print('broadcasting other args')
        args.start_epoch = hvd.broadcast_object(args.start_epoch, root_rank=0)
        args.start_step = hvd.broadcast_object(args.start_step, root_rank=0)
        args.total_iter = hvd.broadcast_object(args.total_iter, root_rank=0)

    train(args, args.rank, experiment, model, optimizer, scheduler, train_loader, test_loader, train_sampler, config)
Example #15
    def __init__(self, configFile, multiprocessing=False):
        """
        Initialize the basic configuration of an experiment object.
        Children initialization implementations should also
        1) Instantiate and initialize model
        :param configFile:
        :param multiprocessing:
        """
        # TODO: Children should provide their own init method
        # 1) Load configuration file
        # 2) Instantiate data loader and samplers
        # 3) Instantiate train and validation meters

        # Experiment states initialization
        status = generate_experiment_status()
        self.experimentStatus = status
        self.multiprocessing = multiprocessing

        qatRoundedConfig = torch.quantization.FakeQuantize.with_args(
            observer=custom_quant.RoundedMovingAverageMinMaxObserver,
            quant_min=-128,
            quant_max=127,
            averaging_constant=0.01
        )

        self.qatConfig = torch.quantization.QConfig(
            activation=qatRoundedConfig
            , weight=qatRoundedConfig
        )

        # Placeholder reference to optimizer state
        # To be populated if restoring the experiment from checkpoint
        self.optimizerStateDict = None

        # TODO: Initialize these in the concrete __init__() method of each derived class
        self.model = None
        self.trainDataSet = None
        self.trainDataLoader = None
        self.trainDataSampler = None
        self.valDataSet = None
        self.valDataLoader = None
        self.valDataSampler = None
        self.logWriter = None
        self.trainMeter = None
        self.valMeter = None
        self.trainTimeMeter = None

        # Load experiment setting from config file
        config = generate_base_config()
        if (multiprocessing is False) or (multiprocessing is True and hvd.rank() == 0):
            try:
                file = open(configFile, "r")
            except IOError:
                raise ValueError("The provided configuration file cannot be opened.")
            with file:
                yamlConfig = yaml.load(file, Loader=yaml.FullLoader)
                config = edict(yamlConfig)

        # Broadcast the configuration to the workers during multiprocessing
        if multiprocessing is True:
            config = hvd.broadcast_object(obj=config, root_rank=0, name='config')

        self.config = config
        torch.manual_seed(self.config.seed)
        # Set intra-op parallelism threads
        torch.set_num_threads(self.config.numThreadsPerWorker)
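Examples #15 and #17 broadcast the parsed configuration instead of letting every worker read the file itself. A minimal stand-alone sketch of that pattern follows; load_config and the use of yaml.safe_load are illustrative choices, not taken from the examples.

import yaml
import horovod.torch as hvd


def load_config(path):
    # Only rank 0 touches the filesystem; the resulting dict is shipped to
    # every worker via broadcast_object.
    config = None
    if hvd.rank() == 0:
        with open(path, "r") as fp:
            config = yaml.safe_load(fp)
    return hvd.broadcast_object(config, root_rank=0, name="config")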
Example #16
    def broadcast(self, obj, src=0):
        obj = hvd.broadcast_object(obj, src)
        return obj
Example #17
    def _setup_experiment(self, exp_id):
        """Get pointers to the data and experiment folders.

        Args:
            exp_id (str): The decimal literal identifying the experiment.

        """
        if self.is_master:
            QUANT_HOME = sys.path[0]
            # get pointers to HARD SHARED resources
            HARD_STORAGE = os.path.join(QUANT_HOME, 'cfg', 'hard_storage.json')
            with open(HARD_STORAGE, 'r') as fp:
                d = json.load(fp)
                # data
                HARD_HOME_DATA = os.path.join(d['data'], 'Quant')
                HARD_DIR_DATA = os.path.join(HARD_HOME_DATA, 'problems', self.problem, 'data')
                if not os.path.isdir(HARD_DIR_DATA):
                    raise FileNotFoundError('{} hard directory (data) not found: {}'.format(self.problem, HARD_DIR_DATA))
                # log
                HARD_HOME_LOGS = os.path.join(d['logs'], 'Quant')
                HARD_DIR_LOGS = os.path.join(HARD_HOME_LOGS, 'problems', self.problem, 'logs')
                if not os.path.isdir(HARD_DIR_LOGS):
                    raise FileNotFoundError('{} hard directory (logs) not found: {}'.format(self.problem, HARD_DIR_LOGS))
            # get pointers to SOFT SHARED resources (which are redirected to HARD ones using symlinks)
            DIR_PROBLEM = os.path.join(QUANT_HOME, 'problems', self.problem)
            dir_data = os.path.join(DIR_PROBLEM, 'data')
            if not os.path.isdir(dir_data):
                os.symlink(HARD_DIR_DATA, dir_data)
            dir_logs = os.path.join(DIR_PROBLEM, 'logs')
            if not os.path.isdir(dir_logs):
                os.symlink(HARD_DIR_LOGS, dir_logs)
            # get pointers to PRIVATE experiment resources
            if exp_id:
                # retrieve an existing report
                exp_id = int(exp_id)
            else:
                # create a new report
                exp_folders = [f for f in os.listdir(dir_logs) if f.startswith('exp')]
                if len(exp_folders) == 0:
                    exp_id = 0
                else:
                    exp_id = max(int(f.replace('exp', '')) for f in exp_folders) + 1
            dir_exp = os.path.join(dir_logs, 'exp'+str(exp_id).rjust(__ALIGN_EXP__, '0'))
            if not os.path.isdir(dir_exp):
                os.mkdir(dir_exp)

            self.dir_data = dir_data
            self.dir_exp  = dir_exp

            if self.verbose:
                # print setup message
                message  = 'EXPERIMENT LOGBOOK\n'
                message += 'Problem:               {}\n'.format(self.problem)
                message += 'Network topology:      {}\n'.format(self.topology)
                message += 'Data directory:        {}\n'.format(self.dir_data)
                message += 'Experiment directory:  {}\n'.format(self.dir_exp)
    
                def print_message(message):
                    """Print a nice delimiter around a multiline message."""
                    lines = message.splitlines()
                    tab_size = 4
                    width = max(len(l) for l in lines) + tab_size
                    print('+' + '-' * width + '+')
                    for l in lines:
                        print(l)
                    print('+' + '-' * width + '+')
    
                print_message(message)

            # load configuration
            private_config_file = os.path.join(self.dir_exp, 'config.json')

            if not os.path.isfile(private_config_file):
                # no configuration in the experiment folder: look for global one
                shared_config_file = os.path.join(os.path.dirname(self.lib.__file__), 'config.json')
                if not os.path.isfile(shared_config_file):
                    raise FileNotFoundError('Configuration file not found: {}'.format(shared_config_file))
                shutil.copyfile(shared_config_file, private_config_file)
                # generate seed for experiment
                with open(private_config_file, 'r+') as fp:
                    config = json.load(fp)
                    config['experiment']['seed'] = torch.randint(__MAX_SEED__, (1,)).item()
                    fp.seek(0)
                    json.dump(config, fp, indent=4)
                    fp.truncate()

            with open(private_config_file, 'r') as fp:
                self.config = json.load(fp)

        # communicate data pointer and experiment configuration to worker processes
        self.dir_data = hvd.broadcast_object(self.dir_data, root_rank=__MASTER_PROC_RANK__, name='dir_data')
        self.config = hvd.broadcast_object(self.config, root_rank=__MASTER_PROC_RANK__, name='config')
Example #18
    def broadcast(self, obj: object, src: int = 0) -> object:
        obj = hvd.broadcast_object(obj, src)
        return obj
Example #19
    def train(self):
        dset = ConcatDataset(
            [eval(cls)(**params) for cls, params in self.dataset])
        # eval(cls) resolves the dataset class by name, e.g. DAVISDataset, and
        # (**params) forwards its constructor kwargs, e.g. DAVISDataset(**params).
        # The resulting datasets are then concatenated.

        # Partition dataset among workers using DistributedSampler
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            dset, num_replicas=hvd.size(), rank=hvd.rank())

        loader = DataLoader(dset,
                            batch_size=self.batch_size,
                            sampler=train_sampler,
                            num_workers=self.num_workers,
                            pin_memory=True,
                            shuffle=False)

        # Add Horovod Distributed Optimizer
        backward_passes_per_step = dset.datasets[0].sample_size - 1  # e.g. 3 frames -> 2 backward() calls
        self.optimizer = hvd.DistributedOptimizer(
            self.optimizer,
            named_parameters=self.model.named_parameters(),
            backward_passes_per_step=backward_passes_per_step)
        # Broadcast parameters from rank 0 to all other processes.
        hvd.broadcast_parameters(self.model.state_dict(), root_rank=0)

        for epoch in range(self.epoch + 1, self.max_epochs + 1):

            self.epoch = epoch
            self.stats = ddict(AverageMeter)
            t0 = None
            runtime = AverageMeter()

            for i, batch in enumerate(loader, 1):
                t0 = time() if t0 is None else t0  # Ignore loader startup pause

                self.optimizer.zero_grad()
                stats = self.model(*batch)
                self.optimizer.step()

                runtime.update(time() - t0)
                t0 = time()

                stats['stats/lr'] = self.scheduler.get_last_lr()[0]
                self.update_stats(stats,
                                  i,
                                  len(loader),
                                  runtime,
                                  do_print=True)

            if hvd.rank() == 0:
                self.log_stats()  # tensorboard
                self.scheduler.step()
            lr_dict = hvd.broadcast_object(self.scheduler.state_dict(), 0)
            if hvd.rank() > 0:
                self.scheduler.load_state_dict(lr_dict)

            if self.epoch % self.save_interval == 0 and hvd.rank() == 0:
                self.save_checkpoint()

        print("%s done" % self.name)