Example #1
            module = DQMLP(self.obs_dim, self.n_actions, 64)

        module.apply(weight_init)
        return module


def flatten(d, parent_key='', sep='/'):
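    """Flatten a nested DictConfig into a single-level dict, joining nested keys with `sep`."""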
    items = []
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k
        if isinstance(v, DictConfig):
            items.extend(flatten(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)


@hydra.main()
def my_app(cfg: DictConfig) -> None:
    f = flatten(cfg)
    print(f)
    exp = Experiment(f, create_env, create_agent)
    exp.go()


if __name__ == "__main__":
    import torch.multiprocessing as mp
    mp.set_start_method("spawn")

    my_app()
Example #2
                        type=str,
                        default=None,
                        metavar='LOAD_FP',
                        help='path to load model (default: None)')
    parser.add_argument('--verbose',
                        type=int,
                        default=1,
                        metavar='VERBOSITY',
                        help='verbosity, 2: All, 1: Some, 0: None')
    parser.add_argument('--train', action='store_true', help="perform train")
    parser.add_argument('--test', action='store_true', help="perform test")

    args = parser.parse_args()

    torch.manual_seed(args.seed)
    mp.set_start_method('spawn')

    # Repeat the run 20 times; after each run, archive the "output" log as run-{i}.txt and truncate it.
    for i in range(20):
        run(args)
        with open(f"run-{i}.txt", "w") as fw:
            with open("output", "r") as fr:
                fw.write(fr.read())
        with open("output", "w") as fw:
            fw.write("")

        bashCommand = f"cp {args.save_fp} {i}.tar"
        process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE)
        output, error = process.communicate()
        bashCommand = f"cp {os.path.splitext(args.save_fp)[0]}-ckt.tar {i}-ckt.tar"
        process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE)
        output, error = process.communicate()
Example #3
        spec = spec_util.get_eval_spec(spec_file, prename)

    if 'spec_params' not in spec:
        run_spec(spec, lab_mode)
    else:  # spec is parametrized; run them in parallel using ray
        param_specs = spec_util.get_param_specs(spec)
        search.run_param_specs(param_specs)


def main():
    '''Main method to run jobs from scheduler or from a spec directly'''
    args = sys.argv[1:]
    if len(args) <= 1:  # use scheduler
        job_file = args[0] if len(args) == 1 else 'job/experiments.json'
        for spec_file, spec_and_mode in util.read(job_file).items():
            for spec_name, lab_mode in spec_and_mode.items():
                read_spec_and_run(spec_file, spec_name, lab_mode)
    else:  # run single spec
        assert len(args) == 3, 'To use sys args, specify spec_file, spec_name, lab_mode'
        read_spec_and_run(*args)


if __name__ == '__main__':
    try:
        mp.set_start_method('spawn')  # for distributed pytorch to work
    except RuntimeError:
        pass
    main()
Example #4
                    # if self.name == 'w0':
                    #     self.env.render()
                    push_and_pull(self.opt, self.lnet, self.gnet, done, s_,
                                  buffer_s, buffer_a, buffer_r, GAMMA)
                    buffer_s, buffer_a, buffer_r = [], [], []

                    if done:  # done and print information
                        record(self.g_ep, self.ep_r, self.res_queue, self.name,
                               self.lives_sum, DIE_PENALTY)
                        break
                s = s_
        self.res_queue.put(None)


if __name__ == "__main__":
    mp.set_start_method('spawn', force=True)
    gnet = Net(N_A)  # global network
    model_dir = DIR + "model/*"  # load recent trained global network
    model_files = sorted(glob.iglob(model_dir),
                         key=os.path.getctime,
                         reverse=True)
    if len(model_files) != 0:
        gnet.load_state_dict(torch.load(model_files[0]))
        print("load global network from ", model_files[0])
    gnet.share_memory()  # share the global parameters in multiprocessing
    opt = SharedAdam(gnet.parameters(), lr=0.0005)  # global optimizer
    global_ep, global_ep_r, res_queue = mp.Value('i', 0), mp.Value('d', 0.), mp.Queue()

    start_time = str(datetime.datetime.now())  # to track the time
Example #5
def main():
    print('Starting.')

    setproctitle.setproctitle('A3C Manager')
    args = flag_parser.parse_arguments()

    create_shared_model = model.Model
    init_agent = agent.A3CAgent
    optimizer_type = optimizer_class(args.optimizer)

    start_time = time.time()
    local_start_time_str = \
        time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime(start_time))

    # Seed sources of randomness.
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    random.seed(args.seed)

    if args.enable_logging:
        from tensorboardX import SummaryWriter
        log_dir = 'runs/' + args.title + '-' + local_start_time_str
        log_writer = SummaryWriter(log_dir=log_dir)

    if args.gpu_ids == -1:
        args.gpu_ids = [-1]
    else:
        torch.cuda.manual_seed(args.seed)
        mp.set_start_method('spawn', force=True)

    print('=> Creating the shared model and optimizer.')
    shared_model = create_shared_model(args)

    shared_model.share_memory()
    optimizer = optimizer_type(
        filter(lambda p: p.requires_grad, shared_model.parameters()), args)
    optimizer.share_memory()

    if (args.resume):
        shared_model.load_state_dict(torch.load('./models/last_model'))
    elif (args.load_model != ''):
        shared_model.load_state_dict(torch.load(args.load_model))

    print('=> Creating the agents.')
    processes = []

    end_flag = mp.Value(ctypes.c_bool, False)

    train_res_queue = mp.Queue()
    for rank in range(0, args.workers):
        p = mp.Process(target=train.train,
                       args=(rank, args, create_shared_model, shared_model,
                             init_agent, optimizer, train_res_queue, end_flag))
        p.start()
        processes.append(p)
        print('* Agent created.')
        time.sleep(0.1)

    train_total_ep = 0
    n_frames = 0

    train_thin = args.train_thin
    train_scalars = ScalarMeanTracker()

    success_tracker = []

    try:
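        # Drain per-episode results from the worker queue, checkpoint the shared model every
        # 100 episodes, periodically log running means, and stop early once the success rate
        # over the last 100 episodes exceeds args.train_threshold.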
        while train_total_ep < args.num_train_episodes:
            train_result = train_res_queue.get()
            train_scalars.add_scalars(train_result)
            train_total_ep += 1
            n_frames += train_result["ep_length"]
            if train_total_ep % 100 == 0:
                torch.save(shared_model.state_dict(),
                           './models/model_{}'.format(train_total_ep))
            if args.enable_logging and train_total_ep % train_thin == 0:
                log_writer.add_scalar("n_frames", n_frames, train_total_ep)
                tracked_means = train_scalars.pop_and_reset()
                for k in tracked_means:
                    log_writer.add_scalar(k + "/train", tracked_means[k],
                                          train_total_ep)
            success_tracker.append(train_result["success"])
            if len(success_tracker) > 100:
                success_tracker.pop(0)
            if len(success_tracker) >= 100 and sum(success_tracker) / len(
                    success_tracker) > args.train_threshold:
                break
    finally:
        if args.enable_logging:
            log_writer.close()
        end_flag.value = True
        for p in processes:
            time.sleep(0.1)
            p.join()

    torch.save(shared_model.state_dict(), './models/last_model')
Example #6
    save_dir = args.save_dir
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    if not os.path.exists(os.path.join(save_dir, 'model_pkl')):
        os.mkdir(os.path.join(save_dir, 'model_pkl'))
    if not os.path.exists(os.path.join(save_dir, 'results')):
        os.mkdir(os.path.join(save_dir, 'results'))
    if not os.path.exists(os.path.join(save_dir, 'pre_train_models')):
        os.mkdir(os.path.join(save_dir, 'pre_train_models'))
    # 2. build architecture training dataset
    arch_dataset = build_open_search_space_dataset(args.search_space)
    logger = setup_logger("nasbench_open_%s_cifar10" % args.search_space, args.save_dir, 0, log_level=args.log_level)
    algo_info = algo_params_open_domain(args.algorithm)
    algo_info['total_queries'] = args.budget
    starttime = time.time()
    multiprocessing.set_start_method('spawn')
    temp_k = 10
    file_name = save_dir + '/results/%s_%d.pkl' % (algo_info['algo_name'], algo_info['total_queries'])

    data = build_open_algos(algo_info['algo_name'])(search_space=arch_dataset,
                                                    algo_info=algo_info,
                                                    logger=logger,
                                                    gpus=args.gpus,
                                                    save_dir=save_dir,
                                                    seed=args.seed)
    if 'random' in algo_info['algo_name']:
        results, result_keys = compute_best_test_losses(data, temp_k, total_queries=algo_info['total_queries'])
        algo_result = np.round(results, 5)
    else:
        results, result_keys = compute_darts_test_losses(data, temp_k, total_queries=algo_info['total_queries'])
        algo_result = np.round(results, 5)
Example #7
def main():
    # Main entry point.
    mp.set_start_method('spawn')
    #create model
    torch.set_default_tensor_type('torch.cuda.FloatTensor')
    torch.cuda.set_device(0)

    MODEL_NAME = 'se_resnext50_32x4d'
    MODEL_DIR = op.join('weights',MODEL_NAME)
    BEST_DIR = op.join('weights','best_models')
    if not op.isdir(MODEL_DIR):
        os.mkdir(MODEL_DIR)
    if not op.isdir(BEST_DIR):
        os.mkdir(BEST_DIR)

    #    if args.basenet == 'MultiModal':
    model = MultiModalNet(MODEL_NAME, 'DPN26', 0.5)    
    #        #net = Networktorch.nn.DataParallel(Network, device_ids=[0])
    #    elif  args.basenet == 'oct_resnet101':
    #        model = oct_resnet101()
    #        #net = Networktorch.nn.DataParallel(Network, device_ids=[0])

    model = model.cuda()
    cudnn.benchmark = True
    RESUME = False
    # MODEL_PATH = './weights/best_models/se_resnext50_32x4d_SGD_w_46.pth'
    # Resume from the most recent .pth checkpoint in MODEL_DIR, if one exists.
    pthlist = [i for i in os.listdir(MODEL_DIR) if i.endswith('.pth')]
    if len(pthlist) > 0:
        pthlist.sort(key=lambda x: int(re.findall(r'\d+', x)[-1]))
        MODEL_PATH = op.join(MODEL_DIR, pthlist[-1])
        model.load_state_dict(torch.load(MODEL_PATH))
        RESUME = True

    # Dataset
    Aug = Augmentation()
    Dataset_train = MM_BDXJTU2019(root = args.dataset_root, mode = 'MM_train', 
                                  transform = Aug, TRAIN_IMAGE_DIR = 'train_image_raw')
    #weights = [class_ration[label] for data,label in Dataset_train]
    Dataloader_train = data.DataLoader(Dataset_train, args.batch_size, 
                                 num_workers = args.num_workers,
                                 shuffle = True, pin_memory = True)

    Dataset_val = MM_BDXJTU2019(root = args.dataset_root, mode = 'val', 
                                TRAIN_IMAGE_DIR = 'train_image_raw')
    Dataloader_val = data.DataLoader(Dataset_val, batch_size = 100,
                                 num_workers = args.num_workers,
                                 shuffle = True, pin_memory = True)

    # NOTE: the per-class `weights` computation above is commented out, so fall back to an unweighted loss.
    criterion = nn.CrossEntropyLoss().cuda()
    Optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr = args.lr, momentum = args.momentum,
                          weight_decay = args.weight_decay)

    args.start_epoch = int(re.findall(r'\d+', MODEL_PATH)[-1]) if RESUME else 0
    best_pred1,best_preds = 0,{}
    if RESUME and op.isfile(best_log):
        best_preds = json.load(open(best_log))
        best_pred1 = best_preds['best_pred1']
    elif len(os.listdir(BEST_DIR))>0:
        best_model = MultiModalNet(MODEL_NAME, 'DPN26', 0.5).cuda()
        pthlist = [i for i in os.listdir(BEST_DIR) if i.endswith('.pth')]
        pthlist.sort(key=lambda x: int(re.findall(r'\d+', x)[-1]))
        best_model.load_state_dict(torch.load(op.join(BEST_DIR, pthlist[-1])))
        best_pred1 = validate(Dataloader_val, best_model, criterion, printable=False)[0]
        log_dict(int(re.findall(r'\d+', pthlist[-1])[-1]), best_pred1)

    log('#Resume: Another Start from Epoch {}'.format(args.start_epoch+1))
    for epoch in range(args.start_epoch, args.epochs):
        adjust_learning_rate(Optimizer, epoch)
        # train for one epoch
        train(Dataloader_train, model, criterion, Optimizer, epoch)    #train(Dataloader_train, Network, criterion, Optimizer, epoch)
        # evaluate on validation set
        pred1,pred5 = validate(Dataloader_val, model, criterion)  #pred1 = validate(Dataloader_val, Network, criterion)
        # remember best pred@1 and save checkpoint

        COMMON_MODEL_PATH = op.join(MODEL_DIR,'SGD_1_fold1_{}.pth'.format(epoch+1))
        BEST_MODEL_PATH   = op.join(BEST_DIR,'{}_SGD_1_fold1_{}.pth'.format(MODEL_NAME,epoch+1))
        torch.save(model.state_dict(), COMMON_MODEL_PATH)
        if pred1 > best_pred1:
            best_pred1 = max(pred1, best_pred1)
            torch.save(model.state_dict(), BEST_MODEL_PATH)
            log_dict(epoch+1,best_pred1)
        log('Epoch:{}\tpred1:{}\tBest_pred1:{}\n'.format(epoch+1,pred1,best_pred1))
Example #8
    def __init__(self,
                 args,
                 model,
                 optimizer,
                 train_loader,
                 val_loader,
                 input_train_transform,
                 input_val_transform,
                 output_train_transform,
                 output_val_transform,
                 losses,
                 scheduler=None):

        # Allow multiple processes to access tensors on GPU. Add checking for multiple continuous runs.
        if multiprocessing.get_start_method(allow_none=True) is None:
            multiprocessing.set_start_method(method='spawn')

        self.logger = get_logger(name=__name__,
                                 save_file=args.log_path / args.run_name)

        # Checking whether inputs are correct.
        assert isinstance(model,
                          nn.Module), '`model` must be a Pytorch Module.'
        assert isinstance(
            optimizer,
            optim.Optimizer), '`optimizer` must be a Pytorch Optimizer.'
        assert isinstance(train_loader, DataLoader) and isinstance(val_loader, DataLoader), \
            '`train_loader` and `val_loader` must be Pytorch DataLoader objects.'

        assert callable(input_train_transform) and callable(input_val_transform), \
            'input_transforms must be callable functions.'
        # I think this would be best practice.
        assert isinstance(output_train_transform, nn.Module) and isinstance(output_val_transform, nn.Module), \
            '`output_train_transform` and `output_val_transform` must be Pytorch Modules.'

        # 'losses' is expected to be a dictionary.
        # Even composite losses should be a single loss module with a tuple as its output.
        losses = nn.ModuleDict(losses)

        if scheduler is not None:
            if isinstance(scheduler, optim.lr_scheduler.ReduceLROnPlateau):
                self.metric_scheduler = True
            elif isinstance(scheduler, optim.lr_scheduler._LRScheduler):
                self.metric_scheduler = False
            else:
                raise TypeError(
                    '`scheduler` must be a Pytorch Learning Rate Scheduler.')

        # Display interval of 0 means no display of validation images on TensorBoard.
        if args.max_images <= 0:
            self.display_interval = 0
        else:
            self.display_interval = int(
                len(val_loader.dataset) // (args.max_images * args.batch_size))

        self.manager = CheckpointManager(model,
                                         optimizer,
                                         mode='min',
                                         save_best_only=args.save_best_only,
                                         ckpt_dir=args.ckpt_path,
                                         max_to_keep=args.max_to_keep)

        # loading from checkpoint if specified.
        if vars(args).get('prev_model_ckpt'):
            self.manager.load(load_dir=args.prev_model_ckpt,
                              load_optimizer=False)

        self.model = model
        self.optimizer = optimizer
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.input_train_transform = input_train_transform
        self.input_val_transform = input_val_transform
        self.output_train_transform = output_train_transform
        self.output_val_transform = output_val_transform
        self.losses = losses
        self.scheduler = scheduler
        self.writer = SummaryWriter(str(args.log_path))

        self.img_lambda = torch.tensor(args.img_lambda,
                                       dtype=torch.float32,
                                       device=args.device)
        self.phase_lambda = torch.tensor(args.phase_lambda,
                                         dtype=torch.float32,
                                         device=args.device)
        self.verbose = args.verbose
        self.num_epochs = args.num_epochs
        self.smoothing_factor = args.smoothing_factor
        self.shrink_scale = args.shrink_scale
        self.use_slice_metrics = args.use_slice_metrics

        # This part should get SSIM, not 1 - SSIM.
        self.ssim = SSIM(filter_size=7).to(
            device=args.device)  # Needed to cache the kernel.

        # Logging all components of the Model Trainer.
        # Train and Val input and output transforms are assumed to use the same input transform class.
        self.logger.info(f'''
        Summary of Model Trainer Components:
        Model: {get_class_name(model)}.
        Optimizer: {get_class_name(optimizer)}.
        Input Transforms: {get_class_name(input_val_transform)}.
        Output Transform: {get_class_name(output_val_transform)}.
        Phase (Angle) Loss: {get_class_name(losses["phase_loss"])}.
        Image Domain Loss: {get_class_name(losses['img_loss'])}.
        X (Phase and Magnitude) Loss: {get_class_name(losses['x_loss'])}. 
        Learning-Rate Scheduler: {get_class_name(scheduler)}.
        ''')
Example #9
    base_env = env_creator()
    base_env.reset()
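    # Snapshot the freshly reset environment so each episode worker can rebuild an identical copy.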
    base_env_dict = RailEnvPersister.get_full_state(env=base_env)

    controller_arguments = {
        "model": model,
        "action_size": 5,
    }

    controller_creator = partial(DQNController, **controller_arguments)
    master_controller = controller_creator()

    if multiprocess:
        try:
            print(f"Distributed available: {distributed.is_available()}")
            set_start_method("spawn")
            master_controller.model.share_memory()
        except Exception as e:
            print(f"Could not share memory: {e}")
    for epoch in range(epochs):
        pool_start = time()
        # Create pickable episode kwargs
        episodes_kwargs = [{
            "env_dict": base_env_dict,
            "obs_builder": TreeObsForRailEnv(2),
            "controller_creator": controller_creator,
            "max_episode_length": 1000,
            "render": False,
            "episode_id": epoch * episodes + episode
        } for episode in range(episodes)]
        epoch_trajectories = []
Example #10
                         torch.cat(actions), np.asarray(rewards))
        eploss += loss.item()
        shared_optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 40)

        for param, shared_param in zip(model.parameters(),
                                       shared_model.parameters()):
            if shared_param.grad is None:
                shared_param._grad = param.grad  # sync gradients with shared model
        shared_optimizer.step()


if __name__ == "__main__":
    if sys.version_info[0] > 2:
        mp.set_start_method('spawn')  # this must not be in global scope
    elif sys.platform == 'linux' or sys.platform == 'linux2':
        raise RuntimeError("Must be using Python 3 with linux!")  # or else you get a deadlock in conv2d

    args = get_args()
    args.save_dir = '{}/'.format(
        args.env.lower())  # keep the directory structure simple
    if args.render:
        args.processes = 1
        args.test = True  # render mode -> test mode w one process
    if args.test: args.lr = 0  # don't train in render mode
    args.num_actions = gym.make(
        args.env).action_space.n  # get the action space of this game
    os.makedirs(args.save_dir, exist_ok=True)  # make dir to save models etc.
Example #11
            # write & record out_paths
            data_paths_i.append([])
            for tr in st:
                out_path = os.path.join(event_dir,
                                        '%s.%s' % (net_sta, tr.stats.channel))
                tr.stats.sac.t0, tr.stats.sac.t1 = tp - start_time, ts - start_time
                tr.write(out_path, format='sac')
                data_paths_i[-1].append(out_path)
        return data_paths_i

    def __len__(self):
        return len(self.sta_date_items)


if __name__ == '__main__':
    mp.set_start_method('spawn', force=True)  # 'spawn' or 'forkserver'
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', type=str, default='/data/Example_data')
    parser.add_argument('--temp_pha', type=str, default='input/example.temp')
    parser.add_argument('--out_root',
                        type=str,
                        default='output/example_templates')
    args = parser.parse_args()

    # read fpha
    temp_list = read_ftemp(args.temp_pha)
    sta_date_dict = get_sta_date(temp_list)
    sta_date_items = list(sta_date_dict.items())
    # for sta-date pairs
    data_paths = []
    dataset = Cut_Templates(sta_date_items)
Example #12
    def search(self, train_data, test_data, timeout=60 * 60 * 24):
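        """Pop the next queued model and train it in a single-worker process pool while this
        thread searches for the next architecture to train; record the result, update the
        Bayesian optimizer, and persist the searcher state."""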
        start_time = time.time()
        torch.cuda.empty_cache()
        if not self.history:
            self.init_search()

        # Start the new process for training.
        graph, father_id, model_id = self.training_queue.pop(0)
        if self.verbose:
            print('\n')
            print('+' + '-' * 46 + '+')
            print('|' + 'Training model {}'.format(model_id).center(46) + '|')
            print('+' + '-' * 46 + '+')
        mp.set_start_method('spawn', force=True)
        pool = mp.Pool(1)
        try:
            train_results = pool.map_async(train, [(graph, train_data, test_data, self.trainer_args,
                                                    os.path.join(self.path, str(model_id) + '.png'),
                                                    self.metric, self.loss, self.verbose, self.path)])

            # Do the search in current thread.
            searched = False
            new_graph = None
            new_father_id = None
            if not self.training_queue:
                searched = True

                while new_father_id is None:
                    remaining_time = timeout - (time.time() - start_time)
                    new_graph, new_father_id = self.bo.optimize_acq(self.search_tree.adj_list.keys(),
                                                                    self.descriptors,
                                                                    remaining_time)
                new_model_id = self.model_count
                self.model_count += 1
                self.training_queue.append((new_graph, new_father_id, new_model_id))
                self.descriptors.append(new_graph.extract_descriptor())

            remaining_time = timeout - (time.time() - start_time)
            if remaining_time <= 0:
                raise TimeoutError
            metric_value, loss, graph = train_results.get(timeout=remaining_time)[0]

            if self.verbose and searched:
                verbose_print(new_father_id, new_graph)

            self.add_model(metric_value, loss, graph, model_id)
            self.search_tree.add_child(father_id, model_id)
            self.bo.fit(self.x_queue, self.y_queue)
            self.x_queue = []
            self.y_queue = []

            pickle_to_file(self, os.path.join(self.path, 'searcher'))
            self.export_json(os.path.join(self.path, 'history.json'))

        except (mp.TimeoutError, TimeoutError) as e:
            raise TimeoutError from e
        except RuntimeError as e:
            if not re.search('out of memory', str(e)):
                raise e
            if self.verbose:
                print('\nCurrent model size is too big. Discontinuing training this model to search for other models.')
            Constant.MAX_MODEL_SIZE = graph.size() - 1
            return
        finally:
            # terminate and join the subprocess to prevent any resource leak
            pool.terminate()
            pool.close()
            pool.join()
Example #13
    def _further_initialization(self):
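        """Finalize the parsed parameters: handle dense mode, default control-point spacing,
        tensor types, t0 / time-shift initialization, thread settings, and the fixed effects
        frozen for (longitudinal) registration models."""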

        if self.dense_mode:
            Settings().dense_mode = self.dense_mode
            print(
                '>> Dense mode activated. No distinction will be made between template and control points.'
            )
            assert len(self.template_specifications) == 1, \
                'Only a single object can be considered when using the dense mode.'
            if not self.freeze_control_points:
                self.freeze_control_points = True
                msg = 'With active dense mode, the freeze_template (currently %s) and freeze_control_points ' \
                      '(currently %s) flags are redundant. Defaulting to freeze_control_points = True.' \
                      % (str(self.freeze_template), str(self.freeze_control_points))
                warnings.warn(msg)
            if self.initial_control_points is not None:
                self.initial_control_points = None
                msg = 'With active dense mode, specifying initial_control_points is useless. Ignoring this xml entry.'
                warnings.warn(msg)

        if self.initial_cp_spacing < 0 and self.initial_control_points is None and not self.dense_mode:
            print(
                '>> No initial CP spacing given: using diffeo kernel width of '
                + str(self.deformation_kernel_width))
            self.initial_cp_spacing = self.deformation_kernel_width

        # Setting tensor types according to CUDA availability and user choices.
        if self._cuda_is_used:
            if not torch.cuda.is_available():
                msg = 'CUDA seems to be unavailable. All computations will be carried out on CPU.'
                warnings.warn(msg)
            else:
                print(
                    ">> CUDA is used at least in one operation, all operations will be done with FLOAT precision."
                )
                if self.use_cuda:
                    print(">> All tensors will be CUDA tensors.")
                    Settings().tensor_scalar_type = torch.cuda.FloatTensor
                    Settings().tensor_integer_type = torch.cuda.LongTensor
                else:
                    print(">> Setting tensor type to float.")
                    Settings().tensor_scalar_type = torch.FloatTensor

        # Setting the dimension.
        Settings().dimension = self.dimension

        # If longitudinal model and t0 is not initialized, initializes it.
        if (self.model_type == 'regression' or self.model_type == 'LongitudinalAtlas'.lower()
            or self.model_type == 'LongitudinalRegistration'.lower()) \
                and (self.t0 is None or self.initial_time_shift_variance is None):
            total_number_of_visits = 0
            mean_visit_age = 0.0
            var_visit_age = 0.0
            for i in range(len(self.visit_ages)):
                for j in range(len(self.visit_ages[i])):
                    total_number_of_visits += 1
                    mean_visit_age += self.visit_ages[i][j]
                    var_visit_age += self.visit_ages[i][j]**2

            if total_number_of_visits > 0:
                mean_visit_age /= float(total_number_of_visits)
                var_visit_age = (
                    var_visit_age / float(total_number_of_visits) -
                    mean_visit_age**2)

                if self.t0 is None:
                    print('>> Initial t0 set to the mean visit age: %.2f' %
                          mean_visit_age)
                    self.t0 = mean_visit_age
                else:
                    print(
                        '>> Initial t0 set by the user to %.2f ; note that the mean visit age is %.2f'
                        % (self.t0, mean_visit_age))

                if not self.model_type == 'regression':
                    if self.initial_time_shift_variance is None:
                        print(
                            '>> Initial time-shift std set to the empirical std of the visit ages: %.2f'
                            % math.sqrt(var_visit_age))
                        self.initial_time_shift_variance = var_visit_age
                    else:
                        print((
                            '>> Initial time-shift std set by the user to %.2f ; note that the empirical std of '
                            'the visit ages is %.2f') %
                              (self.initial_time_shift_variance,
                               math.sqrt(var_visit_age)))

        # Setting the number of threads in general settings
        Settings().number_of_threads = self.number_of_threads
        if self.number_of_threads > 1:
            print(
                ">> I will use", self.number_of_threads,
                "threads, and I set OMP_NUM_THREADS and torch_num_threads to 1."
            )
            os.environ['OMP_NUM_THREADS'] = "1"
            torch.set_num_threads(1)
        else:
            print('>> Setting OMP_NUM_THREADS and torch_num_threads to 4.')
            os.environ['OMP_NUM_THREADS'] = "4"
            torch.set_num_threads(4)

            try:
                set_start_method("spawn")
            except RuntimeError as error:
                print('>> Warning: ' + str(error) +
                      ' [ in xml_parameters ]. Ignoring.')

        self._initialize_state_file()

        # Freeze the fixed effects in case of a registration.
        if self.model_type == 'Registration'.lower():
            self.freeze_template = True
            self.freeze_control_points = True

        elif self.model_type == 'LongitudinalRegistration'.lower():
            self.freeze_template = True
            self.freeze_control_points = True
            self.freeze_momenta = True
            self.freeze_modulation_matrix = True
            self.freeze_reference_time = True
            self.freeze_time_shift_variance = True
            self.freeze_log_acceleration_variance = True
            self.freeze_noise_variance = True

        # Initialize the number of sources if needed.
        if self.model_type == 'LongitudinalAtlas'.lower() \
                and self.initial_modulation_matrix is None and self.number_of_sources is None:
            self.number_of_sources = 4
            print(
                '>> No initial modulation matrix given, neither a number of sources. '
                'The latter will be ARBITRARILY defaulted to 4.')

        if self.dimension <= 1:
            print(
                "Setting the number of sources to 0 because the dimension is 1."
            )
            self.number_of_sources = 0

        # Initialize the initial_log_acceleration_variance if needed.
        if (self.model_type == 'LongitudinalAtlas'.lower() or self.model_type == 'LongitudinalRegistration'.lower()) \
                and self.initial_log_acceleration_variance is None:
            print(
                '>> The initial log-acceleration std fixed effect is ARBITRARILY set to 0.5'
            )
            log_acceleration_std = 0.5
            self.initial_log_acceleration_variance = (log_acceleration_std**2)

        # Image grid downsampling factor.
        if not self.downsampling_factor == 1:
            image_object_specs = [
                (key, value)
                for key, value in self.template_specifications.items()
                if value['deformable_object_type'].lower() == 'image'
            ]
            if len(image_object_specs) > 1:
                raise RuntimeError('Only a single image object can be used.')
            elif len(image_object_specs) == 1:
                print('>> Setting the image grid downsampling factor to: %d.' %
                      self.downsampling_factor)
                self.template_specifications[image_object_specs[0][0]][
                    'downsampling_factor'] = self.downsampling_factor
            else:
                msg = 'The "downsampling_factor" parameter is useful only for image data, ' \
                      'but none is considered here. Ignoring.'
                warnings.warn(msg)
Example #14
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)

if __name__ == '__main__':
    args = parser.parse_args()

    use_cuda = args.cuda and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    dataloader_kwargs = {'pin_memory': True} if use_cuda else {}

    torch.manual_seed(args.seed)
    mp.set_start_method('spawn')

    model = Net().to(device)
    model.share_memory() # gradients are allocated lazily, so they are not shared here

    processes = []
    for rank in range(args.num_processes):
        p = mp.Process(target=train, args=(rank, args, model, device, dataloader_kwargs))
        # We first train the model across `num_processes` processes
        p.start()
        processes.append(p)
    for p in processes:
        p.join()

    # Once training is complete, we can test the model
    test(args, model, device, dataloader_kwargs)
Example #15
def experiment(args):
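    """Build the environment, logger, policy and Q networks, shared replay buffer and async
    collector from the parsed config, then train a TwinSACQ agent."""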

    device = torch.device("cuda:{}".format(args.device) if args.cuda else "cpu")

    env = get_env( params['env_name'], params['env'])

    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    if args.cuda:
        torch.backends.cudnn.deterministic=True
    
    buffer_param = params['replay_buffer']

    experiment_name = os.path.split( os.path.splitext( args.config )[0] )[-1] if args.id is None \
        else args.id
    logger = Logger( experiment_name , params['env_name'], args.seed, params, args.log_dir )

    params['general_setting']['env'] = env
    params['general_setting']['logger'] = logger
    params['general_setting']['device'] = device

    params['net']['base_type']=networks.MLPBase

    import torch.multiprocessing as mp
    mp.set_start_method('spawn')

    pf = policies.GuassianContPolicy (
        input_shape = env.observation_space.shape[0], 
        output_shape = 2 * env.action_space.shape[0],
        **params['net'] )
    qf1 = networks.FlattenNet( 
        input_shape = env.observation_space.shape[0] + env.action_space.shape[0],
        output_shape = 1,
        **params['net'] )

    qf2 = networks.FlattenNet( 
        input_shape = env.observation_space.shape[0] + env.action_space.shape[0],
        output_shape = 1,
        **params['net'] )

    # pretrain_pf = policies.UniformPolicyContinuous(env.action_space.shape[0])
    
    example_ob = env.reset()
    example_dict = { 
        "obs": example_ob,
        "next_obs": example_ob,
        "acts": env.action_space.sample(),
        "rewards": [0],
        "terminals": [False]
    }
    replay_buffer = AsyncSharedReplayBuffer( int(buffer_param['size']),
            args.worker_nums
    )
    replay_buffer.build_by_example(example_dict)

    params['general_setting']['replay_buffer'] = replay_buffer

    epochs = params['general_setting']['pretrain_epochs'] + \
        params['general_setting']['num_epochs']
    print(epochs)
    params['general_setting']['collector'] = AsyncParallelCollector(
        env, pf, replay_buffer,
        get_env, {
            "env_id": params['env_name'],
            "env_param": params['env']},
        device=device,
        worker_nums=args.worker_nums, eval_worker_nums= args.eval_worker_nums,
        train_epochs = epochs,
        eval_epochs= params['general_setting']['num_epochs']
    )

    params['general_setting']['save_dir'] = osp.join(logger.work_dir,"model")
    agent = TwinSACQ(
        pf = pf,
        qf1 = qf1,
        qf2 = qf2,
        **params['sac'],
        **params['general_setting']
    )
    agent.train()
Example #16
    def meta_fit(self, meta_dataset_generator):
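        """Spawn one meta-training process per device, feed them task batches over pipes from
        background threads, then load the trained learners and (optionally) fit an ensembler
        on meta-validation predictions before returning the final learner."""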

        catchable_sigs = set(signal.Signals) - {signal.SIGKILL, signal.SIGSTOP}
        for sig in catchable_sigs:
            signal.signal(
                sig,
                receive_signal)  # Substitute handler of choice for `print`
        LOGGER.debug('My PID: %s' % os.getpid())

        self.timer.begin('main training')
        mp.set_start_method('spawn', force=True)

        # >>> BUG: OS Error: Too many opened files
        # >>> SOLVED: by `ulimit -HSn 4096`
        # Now, we change all the queues to pipe
        self.timer.begin('build data pipeline')
        # every 10 epoch will produce one valid
        train_data_reservoir = [
            queue.Queue(32 * 10) for i in range(len(self.devices))
        ]
        valid_data_reservoir = [
            queue.Queue(200) for i in range(len(self.devices))
        ]
        meta_valid_reservoir = [
            queue.Queue(self.eval_tasks) for i in range(self.total_exp)
        ]
        train_recv, valid_recv = [], []
        train_send, valid_send = [], []
        for i in range(len(self.devices)):
            recv, send = Pipe(True)
            # activate the first handshake
            recv.send(True)
            train_recv.append(recv)
            train_send.append(send)
            recv, send = Pipe(True)
            # activate the first handshake
            recv.send(True)
            valid_recv.append(recv)
            valid_send.append(send)

        def apply_device_to_hp(hp, device):
            hp['device'] = 'cuda:{}'.format(device)
            return hp

        self.timer.end('build data pipeline')

        self.timer.begin('build main proc pipeline')
        clsnum = get_base_class_number(meta_dataset_generator)
        LOGGER.info('base class number detected', clsnum)
        procs = [
            mp.Process(target=run_exp,
                       args=(self.modules[i].MyMetaLearner,
                             apply_device_to_hp(self.hp[i], dev),
                             train_recv[i], valid_recv[i], clsnum))
            for i, dev in enumerate(self.devices)
        ]
        for p in procs:
            p.daemon = True
            p.start()

        self.timer.end('build main proc pipeline')
        LOGGER.info('build data',
                    self.timer.query_time_by_name('build data pipeline'),
                    'build proc',
                    self.timer.query_time_by_name('build main proc pipeline'))
        label_meta_valid = []

        data_generation = True

        self.timer.begin('prepare dataset')
        meta_train_dataset = meta_dataset_generator.meta_train_pipeline.batch(
            1)
        meta_train_generator = cycle(iter(meta_train_dataset))
        meta_valid_dataset = meta_dataset_generator.meta_valid_pipeline.batch(
            1)
        meta_valid_generator = cycle(iter(meta_valid_dataset))
        self.timer.end('prepare dataset')
        LOGGER.info('prepare dataset',
                    self.timer.query_time_by_name('prepare dataset'))

        valid_ens_data_load_number = 0

        def generate_data():
            # manage data globally
            # declare nonlocal so the increment of the task counter below updates the enclosing scope
            nonlocal valid_ens_data_load_number
            while data_generation:
                for i in range(32 * 10):
                    # load train
                    if not data_generation:
                        break
                    data_train = process_task_batch(next(meta_train_generator),
                                                    device=torch.device('cpu'),
                                                    with_origin_label=True)
                    for dr in train_data_reservoir:
                        try:
                            dr.put_nowait(data_train)
                        except:
                            pass
                    time.sleep(0.0001)

                for i in range(200):
                    # load valid
                    if not data_generation:
                        break
                    data_valid = process_task_batch(next(meta_valid_generator),
                                                    device=torch.device('cpu'),
                                                    with_origin_label=False)
                    for dr in valid_data_reservoir:
                        try:
                            dr.put_nowait(data_valid)
                        except:
                            pass
                    if random.random() < 0.1:
                        for dr in meta_valid_reservoir:
                            try:
                                if dr.qsize() < self.eval_tasks:
                                    valid_ens_data_load_number += 1
                                    dr.put_nowait([
                                        data_valid[0][0], data_valid[0][1],
                                        data_valid[1][0]
                                    ])
                                    label_meta_valid.extend(
                                        data_valid[1][1].tolist())
                            except:
                                pass
                    time.sleep(0.0001)

        def put_data_train_passive(i):
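            # Wait for worker i's handshake on its pipe, then send it the next processed
            # training batch; return once the worker signals False.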
            while data_generation:
                try:
                    if train_send[i].recv():
                        supp, quer = train_data_reservoir[i].get()
                        data = self.modules[i].process_data(
                            supp, quer, True, self.hp[i])
                        train_send[i].send(data)
                    else:
                        return
                except:
                    pass

        def put_data_valid_passive(i):
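            # Same as above, but feeds processed validation batches to worker i.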
            while data_generation:
                try:
                    if valid_send[i].recv():
                        supp, quer = valid_data_reservoir[i].get()
                        data = self.modules[i].process_data(
                            supp, quer, False, self.hp[i])
                        valid_send[i].send(data)
                    else:
                        return
                except:
                    pass

        thread_pool = [threading.Thread(target=generate_data)] + \
            [threading.Thread(target=put_data_train_passive, args=(i,)) for i in range(self.total_exp)] + \
            [threading.Thread(target=put_data_valid_passive, args=(i,)) for i in range(self.total_exp)]

        for th in thread_pool:
            th.daemon = True
            th.start()

        try:
            # we leave about 20 min for decoding of test
            for p in procs:
                p.join(max(self.timer.time_left() - 60 * 10, 0.1))

            self.timer.begin('clear env')
            # terminate proc that is out-of-time
            LOGGER.info('Main meta-train is done',
                        '' if self.timer.time_left() > 60 else 'time out exit')
            LOGGER.info('time left', self.timer.time_left(), 's')
            for p in procs:
                if p.is_alive():
                    p.terminate()

            data_generation = False
            # in case there are blocking
            for q in train_data_reservoir + valid_data_reservoir:
                if q.empty():
                    q.put(False)
            for s in train_recv + valid_recv:
                s.send(False)
            for s in train_send + train_recv + valid_send + valid_recv:
                s.close()
            for p in thread_pool:
                p.join()
            self.timer.end('clear env')
            LOGGER.info('clear env',
                        self.timer.query_time_by_name('clear env'))

            self.timer.end('main training')
        except Exception:
            LOGGER.info('error occured in main process')
            traceback.print_exc()

        LOGGER.info(
            'spawn total {} meta valid tasks. main training time {}'.format(
                valid_ens_data_load_number,
                self.timer.query_time_by_name('main training')))

        self.timer.begin('load learner')

        self.meta_learners = [None] * self.total_exp

        def load_model(args):
            module, hp, i = args
            self.meta_learners[i] = module.load_model(hp)

        pool = [
            threading.Thread(target=load_model,
                             args=((self.modules[i], self.hp[i], i), ))
            for i in range(self.total_exp)
        ]
        for p in pool:
            p.daemon = True
            p.start()
        for p in pool:
            p.join()

        self.timer.end('load learner')
        LOGGER.info('load learner done, time spent',
                    self.timer.query_time_by_name('load learner'))

        if not isinstance(self.ensemble, int):
            # instead of just weighted sum, we plan to use stacking
            procs = []
            reses = [None] * len(self.meta_learners)

            self.timer.begin('validation')

            recv_list, sent_list = [], []
            for i in range(self.total_exp):
                r, s = Pipe(True)
                r.send(True)
                recv_list.append(r)
                sent_list.append(s)

            pool = mp.Pool(self.total_exp)
            procs = pool.starmap_async(
                predict, [(self.meta_learners[i], recv_list[i],
                           self.eval_tasks, self.hp[i]['device'], {
                               'time_fired': time.time(),
                               'taskid': i
                           }) for i in range(self.total_exp)])

            # start sub thread to pass data
            def pass_meta_data(i):
                for _ in range(self.eval_tasks):
                    if sent_list[i].recv():
                        # LOGGER.info(i, 'fire data signal get')
                        sent_list[i].send(meta_valid_reservoir[i].get())
                        # LOGGER.info(i, 'data is sent')

            threads = [
                threading.Thread(target=pass_meta_data, args=(i, ))
                for i in range(self.total_exp)
            ]
            for t in threads:
                t.daemon = True
                t.start()

            for _ in range(self.eval_tasks - valid_ens_data_load_number):
                data_valid = next(meta_valid_generator)
                data_valid = process_task_batch(data_valid,
                                                device=torch.device('cpu'),
                                                with_origin_label=False)
                label_meta_valid.extend(data_valid[1][1].tolist())
                for dr in meta_valid_reservoir:
                    dr.put(
                        [data_valid[0][0], data_valid[0][1], data_valid[1][0]])
                # LOGGER.info('put data!')
            # LOGGER.info('all data done!')

            # now we can receive data
            for t in threads:
                t.join()
            reses = [sent_list[i].recv()['res'] for i in range(self.total_exp)]
            # every res in reses is a np.array of shape (eval_task * WAY * QUERY) * WAY
            ENS_VALID_TASK = 50
            ENS_VALID_ELEMENT = ENS_VALID_TASK * 5 * 19
            reses_test_list = [
                deepcopy(res[-ENS_VALID_ELEMENT:]) for res in reses
            ]

            self.timer.end('validation')
            LOGGER.info('valid data predict done',
                        self.timer.query_time_by_name('validation'))

            weight = [1.] * len(self.meta_learners)
            labels = np.array(label_meta_valid, dtype=int)  # 19000
            acc_o = ((np.array(weight)[:, None, None] / sum(weight) *
                      np.array(reses)).sum(axis=0).argmax(
                          axis=1) == labels).mean()
            reses = np.array(reses, dtype=float).transpose((1, 0, 2))
            reses_test = reses[-ENS_VALID_ELEMENT:].reshape(
                ENS_VALID_ELEMENT, -1)
            reses = reses[:-ENS_VALID_ELEMENT]
            reses = reses.reshape(len(reses), -1)
            labels_test = labels[-ENS_VALID_ELEMENT:]
            labels = labels[:-ENS_VALID_ELEMENT]
            LOGGER.info('voting result', acc_o)

            self.timer.begin('ensemble')

            # mp.set_start_method('fork', True)
            result = pool.map(
                ensemble_on_data,
                [
                    (GBMEnsembler(), reses, labels, 'gbm'),
                    (GLMEnsembler(), reses, labels, 'glm'),
                    (NBEnsembler(), reses, labels, 'nb'),
                    (RFEnsembler(), reses, labels, 'rf'
                     )  # too over-fit on simple dataset
                ])

            # test the ensemble model
            def acc(logit, label):
                return (logit.argmax(axis=1) == label).mean()

            res_test = [x[0]._predict(reses_test) for x in result]
            acc_test = [acc(r, labels_test) for r in res_test]
            acc_single_test = [
                acc(np.array(r), labels_test) for r in reses_test_list
            ]
            LOGGER.info('ensemble test', 'gbm', 'glm', 'nb', 'rf', acc_test)
            LOGGER.info('single test', acc_single_test)

            if max(acc_test) > max(acc_single_test):
                LOGGER.info("will use ensemble model")
                #idx_acc_max = np.argmax([x[1] for x in result])
                idx_acc_max = np.argmax(acc_test)
                self.timer.end('ensemble')
                print('best ensembler', ['gbm', 'glm', 'nb',
                                         'rf'][idx_acc_max], 'acc',
                      acc_test[idx_acc_max])
                print('ensemble done, time cost',
                      self.timer.query_time_by_name('ensemble'))

                # currently we use mean of output as ensemble
                return MyLearner(self.meta_learners,
                                 result[idx_acc_max][0],
                                 timers=self.timer)
            else:
                LOGGER.info("will use single model")
                idx_acc_max = np.argmax(acc_single_test)
                self.timer.end('ensemble')
                print('best single model id', idx_acc_max)
                print('ensemble done, time cost',
                      self.timer.query_time_by_name('ensemble'))

                # return only the best meta learners
                return MyLearner([self.meta_learners[idx_acc_max]], 0,
                                 self.timer)
        return MyLearner([self.meta_learners[self.ensemble]],
                         0,
                         timers=self.timer)
Example #17
def automate_training(config,
                      param,
                      fixed_split,
                      all_combin,
                      n_iterations=1,
                      run_test=False,
                      all_logs=False,
                      thr_increment=None):
    """Automate multiple training processes on multiple GPUs.

    Hyperparameter optimization of models is tedious and time-consuming. This function automates this optimization
    across multiple GPUs. It runs trainings, on the same training and validation datasets, for every combination of a
    given set of parameters and the set of values provided for each of them. Results are collected for each combination
    and reported in a dataframe to allow comparison. The script efficiently allocates each training to one of the
    available GPUs.

    Usage example::

        ivadomed_automate_training -c config.json -p params.json -n n_iterations

    .. csv-table:: Example of dataframe
       :file: ../../images/detailed_results.csv

    Args:
        config (string): Configuration filename, which is used as a skeleton to configure the training. Some of its
            parameters (defined in the `param` file) are modified across experiments. Flag: ``--config``, ``-c``
        param (string): JSON file containing the parameter configurations to compare. Parameter "keys" of this file
            need to match the parameter "keys" of the `config` file. Parameter "values" are in a list. Flag: ``--param``, ``-p``

            Example::

                {"default_model": {"depth": [2, 3, 4]}}

        fixed_split (bool): If True, all the experiments are run on the same training/validation/testing subdatasets.
                            Flag: ``--fixed-split``
        all_combin (bool): If True, all parameters combinations are run. Flag: ``--all-combin``
        n_iterations (int): Controls the number of times each experiment (i.e. set of parameters) is run.
                            Flag: ``--n-iteration``, ``-n``
        run_test (bool): If True, the trained model is also run on the testing subdataset. Flag: ``--run-test``
        all_logs (bool): If True, all the log directories are kept for every iteration. Flag: ``--all-logs``, ``-l``
        thr_increment (float): A threshold analysis is performed at the end of the training using the trained model and
            the validation sub-dataset to find the optimal binarization threshold. The specified value indicates the
            increment between 0 and 1 used during the ROC analysis (e.g. 0.1). Flag: ``-t``, ``--thr-increment``
    """
    # Load initial config
    initial_config = imed_config_manager.ConfigurationManager(
        config).get_config()

    # Hyperparameters values to experiment
    with open(param, "r") as fhandle:
        hyperparams = json.load(fhandle)
    param_dict, names_dict = {}, {}
    for category in hyperparams.keys():
        assert category in initial_config
        base_item = initial_config[category]
        keys = list(hyperparams[category].keys())
        values = [hyperparams[category][k] for k in keys]
        new_parameters, names = make_category(base_item, keys, values,
                                              all_combin)
        param_dict[category] = new_parameters
        names_dict[category] = names

    # Split dataset if not already done
    if fixed_split and (initial_config.get("split_path") is None):
        train_lst, valid_lst, test_lst = imed_loader_utils.get_new_subject_split(
            path_folder=initial_config["loader_parameters"]["bids_path"],
            center_test=initial_config["split_dataset"]["center_test"],
            split_method=initial_config["split_dataset"]["method"],
            random_seed=initial_config["split_dataset"]["random_seed"],
            train_frac=initial_config["split_dataset"]["train_fraction"],
            test_frac=initial_config["split_dataset"]["test_fraction"],
            log_directory="./",
            balance=initial_config["split_dataset"]['balance']
            if 'balance' in initial_config["split_dataset"] else None)

        # save the subject distribution
        split_dct = {'train': train_lst, 'valid': valid_lst, 'test': test_lst}
        split_path = "./" + "common_split_datasets.joblib"
        joblib.dump(split_dct, split_path)
        initial_config["split_dataset"]["fname_split"] = split_path

    config_list = []
    # Test all combinations (change multiple parameters for each test)
    if all_combin:

        # Cartesian product (all combinations)
        combinations = (dict(zip(param_dict.keys(), values))
                        for values in product(*param_dict.values()))
        names = list(product(*names_dict.values()))

        for idx, combination in enumerate(combinations):

            new_config = copy.deepcopy(initial_config)

            for i, param in enumerate(combination):
                value = combination[param]
                new_config[param] = value
                new_config["log_directory"] = new_config[
                    "log_directory"] + names[idx][i]

            config_list.append(copy.deepcopy(new_config))
    # Change a single parameter for each test
    else:
        for param in param_dict:
            new_config = copy.deepcopy(initial_config)
            for value, name in zip(param_dict[param], names_dict[param]):
                new_config[param] = value
                new_config[
                    "log_directory"] = initial_config["log_directory"] + name
                config_list.append(copy.deepcopy(new_config))
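    # At this point config_list holds one configuration per parameter variant (or
    # one per combination when all_combin is set), each with its own
    # log_directory suffix.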

    # CUDA problem when forking process
    # https://github.com/pytorch/pytorch/issues/2517
    mp.set_start_method('spawn')
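    # Note: the start method can only be set once per process and must be chosen
    # before the Pool below is created.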

    # Run each config in a separate process, with at most n_gpus processes at a time
    pool = mp.Pool(processes=len(initial_config["gpu"]))

    results_df = pd.DataFrame()
    eval_df = pd.DataFrame()
    all_mean = pd.DataFrame()
    for i in range(n_iterations):
        if not fixed_split:
            # Set seed for iteration
            seed = random.randint(1, 10001)
            for config in config_list:
                config["split_dataset"]["random_seed"] = seed
                if all_logs:
                    if i:
                        config["log_directory"] = config[
                            "log_directory"].replace(
                                "_n=" + str(i - 1).zfill(2),
                                "_n=" + str(i).zfill(2))
                    else:
                        config["log_directory"] += "_n=" + str(i).zfill(2)
        validation_scores = pool.map(
            partial(train_worker, thr_incr=thr_increment), config_list)
        val_df = pd.DataFrame(validation_scores,
                              columns=[
                                  'log_directory', 'best_training_dice',
                                  'best_training_loss', 'best_validation_dice',
                                  'best_validation_loss'
                              ])

        if run_test:
            new_config_list = []
            for config in config_list:
                # Delete path_pred
                path_pred = os.path.join(config['log_directory'], 'pred_masks')
                if os.path.isdir(path_pred) and n_iterations > 1:
                    try:
                        shutil.rmtree(path_pred)
                    except OSError as e:
                        print("Error: %s - %s." % (e.filename, e.strerror))

                # Use the config file saved in the log_directory, since binarize_prediction may have been updated
                json_path = os.path.join(config['log_directory'],
                                         'config_file.json')
                new_config = imed_config_manager.ConfigurationManager(
                    json_path).get_config()
                new_config["gpu"] = config["gpu"]
                new_config_list.append(new_config)

            test_results = pool.map(test_worker, new_config_list)

            df_lst = []
            # Merge all eval dataframes together to produce a single results file
            for j, result in enumerate(test_results):
                df = result[-1]
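                # First iteration: mean/std are taken across the rows of this run's
                # eval dataframe; later iterations accumulate per-iteration means in
                # all_mean and report the mean/std across iterations.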

                if i == 0:
                    all_mean = df.mean(axis=0)
                    std_metrics = df.std(axis=0)
                    metrics = pd.concat([all_mean, std_metrics],
                                        sort=False,
                                        axis=1)
                else:
                    all_mean = pd.concat([all_mean, df.mean(axis=0)],
                                         sort=False,
                                         axis=1)
                    mean_metrics = all_mean.mean(axis=1)
                    std_metrics = all_mean.std(axis=1)
                    metrics = pd.concat([mean_metrics, std_metrics],
                                        sort=False,
                                        axis=1)

                metrics.rename({0: "mean"}, axis=1, inplace=True)
                metrics.rename({1: "std"}, axis=1, inplace=True)
                id = result[0].split("_n=")[0]
                cols = metrics.columns.values
                for idx, col in enumerate(cols):
                    metrics.rename({col: col + "_" + id}, axis=1, inplace=True)
                df_lst.append(metrics)
                test_results[j] = result[:2]

            # Init or add eval results to dataframe
            eval_df = pd.concat(df_lst, sort=False, axis=1)

            test_df = pd.DataFrame(test_results,
                                   columns=['log_directory', 'test_dice'])
            combined_df = val_df.set_index('log_directory').join(
                test_df.set_index('log_directory'))
            combined_df = combined_df.reset_index()

        else:
            combined_df = val_df

        results_df = pd.concat([results_df, combined_df])
        results_df.to_csv("temporary_results.csv")
        eval_df.to_csv("average_eval.csv")

    # Merge config and results in a df
    config_df = pd.DataFrame.from_dict(config_list)
    keep = list(param_dict.keys())
    keep.append("log_directory")
    config_df = config_df[keep]

    results_df = config_df.set_index('log_directory').join(
        results_df.set_index('log_directory'))
    results_df = results_df.reset_index()
    results_df = results_df.sort_values(by=['best_validation_loss'])

    results_df.to_csv("detailed_results.csv")

    print("Detailed results")
    print(results_df)

    # Compute avg, std, p-values
    if n_iterations > 1:
        compute_statistics(results_df, n_iterations, run_test)
Example #18
0
    parser.add_argument('--ood_update_rounds', type=int, default=50)
    parser.add_argument('--ood_drop_lower', type=int, default=10)
    parser.add_argument('--ood_drop_upper', type=int, default=60)
    # Saving settings
    parser.add_argument('--save_freq', type=int, default=100)
    parser.add_argument('--exp_name', type=str, default='test')
    parser.add_argument('--save-dir', type=str, default="./experiments")
    parser.add_argument('--traj_dir', type=str, default="./experiments")
    parser.add_argument('--model_para', type=str, default="test_sac.torch")
    parser.add_argument('--cpc_para', type=str, default="test_cpc.torch")
    parser.add_argument('--numpy_para', type=str, default="test.numpy")
    parser.add_argument('--train_indicator', type=str, default="test.data")
    args = parser.parse_args()

    # Basic Settings
    mp.set_start_method("forkserver")
    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['MKL_NUM_THREADS'] = '1'
    torch.set_num_threads(torch.get_num_threads())
    experiment_dir = os.path.join(args.save_dir, args.exp_name)
    if not os.path.exists(experiment_dir):
        os.makedirs(experiment_dir)
    tensorboard_dir = os.path.join(experiment_dir, "runs")
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)
    with open(os.path.join(experiment_dir, "arguments"), 'w') as f:
        json.dump(args.__dict__, f, indent=2)
    device = torch.device("cuda") if args.cuda else torch.device("cpu")
    # env and model setup
    ac_kwargs = dict(hidden_sizes=[args.hid] * args.l)
    # if args.exp_name == "test":
Example #19
0
    metavar='AM',
    help='Adam optimizer amsgrad parameter')
parser.add_argument('--skip-rate',
                    type=int,
                    default=4,
                    metavar='SR',
                    help='frame skip rate (default: 4)')

if __name__ == '__main__':
    args = parser.parse_args()
    torch.manual_seed(args.seed)
    if args.gpu_ids == -1:
        args.gpu_ids = [-1]
    else:
        torch.cuda.manual_seed(args.seed)
        mp.set_start_method('spawn')  # multiprocessing

    setup_json = read_config(args.env_config)
    env_conf = setup_json["Default"]
    for i in setup_json.keys():
        if i in args.env:
            env_conf = setup_json[i]

    env = atari_env(args.env, env_conf, args)
    shared_model = A3Clstm(env.observation_space.shape[0],
                           env.action_space)  # main A3C

    if args.load:  # if --load is True, load the .dat file.
        saved_state = torch.load('{0}{1}.dat'.format(args.load_model_dir,
                                                     args.env),
                                 map_location=lambda storage, loc: storage)
Example #20
0
def main(opt, Tracker_opt, Pose_opt, ReIDCfg, Num_Pred_opt):
    '''Main function'''

    main_timer = Timer()
    main_timer.tic()
    mp.set_start_method('spawn')
    logger.log(20, 'The pid of main() : {}'.format(os.getpid()))
    logger.log(20, 'The thread of main() : {}'.format(currentThread()))
    queueSize = 3
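    # Pipeline, wired below with bounded mp.Queue(queueSize) handoffs between stages:
    # Short_Track -> Coordinate_transfer -> Pose_Estimate -> Number_Predict.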

    IF_Restart = {
        'FMLoader': False,
        'Calibrate_transfer': False,
        'Alphapose': False,
        'SVHN_Predict': True
    }
    stack_index = get_intermediate_index(opt, IF_Restart)
    [S_Short_track, S_Coordinate_transfer, S_Pose_Estimate,
     S_Number_Predict] = stack_index

    # [S_Short_track, S_Coordinate_transfer, S_Pose_Estimate,S_Number_Predict] = [114,114,114,0]

    # Run tracking based on the file information.
    Tracker_output_queue = mp.Queue(queueSize)
    # Short_Track performs short-term tracking of people's trajectories and outputs ReID features.
    Tracker = mp.Process(target=Short_Track,
                         args=(opt, Tracker_opt, Tracker_output_queue,
                               S_Short_track, S_Coordinate_transfer, queueSize,
                               True))
    Tracker.daemon = True
    Tracker.start()
    # Short_Track(opt,  Tracker_opt, Tracker_output_queue, S_Short_track, S_Coordinate_transfer)

    C_T_output_queue = mp.Queue(queueSize)  # C_T : coordinate transfer.
    # Coordinate_transfer(opt, Tracker_opt, Tracker_output_queue, C_T_output_queue,S_Coordinate_transfer,S_Pose_Estimate)
    # Based on the tracking data, transfer it to the other camera viewpoints and generate the corresponding crops and ReID features.
    C_transfer = mp.Process(target=Coordinate_transfer,
                            args=(opt, Tracker_opt, Tracker_output_queue,
                                  C_T_output_queue, S_Coordinate_transfer,
                                  S_Pose_Estimate, queueSize, True))
    C_transfer.daemon = True
    C_transfer.start()
    #
    #
    Pose_output_queue = mp.Queue(queueSize)
    # Pose_Estimate(opt, Pose_opt, C_T_output_queue, Pose_output_queue, S_Pose_Estimate, S_Number_Predict)
    # The previous two steps produce a series of sub_imgs; run Pose_Estimate on them and apply the corresponding updates.
    P_estimate = mp.Process(target=Pose_Estimate,
                            args=(opt, Pose_opt, ReIDCfg, C_T_output_queue,
                                  Pose_output_queue, S_Pose_Estimate,
                                  S_Number_Predict, queueSize, True))
    P_estimate.daemon = True
    P_estimate.start()
    #
    Number_Predict(opt, Num_Pred_opt, ReIDCfg, Pose_output_queue,
                   S_Number_Predict, queueSize, True)
    # N_predict = mp.Process(target = Number_Predict, args=(opt, Num_Pred_opt, Pose_output_queue, S_Number_Predict,queueSize))
    # N_predict.daemon = True
    # N_predict.start()

    Tracker.join()
    logger.log(25, '----------------Finished tracker process----------------')
    C_transfer.join()
    logger.log(25,
               '----------------Finished C_transfer process----------------')
    P_estimate.join()
    logger.log(25,
               '----------------Finished P_estimate process----------------')
    # N_predict.join()
    logger.log(
        25, '----------------Finished Number_Predict process----------------')

    Total_time = main_timer.toc()
    Total_time = secs_to_clock(Total_time)
    logger.log(31, 'target dir : {}'.format(opt.dir_name))
    logger.log(31, 'main function consums {}'.format(Total_time))
Example #21
0
def main():
    import torch.multiprocessing as mp
    mp.set_start_method('spawn')
    import argparse
    arg_parser = argparse.ArgumentParser(
        description="Color training pipeline.")
    arg_parser.add_argument('-g', '--gpu', default='0', help='gpu id.')
    arg_parser.add_argument(
        "--checkpoint",
        "-c",
        dest="checkpoint",
        default="2000",
        help=
        'The checkpoint weights to use. This can be a number indicating an epoch or "latest" '
        + "for the latest weights (this is the default).",
    )
    arg_parser.add_argument('--test_step',
                            '-t',
                            type=int,
                            default=1,
                            help='test step.')
    arg_parser.add_argument(
        '--data_path',
        default='data/demo_multiview_real/',
        help='path to the dataset.')  #TODO need to change the default
    arg_parser.add_argument(
        '--obj_name',
        default='chairs',
        help=
        'deepsdf class model for experiments (currently only supports "chairs").'
    )
    arg_parser.add_argument('--visualize',
                            action='store_true',
                            help='visualization flag.')
    arg_parser.add_argument('--scale',
                            type=float,
                            default=0.2,
                            help='scale the size of input image.')

    args = arg_parser.parse_args()
    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)

    ################################
    num_views_per_round = 6
    num_epoch = 5
    sep_dist = 1
    refine_sim = True
    ini_mean_shape = False
    ################################

    # load data
    exp_dir = os.path.join('deepsdf/experiments/', args.obj_name)
    out_dir = 'vis/demo_multiview_real'

    upper_loader = LoaderMultiReal(args.data_path,
                                   scale=args.scale,
                                   refine_sim=refine_sim)

    for instance_num in range(len(upper_loader)):
        # load data
        instance_name, imgs, _, cameras, sim_mtrx_ini = upper_loader[
            instance_num]

        vis_dir = os.path.join(out_dir, '{}'.format(instance_num))
        os.makedirs(vis_dir, exist_ok=True)
        total_num_img = len(imgs)

        # RANDOMLY initialize shape code
        latent_size = 256
        std_ = 0.1
        if ini_mean_shape:
            shape_code = torch.zeros(1, latent_size)
        else:
            shape_code = torch.ones(1, latent_size).normal_(mean=0, std=std_)

        shape_code = shape_code.float().cuda()
        shape_code.requires_grad = True

        if refine_sim:
            sim3 = easydict.EasyDict({
                "scale":
                torch.tensor(0., requires_grad=True, device="cuda"),
                "rot":
                torch.tensor([0., 0., 0.], requires_grad=True, device="cuda"),
                "trans":
                torch.tensor([0., 0., 0.], requires_grad=True, device="cuda"),
            })
            optim_list = [{
                "params": [v for k, v in sim3.items()],
                "lr": LR
            }, {
                "params": [shape_code],
                "lr": LR
            }]
            optimizer_latent = torch.optim.Adam(optim_list)
        else:
            optimizer_latent = torch.optim.Adam([shape_code], lr=LR)
            sim3 = None

        decoder = load_decoder(exp_dir, args.checkpoint)
        decoder = decoder.module.cuda()
        img_h, img_w = imgs[0].shape[0], imgs[0].shape[1]
        img_hw = (img_h, img_w)
        print('Image size: {0}.'.format(img_hw))
        sdf_renderer = SDFRenderer_warp(decoder,
                                        cameras[0].intrinsic,
                                        march_step=200,
                                        buffer_size=1,
                                        threshold=THRESHOLD,
                                        transform_matrix=np.array(
                                            [[1., 0., 0.], [0., 1., 0.],
                                             [0., 0., 1.]]))
        evaluator = Evaluator(decoder)
        visualizer = Visualizer(img_hw)

        weight_list = {}
        weight_list['color'] = 5.0
        weight_list['l2reg'] = 1.0

        shape_code, optimizer_latent = optimize_multi_view(
            sdf_renderer,
            evaluator,
            shape_code,
            optimizer_latent,
            imgs,
            cameras,
            weight_list,
            num_views_per_round=num_views_per_round,
            num_iters=num_epoch,
            sep_dist=sep_dist,
            test_step=args.test_step,
            visualizer=visualizer,
            points_gt=None,
            sim3=sim3,
            sim3_init=sim_mtrx_ini,
            vis_flag=args.visualize,
            vis_dir=vis_dir)
    print('Finished. Check results in {}.'.format(out_dir))
Example #22
0
    action='store_true',
    help='Run the atari env with the name below instead of ai2thor')
parser.add_argument('--atari-render',
                    dest='atari_render',
                    action='store_true',
                    help='Render atari')
parser.add_argument(
    '--atari-env-name',
    default='PongDeterministic-v4',
    help='environment to train on (default: PongDeterministic-v4)')
#
parser.set_defaults(atari=False)
parser.set_defaults(atari_render=False)

if __name__ == '__main__':
    mp.set_start_method('forkserver')
    os.environ['OMP_NUM_THREADS'] = '1'
    # os.environ['CUDA_VISIBLE_DEVICES'] = "1"

    torch.cuda.empty_cache()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print('Device:', device)

    args = parser.parse_args()

    torch.manual_seed(args.seed)
    if args.atari:
        env = create_atari_env(args.atari_env_name)
        args.frame_dim = 42  # fixed to be 42x42 in envs.py _process_frame42()
    else:
        args.config_dict = {'max_episode_length': args.max_episode_length}
Example #23
0
def main():
    global device
    global graphname

    print(socket.gethostname())
    seed = 0

    if not download:
        mp.set_start_method('spawn', force=True)
        outputs = None
        if "OMPI_COMM_WORLD_RANK" in os.environ.keys():
            os.environ["RANK"] = os.environ["OMPI_COMM_WORLD_RANK"]
        dist.init_process_group(backend='nccl')
        rank = dist.get_rank()
        size = dist.get_world_size()
        print("Processes: " + str(size))

        # device = torch.device('cpu')
        devid = rank_to_devid(rank, acc_per_rank)
        device = torch.device('cuda:{}'.format(devid))
        torch.cuda.set_device(device)
        curr_devid = torch.cuda.current_device()
        # print(f"curr_devid: {curr_devid}", flush=True)
        devcount = torch.cuda.device_count()

    if graphname == "Cora":
        path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data',
                        graphname)
        dataset = Planetoid(path, graphname, T.NormalizeFeatures())
        data = dataset[0]
        data = data.to(device)
        data.x.requires_grad = True
        inputs = data.x.to(device)
        inputs.requires_grad = True
        data.y = data.y.to(device)
        edge_index = data.edge_index
        num_features = dataset.num_features
        num_classes = dataset.num_classes
    elif graphname == "Reddit":
        path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data',
                        graphname)
        dataset = Reddit(path, T.NormalizeFeatures())
        data = dataset[0]
        data = data.to(device)
        data.x.requires_grad = True
        inputs = data.x.to(device)
        inputs.requires_grad = True
        data.y = data.y.to(device)
        edge_index = data.edge_index
        num_features = dataset.num_features
        num_classes = dataset.num_classes
    elif graphname == 'Amazon':
        # path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', graphname)
        # edge_index = torch.load(path + "/processed/amazon_graph.pt")
        # edge_index = torch.load("/gpfs/alpine/bif115/scratch/alokt/Amazon/processed/amazon_graph_jsongz.pt")
        # edge_index = edge_index.t_()
        print(f"Loading coo...", flush=True)
        edge_index = torch.load("../data/Amazon/processed/data.pt")
        print(f"Done loading coo", flush=True)
        # n = 9430088
        n = 14249639
        num_features = 300
        num_classes = 24
        # mid_layer = 24
        inputs = torch.rand(n, num_features)
        data = Data()
        data.y = torch.rand(n).uniform_(0, num_classes - 1).long()
        data.train_mask = torch.ones(n).long()
        # edge_index = edge_index.to(device)
        print(f"edge_index.size: {edge_index.size()}", flush=True)
        print(f"edge_index: {edge_index}", flush=True)
        data = data.to(device)
        # inputs = inputs.to(device)
        inputs.requires_grad = True
        data.y = data.y.to(device)
    elif graphname == 'subgraph3':
        # path = "/gpfs/alpine/bif115/scratch/alokt/HipMCL/"
        # print(f"Loading coo...", flush=True)
        # edge_index = torch.load(path + "/processed/subgraph3_graph.pt")
        # print(f"Done loading coo", flush=True)
        print(f"Loading coo...", flush=True)
        edge_index = torch.load("../data/subgraph3/processed/data.pt")
        print(f"Done loading coo", flush=True)
        n = 8745542
        num_features = 128
        # mid_layer = 512
        # mid_layer = 64
        num_classes = 256
        inputs = torch.rand(n, num_features)
        data = Data()
        data.y = torch.rand(n).uniform_(0, num_classes - 1).long()
        data.train_mask = torch.ones(n).long()
        print(f"edge_index.size: {edge_index.size()}", flush=True)
        data = data.to(device)
        inputs.requires_grad = True
        data.y = data.y.to(device)

    if download:
        exit()

    if normalization:
        adj_matrix, _ = add_remaining_self_loops(edge_index,
                                                 num_nodes=inputs.size(0))
    else:
        adj_matrix = edge_index
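    # With normalization enabled, self-loops are added so that each node's own
    # features are included when the adjacency is (presumably) normalized downstream.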

    init_process(rank, size, inputs, adj_matrix, data, num_features,
                 num_classes, device, outputs, run)

    if outputs is not None:
        return outputs[0]
Example #24
0
import torch.multiprocessing as multiprocessing
multiprocessing.set_start_method('spawn', force=True)
from models.networks.sync_batchnorm import DataParallelWithCallback
import sys
import numpy as np
import os
import data
from util.iter_counter import IterationCounter
from options.test_options import TestOptions
from models.test_model import TestModel
from util.visualizer import Visualizer
from util import html, util
from torch.multiprocessing import Process, Queue, Pool
from data.data_utils import init_parallel_jobs
from skimage import transform as trans
import cv2
import time
import torch
from models.networks.rotate_render import TestRender


def create_path(a_path, b_path):
    name_id_path = os.path.join(a_path, b_path)
    if not os.path.exists(name_id_path):
        os.makedirs(name_id_path)
    return name_id_path


def create_paths(save_path,
                 img_path,
                 foldername='orig',
Example #25
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--local_rank', type=int)
    args = parser.parse_args()
    print('what is the rank of the current program: ')
    print(args.local_rank)

    # initialize dist
    if mp.get_start_method(allow_none=True) is None:
        mp.set_start_method('spawn')
    rank = int(args.local_rank)
    torch.cuda.set_device(rank)
    dist.init_process_group(backend='nccl', init_method='env://')
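    # init_method='env://' reads MASTER_ADDR, MASTER_PORT, RANK and WORLD_SIZE from
    # the environment (typically exported by torch.distributed.launch, which also
    # supplies --local_rank).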

    # init logger before other steps
    logger = logging.getLogger()
    if not logger.hasHandlers():
        logging.basicConfig(
            format='%(asctime)s - %(levelname)s - %(message)s',
            level=log_level
        )
    if args.local_rank != 0:
        logger.setLevel('ERROR')
    logger.info('Starting Distributed training')

    # set random seeds
    if seed is not None:
        logger.info('Set random seed to {}'.format(seed))
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

    # build dataset
    coco_dataset = CocoDataset(
        ann_file=ann_file,
        img_prefix=img_prefix,
        img_scale=img_scale,
        img_norm_cfg=img_norm_cfg,
        multiscale_mode='value', # select a scale, rather than random from a range.
        flip_ratio=flip_ratio,
        with_ignore=False,
        with_label=True,
        extra_aug=extra_aug,
        test_mode=False,
    )

    # build model
    model = SSDDetector(
        pretrained=None
    )
    model.CLASSES = coco_dataset.CLASSES

    # save class names in
    # checkpoints as meta data
    checkpoint_config['meta'] = dict(
        CLASSES=coco_dataset.CLASSES
    )

    # build sampler for shuffling, padding, and mixing.
    _, world_size = get_dist_info()
    sampler = DistributedGroupSampler(
        dataset=coco_dataset,
        samples_per_gpu=imgs_per_gpu,
        num_replicas=world_size,
        rank=args.local_rank,
    )

    # build data loader.
    data_loader = DataLoader(
        dataset=coco_dataset,
        batch_size=imgs_per_gpu,
        # shuffle should be False when sampler is given.
        shuffle=False,
        sampler=sampler,
        batch_sampler=None,
        num_workers=workers_per_gpu,
        collate_fn=partial(collate, samples_per_gpu=imgs_per_gpu),
        pin_memory=False,
        drop_last=False,
        timeout=0,
        worker_init_fn=None,
    )

    # put model on gpus
    # AdaptedDistributedDataParallel(
    #   (module): SSDDetector(
    #     (backbone): SSDVGG(
    #       (features): Sequential(
    #         (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    #         (1): ReLU(inplace=True)
    #         (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    #         (3): ReLU(inplace=True)
    #         (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=True)
    #         (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    #         (6): ReLU(inplace=True)
    #         (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    #         (8): ReLU(inplace=True)
    #         (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=True)
    #         (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    #         (11): ReLU(inplace=True)
    #         (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    #         (13): ReLU(inplace=True)
    #         (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    #         (15): ReLU(inplace=True)
    #         (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=True)
    #         (17): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    #         (18): ReLU(inplace=True)
    #         (19): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    #         (20): ReLU(inplace=True)
    #         (21): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    #         (22): ReLU(inplace=True)
    #         (23): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=True)
    #         (24): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    #         (25): ReLU(inplace=True)
    #         (26): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    #         (27): ReLU(inplace=True)
    #         (28): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    #         (29): ReLU(inplace=True)
    #         (30): MaxPool2d(kernel_size=3, stride=1, padding=1, dilation=1, ceil_mode=False)
    #         (31): Conv2d(512, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(6, 6), dilation=(6, 6))
    #         (32): ReLU(inplace=True)
    #         (33): Conv2d(1024, 1024, kernel_size=(1, 1), stride=(1, 1))
    #         (34): ReLU(inplace=True)
    #       )
    #       (extra): Sequential(
    #         (0): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1))
    #         (1): Conv2d(256, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    #         (2): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1))
    #         (3): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    #         (4): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1))
    #         (5): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1))
    #         (6): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1))
    #         (7): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1))
    #       )
    #       (l2_norm): L2Norm()
    #     )
    #     (bbox_head): SSDHead(
    #       (reg_convs): ModuleList(
    #         (0): Conv2d(512, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    #         (1): Conv2d(1024, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    #         (2): Conv2d(512, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    #         (3): Conv2d(256, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    #         (4): Conv2d(256, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    #         (5): Conv2d(256, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    #       )
    #       (cls_convs): ModuleList(
    #         (0): Conv2d(512, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    #         (1): Conv2d(1024, 486, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    #         (2): Conv2d(512, 486, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    #         (3): Conv2d(256, 486, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    #         (4): Conv2d(256, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    #         (5): Conv2d(256, 324, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    #       )
    #     )
    #   )
    # )
    model = MMDistributedDataParallel(model.cuda())

    # build optimizer
    if hasattr(model, 'module'):
        pure_model = model.module
    else:
        pure_model = model

    # for name, param in pure_model.named_parameters():
    #     if param.requires_grad:
    #         print(name, param.shape, param.requires_grad)
    optimizer = torch.optim.SGD(
        params=pure_model.parameters(),
        lr=lr,
        momentum=momentum,
        dampening=0,
        weight_decay=weight_decay,
        nesterov=False
    )

    # build runner: a training helper.
    #   model (:obj:`torch.nn.Module`): The model to be run.
    #   batch_processor (callable): A callable method that process a data
    #       batch. The interface of this method should be
    #       `batch_processor(model, data, train_mode) -> dict`
    #   optimizer (dict or :obj:`torch.optim.Optimizer`).
    #   work_dir (str, optional): The working directory to save checkpoints
    #       and logs.
    #   log_level (int): Logging level.
    #   logger (:obj:`logging.Logger`): Custom logger. If `None`, use the
    #       default logger.
    runner = Runner(
        model=model,
        batch_processor=batch_processor,
        optimizer=optimizer,
        work_dir=work_dir,
        log_level=logging.INFO,
        logger=None,
    )

    # register hooks: optimization after the forward
    optimizer_config = DistOptimizerHook(
        grad_clip=grad_clip,
        coalesce=True,
        bucket_size_mb=-1,
    )
    # register hooks: along with training
    runner.register_training_hooks(
        lr_config=lr_config,
        optimizer_config=optimizer_config,
        checkpoint_config=checkpoint_config,
        log_config=log_config
    )
    # register hooks: set sampler seed before each epoch
    runner.register_hook(DistSamplerSeedHook())

    # resume from: epoch and iter to be continued.
    # load from: start as 0.
    if resume_from is not None:
        runner.resume(resume_from)
    elif load_from is not None:
        runner.load_checkpoint(load_from)

    # data_loaders (list[:obj:`DataLoader`]): Dataloaders for training
    #   and validation.
    # workflow (list[tuple]): A list of (phase, epochs) to specify the
    #   running order and epochs. E.g, [('train', 2), ('val', 1)] means
    #   running 2 epochs for training and 1 epoch for validation,
    #   iteratively.
    # max_epochs (int): Total training epochs.
    runner.run(data_loaders=[data_loader], workflow=workflow, max_epochs=total_epochs)
Example #26
0
import numpy as np
import os
import torch
import torch.nn as nn
import torch.multiprocessing as mp

from utils.options import Options
from utils.factory import GlobalLogsDict, ActorLogsDict, LearnerLogsDict, EvaluatorLogsDict
from utils.factory import LoggersDict, ActorsDict, LearnersDict, EvaluatorsDict, TestersDict
from utils.factory import EnvsDict, MemoriesDict, ModelsDict

if __name__ == '__main__':
    mp.set_start_method("spawn", force=True)

    opt = Options()
    torch.manual_seed(opt.seed)

    env_prototype = EnvsDict[opt.env_type]
    memory_prototype = MemoriesDict[opt.memory_type]
    model_prototype = ModelsDict[opt.model_type]

    # dummy env to get state/action/reward/gamma/terminal_shape & action_space
    dummy_env = env_prototype(opt.env_params, 0)
    opt.norm_val = dummy_env.norm_val # use the max val of env states to normalize model inputs
    opt.state_shape = dummy_env.state_shape
    opt.action_shape = dummy_env.action_shape
    opt.action_space = dummy_env.action_space
    opt.reward_shape = opt.agent_params.num_tasks
    opt.gamma_shape = opt.agent_params.num_tasks
    opt.terminal_shape = opt.agent_params.num_tasks
    del dummy_env
Example #27
0
def init_multiprocessing():
    try:
        multiprocessing.set_start_method('fork')
    except RuntimeError:
        pass
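# Minimal usage sketch (hypothetical): call this once at program start, before any
# Process or Pool objects are created, e.g.
#
#   if __name__ == '__main__':
#       init_multiprocessing()
#       main()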
Example #28
0
        cudnn.benchmark = True

    if cfg.resume != "":
        load_model(model, cfg.resume)
    criterion = UHT_Loss()
    optimizer = torch.optim.Adam(model.parameters(), lr=cfg.learning_rate)

    scheduler = lr_scheduler.StepLR(optimizer,
                                    step_size=len(train_loader) *
                                    cfg.decay_epoch,
                                    gamma=cfg.decay_rate)

    print('Start training Model.')
    for epoch in range(cfg.start_epoch, cfg.end_epoch):
        train(model, train_loader, criterion, scheduler, optimizer, epoch)
        if valset:
            validation(model, val_loader, criterion)

    print('End.')


if __name__ == '__main__':
    from torch.multiprocessing import set_start_method
    try:
        set_start_method('spawn')
    except RuntimeError:
        pass
    global cfg
    cfg = init_cfg()
    main()
Example #29
0
def main_eval(args, create_shared_model, init_agent):
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    random.seed(args.seed)

    if args.gpu_ids == -1:
        args.gpu_ids = [-1]
    else:
        torch.cuda.manual_seed(args.seed)
        try:
            mp.set_start_method("spawn")
        except RuntimeError:
            pass

    model_to_open = args.load_model

    processes = []

    res_queue = mp.Queue()
    if args.model == "SAVN":
        args.learned_loss = True
        args.num_steps = 6
        target = savn_val
    else:
        args.learned_loss = False
        args.num_steps = 50
        target = nonadaptivea3c_val

    rank = 0
    for scene_type in args.scene_types:
        p = mp.Process(
            target=target,
            args=(
                rank,
                args,
                model_to_open,
                create_shared_model,
                init_agent,
                res_queue,
                250,
                scene_type,
            ),
        )
        p.start()
        processes.append(p)
        time.sleep(0.1)
        rank += 1
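    # One evaluation process per scene type; each pushes its results onto res_queue
    # (250 results, per the argument above) and finishes with an "END" marker that
    # the aggregation loop below uses to detect completion.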

    count = 0
    end_count = 0
    train_scalars = ScalarMeanTracker()

    train_scalars_ba = ScalarMeanTracker()
    train_scalars_be = ScalarMeanTracker()
    train_scalars_k = ScalarMeanTracker()
    train_scalars_l = ScalarMeanTracker()

    proc = len(args.scene_types)
    pbar = tqdm(total=250 * proc)

    try:
        while end_count < proc:
            train_result = res_queue.get()
            pbar.update(1)
            count += 1
            if (args.scene_types[end_count] == 'bathroom'):
                train_scalars_ba.add_scalars(train_result)
            if (args.scene_types[end_count] == 'bedroom'):
                train_scalars_be.add_scalars(train_result)
            if (args.scene_types[end_count] == 'kitchen'):
                train_scalars_k.add_scalars(train_result)
            if (args.scene_types[end_count] == 'living_room'):
                train_scalars_l.add_scalars(train_result)
            if "END" in train_result:
                end_count += 1
                continue
            train_scalars.add_scalars(train_result)

        tracked_means = train_scalars.pop_and_reset()

        tracked_means_ba = train_scalars_ba.pop_and_reset()
        tracked_means_be = train_scalars_be.pop_and_reset()
        tracked_means_k = train_scalars_k.pop_and_reset()
        tracked_means_l = train_scalars_l.pop_and_reset()

    finally:
        for p in processes:
            time.sleep(0.1)
            p.join()

    with open(args.results_json, "w") as fp:
        json.dump(tracked_means, fp, sort_keys=True, indent=4)

    # with open('all_data_'+args.results_json, "a+") as f:
    #     json.dump(args.load_model, f)
    #     json.dump(tracked_means, f, sort_keys=True, indent=4)

    if (args.room_results):
        with open('all_data_ba_' + args.results_json, "a+") as f:
            json.dump(args.load_model, f)
            json.dump(tracked_means_ba, f, sort_keys=True, indent=4)
    if (args.room_results):
        with open('all_data_be_' + args.results_json, "a+") as f:
            json.dump(args.load_model, f)
            json.dump(tracked_means_be, f, sort_keys=True, indent=4)
    if (args.room_results):
        with open('all_data_k_' + args.results_json, "a+") as f:
            json.dump(args.load_model, f)
            json.dump(tracked_means_k, f, sort_keys=True, indent=4)
    if (args.room_results):
        with open('all_data_l_' + args.results_json, "a+") as f:
            json.dump(args.load_model, f)
            json.dump(tracked_means_l, f, sort_keys=True, indent=4)
Example #30
0
    def search(self, train_data, test_data, timeout=60 * 60 * 24):
        start_time = time.time()
        torch.cuda.empty_cache()
        if not self.history:
            self.init_search()

        # Start the new process for training.
        graph, father_id, model_id = self.training_queue.pop(0)
        if self.verbose:
            print('\n')
            print('╒' + '=' * 46 + '╕')
            print('|' + 'Training model {}'.format(model_id).center(46) + '|')
            print('╘' + '=' * 46 + '╛')
        mp.set_start_method('spawn', force=True)
        pool = mp.Pool(1)
        train_results = pool.map_async(train, [(graph, train_data, test_data, self.trainer_args,
                                                os.path.join(self.path, str(model_id) + '.png'),
                                                self.metric, self.loss, self.verbose)])
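        # map_async returns immediately: the model trains in the worker process while
        # the architecture search below runs in the current thread; (metric_value,
        # loss, graph) is collected later via get(timeout=...).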

        # Do the search in current thread.
        try:
            if not self.training_queue:
                new_graph, new_father_id = self.bo.optimize_acq(self.search_tree.adj_list.keys(),
                                                                self.descriptors,
                                                                timeout)
                # Did not find a new architecture
                if new_father_id is None:
                    return
                new_model_id = self.model_count
                self.model_count += 1
                self.training_queue.append((new_graph, new_father_id, new_model_id))
                self.descriptors.append(new_graph.extract_descriptor())

                if self.verbose:
                    cell_size = [24, 49]
                    header = ['Father Model ID', 'Added Operation']
                    line = '|'.join(str(x).center(cell_size[i]) for i, x in enumerate(header))
                    print('\n' + '+' + '-' * len(line) + '+')
                    print('|' + line + '|')
                    print('+' + '-' * len(line) + '+')
                    for i in range(len(new_graph.operation_history)):
                        if i == len(new_graph.operation_history) // 2:
                            r = [new_father_id, new_graph.operation_history[i]]
                        else:
                            r = [' ', new_graph.operation_history[i]]
                        line = '|'.join(str(x).center(cell_size[i]) for i, x in enumerate(r))
                        print('|' + line + '|')
                    print('+' + '-' * len(line) + '+')
            remaining_time = timeout - (time.time() - start_time)
            if remaining_time > 0:
                metric_value, loss, graph = train_results.get(timeout=remaining_time)[0]
            else:
                raise TimeoutError
        except (mp.TimeoutError, TimeoutError) as e:
            raise TimeoutError from e
        finally:
            # terminate and join the subprocess to prevent any resource leak
            pool.close()
            pool.join()
        self.add_model(metric_value, loss, graph, model_id)
        self.search_tree.add_child(father_id, model_id)
        self.bo.fit(self.x_queue, self.y_queue)
        self.x_queue = []
        self.y_queue = []

        pickle_to_file(self, os.path.join(self.path, 'searcher'))
        self.export_json(os.path.join(self.path, 'history.json'))
Example #31
0
import tempfile
import time
import uuid
from datetime import datetime
from functools import partial

# FIXME: When 'ray' is imported after 'pickle' it throws an exception.
import ray  # noqa: F401, I001
import torch.multiprocessing as multiprocessing

from experiments import CONFIGS
from nupic.research.frameworks.vernon.distributed import ImagenetExperiment
from nupic.research.frameworks.vernon.parser_utils import MAIN_PARSER, process_args
from nupic.research.frameworks.vernon.run import run, terminate_processes

multiprocessing.set_start_method("spawn", force=True)
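# force=True overrides any start method that may already have been set, so importing
# this module does not raise "context has already been set".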


def create_trials(config):
    """
    Create a trial configuration for each trial variant, evaluating 'ray.tune'
    functions (grid_search, sample_from, ...) into their final values and
    creating the local and log dir for each trial.

    :param config: Ray tune configuration with 'ray.tune' functions
    :return: list of dict for each trial configuration variant
    """
    from nupic.research.support.ray_utils import generate_trial_variants

    trials = generate_trial_variants(config)
    timestamp = datetime.today().strftime("%Y-%m-%d_%H-%M-%S")
Example #32
0
from torch.utils.data.dataloader import default_collate, ExceptionWrapper, MANAGER_STATUS_CHECK_INTERVAL
from common import TestCase, run_tests, TEST_NUMPY, IS_WINDOWS

# We cannot import TEST_CUDA from common_nn here, because if we do that,
# the TEST_CUDNN line from common_nn will be executed multiple times
# as well during the execution of this test suite, and it will cause
# CUDA OOM error on Windows.
TEST_CUDA = torch.cuda.is_available()

# We need spawn start method for test_manager_unclean_exit, but
# Python 2.7 doesn't allow it.
if sys.version_info[0] == 3:
    # Without the try-catch block, some tests will complain that
    # context has already been set.
    try:
        multiprocessing.set_start_method('spawn')
    except RuntimeError:
        pass


JOIN_TIMEOUT = 17.0 if IS_WINDOWS else 6.5


class TestDatasetRandomSplit(TestCase):
    def test_lengths_must_equal_datset_size(self):
        with self.assertRaises(ValueError):
            random_split([1, 2, 3, 4], [1, 2])

    def test_splits_have_correct_size(self):
        splits = random_split([1, 2, 3, 4, 5, 6], [2, 4])
        self.assertEqual(len(splits), 2)
Example #33
0
def main():

    actor_critic = False
    agent = False
    past_steps = 0
    try:
        os.makedirs(args.log_dir)
    except OSError:
        files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv'))
        for f in files:
            if args.overwrite:
                os.remove(f)
            else:
                pass
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None
        win_eval = None

    import torch.multiprocessing as multiprocessing
    multiprocessing.set_start_method('spawn')
    torch.manual_seed(args.seed)
    envs = make_vec_envs(args.env_name,
                         args.seed,
                         args.num_processes,
                         args.gamma,
                         args.log_dir,
                         args.add_timestep,
                         device,
                         False,
                         None,
                         args=args)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)
    num_actions = 1
    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={
                              'map_width': args.map_width,
                              'num_actions': num_actions,
                              'recurrent': args.recurrent_policy
                          },
                          curiosity=args.curiosity,
                          algo=args.algo,
                          model=args.model,
                          args=args)

    evaluator = None

    if not agent:
        if args.algo == 'a2c':
            agent = algo.A2C_ACKTR_NOREWARD(actor_critic,
                                            args.value_loss_coef,
                                            args.entropy_coef,
                                            lr=args.lr,
                                            eps=args.eps,
                                            alpha=args.alpha,
                                            max_grad_norm=args.max_grad_norm,
                                            curiosity=args.curiosity,
                                            args=args)
        elif args.algo == 'ppo':
            agent = algo.PPO(actor_critic,
                             args.clip_param,
                             args.ppo_epoch,
                             args.num_mini_batch,
                             args.value_loss_coef,
                             args.entropy_coef,
                             lr=args.lr,
                             eps=args.eps,
                             max_grad_norm=args.max_grad_norm)
        elif args.algo == 'acktr':
            agent = algo.A2C_ACKTR_NOREWARD(actor_critic,
                                            args.value_loss_coef,
                                            args.entropy_coef,
                                            lr=args.lr,
                                            eps=args.eps,
                                            alpha=args.alpha,
                                            max_grad_norm=args.max_grad_norm,
                                            acktr=True,
                                            curiosity=args.curiosity,
                                            args=args)

#saved_model = os.path.join(args.save_dir, args.env_name + '.pt')
    saved_model = os.path.join(args.save_dir, args.env_name + '.tar')
    if os.path.exists(saved_model) and not args.overwrite:
        checkpoint = torch.load(saved_model)
        actor_critic.load_state_dict(checkpoint['model_state_dict'])
        actor_critic.to(device)
        agent.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        past_steps = checkpoint['past_steps']
        ob_rms = checkpoint['ob_rms']
        past_steps = next(iter(
            agent.optimizer.state_dict()['state'].values()))['step']
        saved_args = checkpoint['args']
        new_recs = args.n_recs - saved_args.n_recs
        for nr in range(new_recs):
            actor_critic.base.auto_expand()
        if saved_args.n_recs > args.n_recs:
            print('applying {} fractal expansions to network'.format(
                saved_args.n_recs - args.n_recs))
        print('Resuming from step {}'.format(past_steps))

        #print(type(next(iter((torch.load(saved_model))))))
        #actor_critic, ob_rms = \
        #        torch.load(saved_model)
        #agent = \
        #    torch.load(os.path.join(args.save_dir, args.env_name + '_agent.pt'))
        #if not agent.optimizer.state_dict()['state'].values():
        #    past_steps = 0
        #else:

        #    raise Exception

        vec_norm = get_vec_normalize(envs)
        if vec_norm is not None:
            vec_norm.eval()
            vec_norm.ob_rms = ob_rms
    actor_critic.to(device)

    if 'LSTM' in args.model:
        recurrent_hidden_state_size = actor_critic.base.get_recurrent_state_size(
        )
    else:
        recurrent_hidden_state_size = actor_critic.recurrent_hidden_state_size
    if args.curiosity:
        rollouts = CuriosityRolloutStorage(
            args.num_steps,
            args.num_processes,
            envs.observation_space.shape,
            envs.action_space,
            recurrent_hidden_state_size,
            actor_critic.base.feature_state_size(),
            args=args)
    else:
        rollouts = RolloutStorage(args.num_steps,
                                  args.num_processes,
                                  envs.observation_space.shape,
                                  envs.action_space,
                                  recurrent_hidden_state_size,
                                  args=args)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    model = actor_critic.base
    done = False
    for j in range(past_steps, num_updates):
        if args.model == 'fractal' and args.drop_path:
            model.set_drop_path()
        if args.model == 'fixed' and model.RAND:
            model.num_recursions = random.randint(1, model.map_width * 2)

        player_act = None
        for step in range(args.num_steps):
            # if type(done) is not bool:
            #     if done.any():
            #         obs = envs.reset()
            # elif done:
            #     obs = env.reset()
            # Sample actions
            with torch.no_grad():

                value, action, action_log_probs, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step],
                    player_act=player_act,
                    icm_enabled=args.curiosity,
                    deterministic=False)

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            player_act = None
            if args.render:

                if infos[0]:
                    if 'player_move' in infos[0].keys():
                        player_act = infos[0]['player_move']

            if args.curiosity:
                # run icm
                with torch.no_grad():

                    feature_state, feature_state_pred, action_dist_pred = actor_critic.icm_act(
                        (rollouts.obs[step], obs, action_bin))

                intrinsic_reward = args.eta * (
                    (feature_state - feature_state_pred).pow(2)).sum() / 2.
                if args.no_reward:
                    reward = 0
                reward += intrinsic_reward.cpu()

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            if args.curiosity:
                rollouts.insert(obs, recurrent_hidden_states, action,
                                action_log_probs, value, reward, masks,
                                feature_state, feature_state_pred, action_bin,
                                action_dist_pred)
            else:
                rollouts.insert(obs, recurrent_hidden_states, action,
                                action_log_probs, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)
        if args.curiosity:
            value_loss, action_loss, dist_entropy, fwd_loss, inv_loss = agent.update(
                rollouts)
        else:
            value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            ob_rms = getattr(get_vec_normalize(envs), 'ob_rms', None)
            save_model = copy.deepcopy(actor_critic)
            save_agent = copy.deepcopy(agent)
            if args.cuda:
                save_model.cpu()
            optim_save = save_agent.optimizer.state_dict()

            # experimental:
            torch.save(
                {
                    'past_steps': step,
                    'model_state_dict': save_model.state_dict(),
                    'optimizer_state_dict': optim_save,
                    'ob_rms': ob_rms,
                    'args': args
                }, os.path.join(save_path, args.env_name + ".tar"))

        #save_model = [save_model,
        #              getattr(get_vec_normalize(envs), 'ob_rms', None)]

        #torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))
        #save_agent = copy.deepcopy(agent)

        #torch.save(save_agent, os.path.join(save_path, args.env_name + '_agent.pt'))
        #torch.save(actor_critic.state_dict(), os.path.join(save_path, args.env_name + "_weights.pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if not dist_entropy:
            dist_entropy = 0
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n \
dist entropy {:.1f}, val/act loss {:.1f}/{:.1f},".format(
                    j, total_num_steps,
                    int((total_num_steps -
                         past_steps * args.num_processes * args.num_steps) /
                        (end - start)), len(episode_rewards),
                    np.mean(episode_rewards), np.median(episode_rewards),
                    np.min(episode_rewards), np.max(episode_rewards),
                    dist_entropy, value_loss, action_loss))
            if args.curiosity:
                print("fwd/inv icm loss {:.1f}/{:.1f}\n".format(
                    fwd_loss, inv_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            if evaluator is None:
                evaluator = Evaluator(args, actor_critic, device)

            model = evaluator.actor_critic.base
            if args.model == 'fractal':
                n_cols = model.n_cols
                if args.rule == 'wide1' and args.n_recs > 3:
                    col_step = 3
                else:
                    col_step = 1
                col_idx = [-1, *range(0, n_cols, col_step)]
                for i in col_idx:
                    evaluator.evaluate(column=i)
            #num_eval_frames = (args.num_frames // (args.num_steps * args.eval_interval * args.num_processes)) * args.num_processes *  args.max_step
            # making sure the evaluator plots the '-1'st column (the overall net)
                win_eval = visdom_plot(viz,
                                       win_eval,
                                       evaluator.eval_log_dir,
                                       graph_name,
                                       args.algo,
                                       args.num_frames,
                                       n_graphs=col_idx)
            elif args.model == 'fixed' and model.RAND:
                for i in model.eval_recs:
                    evaluator.evaluate(num_recursions=i)
                win_eval = visdom_plot(viz,
                                       win_eval,
                                       evaluator.eval_log_dir,
                                       graph_name,
                                       args.algo,
                                       args.num_frames,
                                       n_graphs=model.eval_recs)
            else:
                evaluator.evaluate(column=None)
                win_eval = visdom_plot(
                    viz, win_eval, evaluator.eval_log_dir, graph_name,
                    args.algo, args.num_frames * 20 /
                    (args.eval_interval * args.num_steps))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, graph_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass