Example #1
def load_results(training_dir):
    if not os.path.exists(training_dir):
        logger.error('Training directory %s not found', training_dir)
        return

    manifests = detect_training_manifests(training_dir)
    if not manifests:
        logger.error('No manifests found in training directory %s', training_dir)
        return

    logger.debug('Loading data from manifests %s', ', '.join(manifests))

    # Load up stats + video files
    stats_files = []
    videos = []
    env_infos = []

    for manifest in manifests:
        with open(manifest) as f:
            contents = json.load(f)
            # Make these paths absolute again
            stats_files.append(os.path.join(training_dir, contents['stats']))
            videos += [(os.path.join(training_dir, v), os.path.join(training_dir, m))
                       for v, m in contents['videos']]
            env_infos.append(contents['env_info'])

    env_info = collapse_env_infos(env_infos, training_dir)
    data_sources, initial_reset_timestamps, timestamps, episode_lengths, episode_rewards, episode_types, initial_reset_timestamp = merge_stats_files(stats_files)

    return {
        'manifests': manifests,
        'env_info': env_info,
        'data_sources': data_sources,
        'timestamps': timestamps,
        'episode_lengths': episode_lengths,
        'episode_rewards': episode_rewards,
        'episode_types': episode_types,
        'initial_reset_timestamps': initial_reset_timestamps,
        'initial_reset_timestamp': initial_reset_timestamp,
        'videos': videos,
    }
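A minimal usage sketch (hedged: the directory path is hypothetical, and episode_rewards is assumed to be a flat list of floats):

# Hedged sketch: load monitored results and print a quick summary.
results = load_results('/tmp/agent-training-run')  # hypothetical directory
if results is None:
    print('no results to summarize')
else:
    rewards = results['episode_rewards']
    print('episodes:', len(rewards))
    print('mean reward:', sum(rewards) / max(len(rewards), 1))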
Example #2
    def __init__(self, seq, collision_penalty=-2, trap_penalty=0.5):
        """Initializes the lattice

        Parameters
        ----------
        seq : str, must only consist of 'H' or 'P'
            Sequence containing the polymer chain.
        collision_penalty : int, must be a negative value
            Penalty incurred when the agent made an invalid action.
            Default is -2.
        trap_penalty : float, must be between 0 and 1
            Penalty incurred when the agent is trapped. Actual value is
            computed as :code:`floor(length_of_sequence * trap_penalty)`.
            Default is 0.5.

        Raises
        ------
        ValueError
            If a certain polymer is not 'H' or 'P'.
        """
        try:
            if not set(seq.upper()) <= set('HP'):
                raise ValueError("%r (%s) is an invalid sequence" %
                                 (seq, type(seq)))
            self.seq = seq.upper()
        except AttributeError:
            logger.error("%r (%s) must be of type 'str'" % (seq, type(seq)))
            raise

        try:
            if collision_penalty >= 0:
                raise ValueError("%r (%s) must be negative" %
                                 (collision_penalty, type(collision_penalty)))
            if not isinstance(collision_penalty, int):
                raise ValueError("%r (%s) must be of type 'int'" %
                                 (collision_penalty, type(collision_penalty)))
            self.collision_penalty = collision_penalty
        except TypeError:
            logger.error("%r (%s) must be of type 'int'" %
                         (collision_penalty, type(collision_penalty)))
            raise

        try:
            if not 0 < trap_penalty < 1:
                raise ValueError("%r (%s) must be between 0 and 1" %
                                 (trap_penalty, type(trap_penalty)))
            self.trap_penalty = trap_penalty
        except TypeError:
            logger.error("%r (%s) must be of type 'float'" %
                         (trap_penalty, type(trap_penalty)))
            raise

        # Grid attributes
        self.grid_length = int(2 * (len(seq) + 1))
        self.midpoint = (int(len(seq)), int(len(seq)))
        self.grid = np.zeros(shape=(self.grid_length, self.grid_length),
                             dtype=int)

        # Automatically assign first element into grid
        self.grid[self.midpoint] = POLY_TO_INT[self.seq[0]]

        res_0 = Residue(0, self.seq[0], (0, 0))
        # Define action-observation spaces
        self.action_space = spaces.Discrete(4)
        self.observation_space = spaces.Box(low=-2,
                                            high=1,
                                            shape=(self.grid_length,
                                                   self.grid_length),
                                            dtype=int)
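A short construction sketch for the class above (hedged: the class name Lattice2DEnv is an assumption inferred from the docstring; any 'H'/'P' string is valid input):

# Hedged sketch: constructing the 2D lattice environment defined above.
env = Lattice2DEnv('HHPPHH')  # hypothetical class name
print(env.action_space.n)     # 4 actions: left, down, up, right
print(env.grid.shape)         # (2 * (len(seq) + 1), 2 * (len(seq) + 1))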
Example #3
    def close(self):
        self.proc.stdin.close()
        ret = self.proc.wait()
        if ret != 0:
            logger.error("VideoRecorder encoder exited with status {}".format(ret))
Example #4
def error(msg, *args):
    logger.error('ue4ml: ' + msg, *args)
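Usage matches logger.error, printf-style, with the 'ue4ml: ' prefix prepended (the arguments below are illustrative only):

# Hedged sketch: args pass straight through to logger.error's %-formatting.
error('connection to %s failed after %d retries', 'localhost', 3)
# -> logs "ue4ml: connection to localhost failed after 3 retries"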
Example #5
    def step(self, action):
        """Updates the current chain with the specified action.

        The action supplied by the agent should be an integer from 0
        to 3. In this case:
            - 0 : left
            - 1 : down
            - 2 : up
            - 3 : right
        The best way to remember this is to note that they are similar to the
        'h', 'j', 'k', and 'l' keys in vim.

        This method returns a tuple of values following the OpenAI Gym
        convention, that is, :code:`(observations, reward, done, info)`.

        The observations are arranged as a :code:`numpy.ndarray` matrix,
        more suitable for agents built using convolutional neural networks.
        The 'H's are represented as :code:`1`s whereas the 'P's are
        represented as :code:`-1`s. The actual chain, that is, the
        :code:`OrderedDict` rather than its grid-like representation, can
        be accessed from :code:`info['state_chain']`.

        The reward is calculated at the end of every episode, that is, when
        the length of the chain is equal to the length of the input sequence.

        Parameters
        ----------
        action : int, {0, 1, 2, 3}
            Specifies the position where the next polymer will be placed
            relative to the previous one:
                - 0 : left
                - 1 : down
                - 2 : up
                - 3 : right

        Returns
        -------
        numpy.ndarray
            Current state of the lattice.
        int or None
            Reward for the current episode.
        bool
            Control signal when the episode ends.
        dict
            Additional information regarding the environment.

        Raises
        ------
        ValueError
            When the specified action is invalid.
        IndexError
            When :code:`step()` is called even though the done signal
            is already :code:`True`.
        """
        if not self.action_space.contains(action):
            raise ValueError("%r (%s) invalid" % (action, type(action)))

        self.last_action = action
        is_trapped = False  # Trap signal
        collision = False  # Collision signal
        # Obtain coordinate of previous polymer
        x, y = next(reversed(self.state))
        # Get all adjacent coords and next move based on action
        adj_coords = self._get_adjacent_coords((x, y))
        next_move = adj_coords[action]
        # Detects for collision or traps in the given coordinate
        idx = len(self.state)
        if set(adj_coords.values()).issubset(self.state.keys()):
            logger.warning('Your agent was trapped! Ending the episode.')
            self.trapped += 1
            is_trapped = True
        elif next_move in self.state:
            self.collisions += 1
            collision = True
        else:
            self.actions.append(action)
            try:
                self.state.update({next_move: self.seq[idx]})
            except IndexError:
                logger.error(
                    'All molecules have been placed! Nothing can be added to the protein chain.'
                )
                raise

        # Set-up return values
        grid = self._draw_grid(self.state)
        done = (len(self.state) == len(self.seq)) or is_trapped
        reward = self._compute_reward(is_trapped, collision, done)
        info = {
            'chain_length': len(self.state),
            'seq_length': len(self.seq),
            'collisions': self.collisions,
            'actions': [ACTION_TO_STR[i] for i in self.actions],
            'is_trapped': is_trapped,
            'state_chain': self.state
        }

        return (grid, reward, done, info)
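A short random-rollout sketch against the step() API above (hedged: the class name Lattice2DEnv and the reset() method are assumptions based on the Gym convention the docstring describes):

# Hedged sketch: one random episode on the lattice environment.
env = Lattice2DEnv('HHPPHH')  # hypothetical class name
obs, done = env.reset(), False
while not done:
    action = env.action_space.sample()  # integer in {0, 1, 2, 3}
    obs, reward, done, info = env.step(action)
print(info['actions'], reward)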
Example #6
            else:
                envs.close()
            logger.info("Killed envs.")
        except UnboundLocalError:
            logger.info("No envs to kill!")


if is_interactive() and __name__ == '__main__':
    assert LOG_DIR, 'log dir cannot be empty'
    os.makedirs(LOG_DIR, exist_ok=True)
    subprocess.call("rm -rf {}/*".format(LOG_DIR), shell=True)
    ex.observers.append(FileStorageObserverWithExUuid.create(LOG_DIR))
    ex.run_commandline('run_config with \
            uuid="gibson_random" \
            cfg.env.num_processes=1 \
            ')
elif __name__ == '__main__':
    assert LOG_DIR, 'log dir cannot be empty'
    os.makedirs(LOG_DIR, exist_ok=True)
    subprocess.call("rm -rf {}/*".format(LOG_DIR), shell=True)
    ex.observers.append(FileStorageObserverWithExUuid.create(LOG_DIR))
    try:
        ex.run_commandline()
    except FileNotFoundError as e:
        logger.error(
            f'File not found! Are you trying to test an experiment with the uuid: {e}?'
        )
        raise e
else:
    logger.info(__name__)
Example #7
    def close(self):
        """Closes the Image encoder."""
        self.proc.stdin.close()
        ret = self.proc.wait()
        if ret != 0:
            logger.error(f"VideoRecorder encoder exited with status {ret}")
Example #8
def run_training(cfg, uuid, override=None):
    override = override if override is not None else {}  # avoid a shared mutable default
    try:
        logger.info("-------------\nStarting with configuration:\n" + pprint.pformat(cfg))
        logger.info("UUID: " + uuid)
        torch.set_num_threads(1)
        set_seed(cfg['training']['seed'])

        # get new output_dir name (use for checkpoints)
        old_log_dir = cfg['saving']['log_dir']
        changed_log_dir = False
        existing_log_paths = []
        if os.path.exists(old_log_dir) and cfg['saving']['autofix_log_dir']:
            LOG_DIR, existing_log_paths = evkit.utils.logging.unused_dir_name(old_log_dir)
            os.makedirs(LOG_DIR, exist_ok=False)
            cfg['saving']['log_dir'] = LOG_DIR
            cfg['saving']['results_log_file'] = os.path.join(LOG_DIR, 'result_log.pkl')
            cfg['saving']['reward_log_file'] = os.path.join(LOG_DIR, 'rewards.pkl')
            cfg['saving']['visdom_log_file'] = os.path.join(LOG_DIR, 'visdom_logs.json')
            changed_log_dir = True

        # Load checkpoint, config, agent
        agent = None

        if cfg['training']['resumable']:
            if cfg['saving']['checkpoint']:
                prev_run_path = cfg['saving']['checkpoint']
                if cfg['saving']['checkpoint_num'] is None:
                    ckpt_fpath = os.path.join(prev_run_path, 'checkpoints', 'ckpt-latest.dat')
                else:
                    ckpt_fpath = os.path.join(prev_run_path, 'checkpoints', f"ckpt-{cfg['saving']['checkpoint_num']}.dat")
                if cfg['saving']['checkpoint_configs']:  # update configs with values from ckpt
                    prev_run_metadata_paths = [os.path.join(prev_run_path, f)
                                               for f in os.listdir(prev_run_path)
                                               if f.endswith('metadata')]
                    prev_run_config_path = os.path.join(prev_run_metadata_paths[0], 'config.json')
                    with open(prev_run_config_path) as f:
                        config = json.load(f)  # keys are ['cfg', 'uuid', 'seed']
                    true_log_dir = cfg['saving']['log_dir']
                    cfg = update_dict_deepcopy(cfg, config['cfg'])
                    uuid = config['uuid']
                    logger.warning("Reusing config from {}".format(prev_run_config_path))
                    # the saving files should always use the new log dir
                    cfg['saving']['log_dir'] = true_log_dir
                    cfg['saving']['results_log_file'] = os.path.join(true_log_dir, 'result_log.pkl')
                    cfg['saving']['reward_log_file'] = os.path.join(true_log_dir, 'rewards.pkl')
                    cfg['saving']['visdom_log_file'] = os.path.join(true_log_dir, 'visdom_logs.json')
                if ckpt_fpath is not None and os.path.exists(ckpt_fpath):
                    checkpoint_obj = torch.load(ckpt_fpath)
                    start_epoch = checkpoint_obj['epoch']
                    logger.info("Loaded learner (epoch {}) from {}".format(start_epoch, ckpt_fpath))
                    if cfg['learner']['algo'] == 'imitation_learning':
                        actor_critic = checkpoint_obj['model']
                        try:
                            actor_critic = actor_critic.module  # remove DataParallel
                        except AttributeError:
                            pass
                    else:
                        agent = checkpoint_obj['agent']
                        actor_critic = agent.actor_critic
                else:
                    logger.warning("No checkpoint found at {}".format(ckpt_fpath))
        cfg = update_dict_deepcopy(cfg, override)
        logger.info("-------------\n Running with configuration:\n" + pprint.pformat(cfg))

        # Verify configs are consistent - baked version needs to match un-baked version
        try:
            taskonomy_transform = cfg['env']['transform_fn_post_aggregation_kwargs']['names_to_transforms']['taskonomy']
            taskonomy_encoder = cfg['learner']['perception_network_kwargs']['extra_kwargs']['sidetune_kwargs']['base_weights_path']
            assert taskonomy_encoder in taskonomy_transform, f'Taskonomy PostTransform and perception network base need to match. {taskonomy_encoder} != {taskonomy_transform}'
        except KeyError:
            pass

        if cfg['training']['gpu_devices'] is None:
            cfg['training']['gpu_devices'] = list(range(torch.cuda.device_count()))
        assert not (len(cfg['training']['gpu_devices']) > 1 and 'attributes' in cfg['learner']['cache_kwargs']), 'Cannot utilize cache with more than one model GPU'

        # Make environment
        simulator, scenario = cfg['env']['env_name'].split('_')

        transform_pre_aggregation = None
        if cfg['env']['transform_fn_pre_aggregation'] is not None:
            logger.warning('Using deprecated config transform_fn_pre_aggregation')
            transform_pre_aggregation = eval(cfg['env']['transform_fn_pre_aggregation'].replace("---", "'"))
        elif 'transform_fn_pre_aggregation_fn' in cfg['env'] and cfg['env'][
            'transform_fn_pre_aggregation_fn'] is not None:
            pre_aggregation_kwargs = copy.deepcopy(cfg['env']['transform_fn_pre_aggregation_kwargs'])
            transform_pre_aggregation = eval(cfg['env']['transform_fn_pre_aggregation_fn'].replace("---", "'"))(
                **eval_dict_values(pre_aggregation_kwargs))

        if 'debug_mode' in cfg['env']['env_specific_kwargs'] and cfg['env']['env_specific_kwargs']['debug_mode']:
            assert cfg['env']['num_processes'] == 1, 'Using debug mode requires you to only use one process'

        envs = EnvFactory.vectorized(
            cfg['env']['env_name'],
            cfg['training']['seed'],
            cfg['env']['num_processes'],
            cfg['saving']['log_dir'],
            cfg['env']['add_timestep'],
            env_specific_kwargs=cfg['env']['env_specific_kwargs'],
            num_val_processes=cfg['env']['num_val_processes'],
            preprocessing_fn=transform_pre_aggregation,
            addl_repeat_count=cfg['env']['additional_repeat_count'],
            sensors=cfg['env']['sensors'],
            vis_interval=cfg['saving']['vis_interval'],
            visdom_server=cfg['saving']['visdom_server'],
            visdom_port=cfg['saving']['visdom_port'],
            visdom_log_file=cfg['saving']['visdom_log_file'],
            visdom_name=uuid)

        transform_post_aggregation = None
        if 'transform_fn_post_aggregation' in cfg['env'] and cfg['env']['transform_fn_post_aggregation'] is not None:
            logger.warning('Using deprecated config transform_fn_post_aggregation')
            transform_post_aggregation = eval(cfg['env']['transform_fn_post_aggregation'].replace("---", "'"))
        elif 'transform_fn_post_aggregation_fn' in cfg['env'] and cfg['env'][
            'transform_fn_post_aggregation_fn'] is not None:
            post_aggregation_kwargs = copy.deepcopy(cfg['env']['transform_fn_post_aggregation_kwargs'])
            transform_post_aggregation = eval(cfg['env']['transform_fn_post_aggregation_fn'].replace("---", "'"))(
                **eval_dict_values(post_aggregation_kwargs))

        if transform_post_aggregation is not None:
            transform, space = transform_post_aggregation(envs.observation_space)
            envs = ProcessObservationWrapper(envs, transform, space)

        action_space = envs.action_space
        observation_space = envs.observation_space
        retained_obs_shape = {k: v.shape
                              for k, v in observation_space.spaces.items()
                              if k in cfg['env']['sensors']}
        logger.info(f"Action space: {action_space}")
        logger.info(f"Observation space: {observation_space}")
        logger.info(
            "Retaining: {}".format(set(observation_space.spaces.keys()).intersection(cfg['env']['sensors'].keys())))

        # Finish setting up the agent
        if agent is None and cfg['learner']['algo'] == 'ppo':
            perception_model = eval(cfg['learner']['perception_network'])(
                cfg['learner']['num_stack'],
                **cfg['learner']['perception_network_kwargs'])
            base = NaivelyRecurrentACModule(
                perception_unit=perception_model,
                use_gru=cfg['learner']['recurrent_policy'],
                internal_state_size=cfg['learner']['internal_state_size'])
            actor_critic = PolicyWithBase(
                base, action_space,
                num_stacks=cfg['learner']['num_stack'],
                takeover=None,
                loss_kwargs=cfg['learner']['loss_kwargs'],
                gpu_devices=cfg['training']['gpu_devices'],
            )
            if cfg['learner']['use_replay']:
                agent = evkit.rl.algo.PPOReplay(actor_critic,
                                                cfg['learner']['clip_param'],
                                                cfg['learner']['ppo_epoch'],
                                                cfg['learner']['num_mini_batch'],
                                                cfg['learner']['value_loss_coef'],
                                                cfg['learner']['entropy_coef'],
                                                cfg['learner']['on_policy_epoch'],
                                                cfg['learner']['off_policy_epoch'],
                                                cfg['learner']['num_steps'],
                                                cfg['learner']['num_stack'],
                                                lr=cfg['learner']['lr'],
                                                eps=cfg['learner']['eps'],
                                                max_grad_norm=cfg['learner']['max_grad_norm'],
                                                gpu_devices=cfg['training']['gpu_devices'],
                                                loss_kwargs=cfg['learner']['loss_kwargs'],
                                                cache_kwargs=cfg['learner']['cache_kwargs'],
                                                optimizer_class=cfg['learner']['optimizer_class'],
                                                optimizer_kwargs=cfg['learner']['optimizer_kwargs']
                )
            else:
                agent = evkit.rl.algo.PPO(actor_critic,
                                          cfg['learner']['clip_param'],
                                          cfg['learner']['ppo_epoch'],
                                          cfg['learner']['num_mini_batch'],
                                          cfg['learner']['value_loss_coef'],
                                          cfg['learner']['entropy_coef'],
                                          lr=cfg['learner']['lr'],
                                          eps=cfg['learner']['eps'],
                                          max_grad_norm=cfg['learner']['max_grad_norm']
                )
            start_epoch = 0

            # Set up data parallel
            if torch.cuda.device_count() > 1 and (cfg['training']['gpu_devices'] is None or len(cfg['training']['gpu_devices']) > 1):
                actor_critic.data_parallel(cfg['training']['gpu_devices'])

        elif agent is None and cfg['learner']['algo'] == 'slam':
            assert cfg['learner']['slam_class'] is not None, 'Must define SLAM agent class'
            actor_critic = eval(cfg['learner']['slam_class'])(**cfg['learner']['slam_kwargs'])
            start_epoch = 0

        elif cfg['learner']['algo'] == 'expert':
            actor_critic = eval(cfg['learner']['algo_class'])(**cfg['learner']['algo_kwargs'])
            start_epoch = 0

        if cfg['learner']['algo'] == 'expert':
            assert 'debug_mode' in cfg['env']['env_specific_kwargs'] and cfg['env']['env_specific_kwargs']['debug_mode'], 'need to use debug mode with expert algo'

        if cfg['learner']['perception_network_reinit'] and cfg['learner']['algo'] == 'ppo':
            logger.info('Reinit perception network, use with caution')
            # do not reset map_tower and other parts of the TaskonomyFeaturesOnlyNetwork
            old_perception_unit = actor_critic.base.perception_unit
            new_perception_unit = eval(cfg['learner']['perception_network'])(
                cfg['learner']['num_stack'],
                **cfg['learner']['perception_network_kwargs'])
            new_perception_unit.main_perception = old_perception_unit  # main perception does not change
            actor_critic.base.perception_unit = new_perception_unit  # only x['taskonomy'] changes

            # match important configs of old model
            if (actor_critic.gpu_devices is None or len(actor_critic.gpu_devices) == 1) and len(cfg['training']['gpu_devices']) > 1:
                actor_critic.data_parallel(cfg['training']['gpu_devices'])
            actor_critic.gpu_devices = cfg['training']['gpu_devices']
            agent.gpu_devices = cfg['training']['gpu_devices']

        # Machinery for storing rollouts
        num_train_processes = cfg['env']['num_processes'] - cfg['env']['num_val_processes']
        num_val_processes = cfg['env']['num_val_processes']
        assert cfg['learner']['test'] or (cfg['env']['num_val_processes'] < cfg['env']['num_processes']), \
            "Can't train without some training processes!"
        current_obs = StackedSensorDictStorage(cfg['env']['num_processes'], cfg['learner']['num_stack'],
                                               retained_obs_shape)
        if not cfg['learner']['test']:
            current_train_obs = StackedSensorDictStorage(num_train_processes, cfg['learner']['num_stack'],
                                                         retained_obs_shape)
        logger.debug(f'Stacked obs shape {current_obs.obs_shape}')

        if cfg['learner']['use_replay'] and not cfg['learner']['test']:
            rollouts = RolloutSensorDictReplayBuffer(
                cfg['learner']['num_steps'],
                num_train_processes,
                current_obs.obs_shape,
                action_space,
                cfg['learner']['internal_state_size'],
                actor_critic,
                cfg['learner']['use_gae'],
                cfg['learner']['gamma'],
                cfg['learner']['tau'],
                cfg['learner']['replay_buffer_size'],
                batch_multiplier=cfg['learner']['rollout_value_batch_multiplier']
            )
        else:
            rollouts = RolloutSensorDictStorage(
                cfg['learner']['num_steps'],
                num_train_processes,
                current_obs.obs_shape,
                action_space,
                cfg['learner']['internal_state_size'])

        # Set up logging
        if cfg['saving']['logging_type'] == 'visdom':
            mlog = tnt.logger.VisdomMeterLogger(
                title=uuid, env=uuid,
                server=cfg['saving']['visdom_server'],
                port=cfg['saving']['visdom_port'],
                log_to_filename=cfg['saving']['visdom_log_file'])
        elif cfg['saving']['logging_type'] == 'tensorboard':
            mlog = tnt.logger.TensorboardMeterLogger(
                env=uuid,
                log_dir=cfg['saving']['log_dir'],
                plotstylecombined=True)
        else:
            raise NotImplementedError(f"Unknown logger type: {cfg['saving']['logging_type']}")

        # Add metrics and logging to TB/Visdom
        loggable_metrics = ['metrics/rewards',
                            'diagnostics/dist_perplexity',
                            'diagnostics/lengths',
                            'diagnostics/max_importance_weight',
                            'diagnostics/value',
                            'losses/action_loss',
                            'losses/dist_entropy',
                            'losses/value_loss',
                            'introspect/alpha']
        if 'intrinsic_loss_types' in cfg['learner']['loss_kwargs']:
            for iloss in cfg['learner']['loss_kwargs']['intrinsic_loss_types']:
                loggable_metrics.append(f"losses/{iloss}")
        core_metrics = ['metrics/rewards', 'diagnostics/lengths']
        debug_metrics = ['debug/input_images']
        if 'habitat' in cfg['env']['env_name'].lower():
            for metric in ['metrics/collisions', 'metrics/spl', 'metrics/success']:
                loggable_metrics.append(metric)
                core_metrics.append(metric)
        for meter in loggable_metrics:
            mlog.add_meter(meter, tnt.meter.ValueSummaryMeter())
        for debug_meter in debug_metrics:
            mlog.add_meter(debug_meter, tnt.meter.SingletonMeter(), ptype='image')
        try:
            for attr in cfg['learner']['perception_network_kwargs']['extra_kwargs']['attrs_to_remember']:
                mlog.add_meter(f'diagnostics/{attr}', tnt.meter.ValueSummaryMeter(), ptype='histogram')
        except KeyError:
            pass

        mlog.add_meter('config', tnt.meter.SingletonMeter(), ptype='text')
        mlog.update_meter(cfg_to_md(cfg, uuid), meters={'config'}, phase='train')

        # File loggers
        flog = tnt.logger.FileLogger(cfg['saving']['results_log_file'], overwrite=True)
        try:
            flog_keys_to_remove = [f'diagnostics/{k}' for k in cfg['learner']['perception_network_kwargs']['extra_kwargs']['attrs_to_remember']]
        except KeyError:
            warnings.warn('Unable to find flog keys to remove')
            flog_keys_to_remove = []
        reward_only_flog = tnt.logger.FileLogger(cfg['saving']['reward_log_file'], overwrite=True)

        # replay data to mlog, move metadata file
        if changed_log_dir:
            evkit.utils.logging.replay_logs(existing_log_paths, mlog)
            evkit.utils.logging.move_metadata_file(old_log_dir, cfg['saving']['log_dir'], uuid)

        ##########
        # LEARN! #
        ##########
        if cfg['training']['cuda']:
            if not cfg['learner']['test']:
                current_train_obs = current_train_obs.cuda(device=cfg['training']['gpu_devices'][0])
            current_obs = current_obs.cuda(device=cfg['training']['gpu_devices'][0])
            # rollouts.cuda(device=cfg['training']['gpu_devices'][0])  # rollout should be on RAM
            try:
                actor_critic.cuda(device=cfg['training']['gpu_devices'][0])
            except UnboundLocalError as e:
                logger.error(f'Cannot put actor critic on cuda. Are you using a checkpoint and is it being found/initialized properly? {e}')
                raise e

        # These variables are used to compute average rewards for all processes.
        episode_rewards = torch.zeros([cfg['env']['num_processes'], 1])
        episode_lengths = torch.zeros([cfg['env']['num_processes'], 1])
        episode_tracker = evkit.utils.logging.EpisodeTracker(cfg['env']['num_processes'])
        if cfg['learner']['test']:
            all_episodes = []
            actor_critic.eval()
            try:
                actor_critic.base.perception_unit.sidetuner.attrs_to_remember = []
            except AttributeError:
                pass

        # First observation
        obs = envs.reset()
        current_obs.insert(obs)
        mask_done = torch.FloatTensor([[0.0] for _ in range(cfg['env']['num_processes'])]).cuda(device=cfg['training']['gpu_devices'][0], non_blocking=True)
        states = torch.zeros(cfg['env']['num_processes'], cfg['learner']['internal_state_size']).cuda(device=cfg['training']['gpu_devices'][0], non_blocking=True)
        try:
            actor_critic.reset(envs=envs)
        except TypeError:
            actor_critic.reset()  # this reset() may not accept an envs kwarg

        # Main loop
        start_time = time.time()
        n_episodes_completed = 0
        num_updates = int(cfg['training']['num_frames']) // (cfg['learner']['num_steps'] * cfg['env']['num_processes'])
        if cfg['learner']['test']:
            logger.info(f"Running {cfg['learner']['test_k_episodes']}")
        else:
            logger.info(f"Running until num updates == {num_updates}")
        for j in range(start_epoch, num_updates, 1):
            for step in range(cfg['learner']['num_steps']):
                obs_unpacked = {k: current_obs.peek()[k].peek() for k in current_obs.peek()}
                if j == start_epoch and step < 10:
                    log_input_images(obs_unpacked, mlog, num_stack=cfg['learner']['num_stack'],
                                     key_names=['rgb_filled', 'map'], meter_name='debug/input_images', step_num=step)

                # Sample actions
                with torch.no_grad():
                    # value, action, action_log_prob, states = actor_critic.act(
                    #     {k:v.cuda(device=cfg['training']['gpu_devices'][0]) for k, v in obs_unpacked.items()},
                    #     states.cuda(device=cfg['training']['gpu_devices'][0]),
                    #     mask_done.cuda(device=cfg['training']['gpu_devices'][0]))
                    # All should already be on training.gpu_devices[0]
                    value, action, action_log_prob, states = actor_critic.act(
                        obs_unpacked, states, mask_done, cfg['learner']['deterministic'])
                cpu_actions = list(action.squeeze(1).cpu().numpy())
                obs, reward, done, info = envs.step(cpu_actions)
                mask_done_cpu = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
                mask_done = mask_done_cpu.cuda(device=cfg['training']['gpu_devices'][0], non_blocking=True)
                reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
                episode_tracker.append(obs, cpu_actions)

                # log diagnostics
                if cfg['learner']['test']:
                    try:
                        mlog.update_meter(actor_critic.perplexity.cpu(), meters={'diagnostics/dist_perplexity'}, phase='val')
                        mlog.update_meter(actor_critic.entropy.cpu(), meters={'losses/dist_entropy'}, phase='val')
                        mlog.update_meter(value.cpu(), meters={'diagnostics/value'}, phase='val')
                    except AttributeError:
                        pass


                # Handle terminated episodes; logging values and computing the "done" mask
                episode_rewards += reward
                episode_lengths += (1 + cfg['env']['additional_repeat_count'])
                for i, (r, l, done_) in enumerate(zip(episode_rewards, episode_lengths, done)):  # Logging loop
                    if done_:
                        n_episodes_completed += 1
                        if cfg['learner']['test']:
                            info[i]['reward'] = r.item()
                            info[i]['length'] = l.item()
                            if 'debug_mode' in cfg['env']['env_specific_kwargs'] and cfg['env']['env_specific_kwargs']['debug_mode']:
                                info[i]['scene_id'] = envs.env.env.env._env.current_episode.scene_id
                                info[i]['episode_id'] = envs.env.env.env._env.current_episode.episode_id
                            all_episodes.append({
                                'info': info[i],
                                'history': episode_tracker.episodes[i][:-1]})
                        episode_tracker.clear_episode(i)
                        phase = 'train' if i < num_train_processes else 'val'
                        mlog.update_meter(r.item(), meters={'metrics/rewards'}, phase=phase)
                        mlog.update_meter(l.item(), meters={'diagnostics/lengths'}, phase=phase)
                        if 'habitat' in cfg['env']['env_name'].lower():
                            mlog.update_meter(info[i]["collisions"], meters={'metrics/collisions'}, phase=phase)
                            if scenario == 'PointNav':
                                mlog.update_meter(info[i]["spl"], meters={'metrics/spl'}, phase=phase)
                                mlog.update_meter(info[i]["success"], meters={'metrics/success'}, phase=phase)

                        # reset env then agent... note this only works for single process
                        if 'debug_mode' in cfg['env']['env_specific_kwargs'] and cfg['env']['env_specific_kwargs']['debug_mode']:
                            obs = envs.reset()
                        try:
                            actor_critic.reset(envs=envs)
                        except TypeError:
                            actor_critic.reset()  # this reset() may not accept an envs kwarg
                episode_rewards *= mask_done_cpu
                episode_lengths *= mask_done_cpu

                # Insert the new observation into RolloutStorage
                current_obs.insert(obs, mask_done)
                if not cfg['learner']['test']:
                    for k in obs:
                        if k in current_train_obs.sensor_names:
                            current_train_obs[k].insert(obs[k][:num_train_processes], mask_done[:num_train_processes])
                    rollouts.insert(current_train_obs.peek(),
                                    states[:num_train_processes],
                                    action[:num_train_processes],
                                    action_log_prob[:num_train_processes],
                                    value[:num_train_processes],
                                    reward[:num_train_processes],
                                    mask_done[:num_train_processes])
                    mlog.update_meter(value[:num_train_processes].mean().item(), meters={'diagnostics/value'},
                                      phase='train')

            # Training update
            if not cfg['learner']['test']:
                if not cfg['learner']['use_replay']:
                    # Moderate compute saving optimization (if no replay buffer):
                    #     Estimate future-discounted returns only once
                    with torch.no_grad():
                        next_value = actor_critic.get_value(rollouts.observations.at(-1),
                                                            rollouts.states[-1],
                                                            rollouts.masks[-1]).detach()
                    rollouts.compute_returns(next_value, cfg['learner']['use_gae'], cfg['learner']['gamma'],
                                             cfg['learner']['tau'])
                value_loss, action_loss, dist_entropy, max_importance_weight, info = agent.update(rollouts)
                rollouts.after_update()  # For the next iter: initial obs <- current observation

                # Update meters with latest training info
                mlog.update_meter(dist_entropy, meters={'losses/dist_entropy'})
                mlog.update_meter(np.exp(dist_entropy), meters={'diagnostics/dist_perplexity'})
                mlog.update_meter(value_loss, meters={'losses/value_loss'})
                mlog.update_meter(action_loss, meters={'losses/action_loss'})
                mlog.update_meter(max_importance_weight, meters={'diagnostics/max_importance_weight'})
                if 'intrinsic_loss_types' in cfg['learner']['loss_kwargs'] and len(cfg['learner']['loss_kwargs']['intrinsic_loss_types']) > 0:
                    for iloss in cfg['learner']['loss_kwargs']['intrinsic_loss_types']:
                        mlog.update_meter(info[iloss], meters={f'losses/{iloss}'})
                try:
                    for attr in cfg['learner']['perception_network_kwargs']['extra_kwargs']['attrs_to_remember']:
                        mlog.update_meter(info[attr].cpu(), meters={f'diagnostics/{attr}'})
                except KeyError:
                    pass

                try:
                    if hasattr(actor_critic, 'module'):
                        alpha = [param for name, param in actor_critic.module.named_parameters() if 'alpha' in name][0]
                    else:
                        alpha = [param for name, param in actor_critic.named_parameters() if 'alpha' in name][0]
                    mlog.update_meter(torch.sigmoid(alpha).detach().item(), meters={'introspect/alpha'})
                except IndexError:
                    pass

            # Main logging
            if j % cfg['saving']['log_interval'] == 0:
                torch.cuda.empty_cache()
                GPUtil.showUtilization()
                count_open()
                num_relevant_processes = num_val_processes if cfg['learner']['test'] else num_train_processes
                n_steps_since_logging = cfg['saving']['log_interval'] * num_relevant_processes * cfg['learner'][
                    'num_steps']
                total_num_steps = (j + 1) * num_relevant_processes * cfg['learner']['num_steps']

                logger.info("Update {}, num timesteps {}, FPS {}".format(
                    j + 1,
                    total_num_steps,
                    int(n_steps_since_logging / (time.time() - start_time))
                ))
                logger.info(f"Completed episodes: {n_episodes_completed}")
                viable_modes = ['val'] if cfg['learner']['test'] else ['train', 'val']
                for metric in core_metrics:  # Log to stdout
                    for mode in viable_modes:
                        if metric in core_metrics or mode == 'train':
                            mlog.print_meter(mode, total_num_steps, meterlist={metric})
                if not cfg['learner']['test']:
                    for mode in viable_modes:  # Log to files
                        results = mlog.peek_meter(phase=mode)
                        reward_only_flog.log(mode, {metric: results[metric] for metric in core_metrics})
                        if mode == 'train':
                            results_to_log = {}
                            results['step_num'] = j + 1
                            results_to_log['step_num'] = results['step_num']
                            for k,v in results.items():
                                if k in flog_keys_to_remove:
                                    warnings.warn(f'Removing {k} from results_log.pkl due to large size')
                                else:
                                    results_to_log[k] = v
                            flog.log('all_results', results_to_log)

                        mlog.reset_meter(total_num_steps, mode=mode)
                start_time = time.time()

            # Save checkpoint
            if not cfg['learner']['test'] and j % cfg['saving']['save_interval'] == 0:
                save_dir_absolute = os.path.join(cfg['saving']['log_dir'], cfg['saving']['save_dir'])
                save_checkpoint(
                    {'agent': agent, 'epoch': j},
                    save_dir_absolute, j)
            if 'test_k_episodes' in cfg['learner'] and n_episodes_completed >= cfg['learner']['test_k_episodes']:
                torch.save(all_episodes, os.path.join(cfg['saving']['log_dir'], 'validation.pth'))
                all_episodes = all_episodes[:cfg['learner']['test_k_episodes']]
                spl_mean = np.mean([episode['info']['spl'] for episode in all_episodes])
                success_mean = np.mean([episode['info']['success'] for episode in all_episodes])
                reward_mean = np.mean([episode['info']['reward'] for episode in all_episodes])
                logger.info('------------ done with testing -------------')
                logger.info(f'SPL: {spl_mean} --- Success: {success_mean} --- Reward: {reward_mean}')
                for metric in mlog.meter['val'].keys():
                    mlog.print_meter('val', -1, meterlist={metric})
                break

    # Clean up (either after ending normally or early [e.g. from a KeyboardInterrupt])
    finally:
        print(psutil.virtual_memory())
        GPUtil.showUtilization(all=True)
        try:
            logger.info("### Done - Killing envs.")
            if isinstance(envs, list):
                for env in envs:
                    env.close()
            else:
                envs.close()
            logger.info("Killed envs.")
        except UnboundLocalError:
            logger.info("No envs to kill!")
Example #9
    def step1(self, action, fail):
        # --action space: {'PathPattern': 1 , 'LeftHandAct':0,'RightHandAct':0}--
        if(action['PathPattern'] < 0 or action['PathPattern'] > 3) or \
            (action['LeftHandAct'] < 0 or action['LeftHandAct'] > 2) or \
                (action['RightHandAct'] < 0 or action['RightHandAct'] > 2):
            logger.error('Action is not defined!')
            return

        self.PathPattern = action['PathPattern']

        if action['LeftHandAct'] == 0:
            self.LeftArmSpeed = self.LeftArmSpeed
        elif action['LeftHandAct'] == 1:
            self.LeftArmSpeed = self.LeftArmSpeed - 1
        elif action['LeftHandAct'] == 2:
            self.LeftArmSpeed = self.LeftArmSpeed + 1

        if action['RightHandAct'] == 0:
            self.RightArmSpeed = self.RightArmSpeed
        elif action['RightHandAct'] == 1:
            self.RightArmSpeed = self.RightArmSpeed - 1
        elif action['RightHandAct'] == 2:
            self.RightArmSpeed = self.RightArmSpeed + 1

        # -----ori_obj, self.pos_x_object, self.pos_y_object = state_obj----
        state_obj = self.state_obj

        rotation_force = (self.LeftArmSpeed - self.RightArmSpeed)

        # Degree limitation for object rotation
        if (self.ori_object >= -self.rotation_limitation
                and self.ori_object <= self.rotation_limitation):
            self.ori_object = self.ori_object + rotation_force

        self.state_obj = (self.ori_object, self.pos_x_object,
                          self.pos_y_object)

        location = math.sqrt(
            math.pow(self.pos_x_object - self.Desired_pos_x, 2) +
            math.pow(self.pos_y_object - self.Desired_pos_y, 2))

        done = location <= 5

        self.step_count = self.step_count + 1
        reward_obj = 0

        if fail:
            # The keyboard just fell, or a key was just pressed
            reward_obj = -1
        elif not done:
            if abs(self.ori_object) < 1:
                reward_obj = 1
            elif self.prevous_step_orientation == self.ori_object:
                reward_obj = 0
            else:
                reward_obj = self.alfa * \
                    (1.0/(abs(self.prevous_step_orientation) - abs(self.ori_object)))
        elif self.steps_beyond_done_obj is None:
            self.steps_beyond_done_obj = 0
            if abs(self.ori_object) < 1:
                reward_obj = 1
            elif self.prevous_step_orientation == self.ori_object:
                reward_obj = 0
            else:
                reward_obj = self.alfa * \
                    (1.0/(abs(self.prevous_step_orientation) - abs(self.ori_object)))
                # + self.beta * (1.0 / (math.sqrt(math.pow(self.Desired_pos_x - self.pos_x_object, 2) +
                #                              math.pow(self.Desired_pos_y - self.pos_y_object, 2))) + 1)
        else:
            if self.steps_beyond_done_obj == 0:
                logger.warning(
                    "You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior."
                )
            self.steps_beyond_done_obj += 1
        self.prevous_step_orientation = self.ori_object
        return np.array(self.state_obj), reward_obj, done, {}
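A call sketch for the dict action space described in the comment at the top of step1 (hedged: env stands for an already-constructed instance of this environment, whose class definition is not shown here):

# Hedged sketch: step1 takes a dict action plus a 'fail' flag from the keyboard task.
action = {'PathPattern': 1, 'LeftHandAct': 0, 'RightHandAct': 2}
state, reward, done, info = env.step1(action, fail=False)  # 'env' assumed constructed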
Example #10
    def __init__(self, seq, collision_penalty=-2, trap_penalty=0.5, dp=False):
        """Initializes the lattice

        Parameters
        ----------
        seq : str, must only consist of 'H' or 'P'
            Sequence containing the polymer chain.
        collision_penalty : int, must be a negative value
            Penalty incurred when the agent made an invalid action.
            Default is -2.
        trap_penalty : float, must be between 0 and 1
            Penalty incurred when the agent is trapped. Actual value is
            computed as :code:`floor(length_of_sequence * trap_penalty)`.
            Default is 0.5.
        dp : bool
            If True, precompute the transition table :code:`P` used by
            dynamic-programming algorithms. Default is False.

        Raises
        ------
        ValueError
            If a certain polymer is not 'H' or 'P'.
        """
        try:
            if not set(seq.upper()) <= set('HP'):
                raise ValueError("%r (%s) is an invalid sequence" % (seq, type(seq)))
            self.seq = seq.upper()
        except AttributeError:
            logger.error("%r (%s) must be of type 'str'" % (seq, type(seq)))
            raise

        try:
            if len(seq) > 100:
                raise ValueError("%r (%s) must have length <= 100" % (seq, type(seq)))
        except AttributeError:
            logger.error("%r (%s) must be of type 'str'" % (seq, type(seq)))
            raise
            
        try:
            if collision_penalty >= 0:
                raise ValueError("%r (%s) must be negative" %
                                 (collision_penalty, type(collision_penalty)))
            if not isinstance(collision_penalty, int):
                raise ValueError("%r (%s) must be of type 'int'" %
                                 (collision_penalty, type(collision_penalty)))
            self.collision_penalty = collision_penalty
        except TypeError:
            logger.error("%r (%s) must be of type 'int'" %
                         (collision_penalty, type(collision_penalty)))
            raise

        try:
            if not 0 < trap_penalty < 1:
                raise ValueError("%r (%s) must be between 0 and 1" %
                                 (trap_penalty, type(trap_penalty)))
            self.trap_penalty = trap_penalty
        except TypeError:
            logger.error("%r (%s) must be of type 'float'" %
                         (trap_penalty, type(trap_penalty)))
            raise

        self.state = OrderedDict({(0, 0, 0) : self.seq[0]})
        self.actions = []
        self.collisions = 0
        self.trapped = 0

        # Grid attributes
        self.grid_length = 51  # maximum sequence length is 25
        self.midpoint = (25, 25, 25)
        self.grid = np.zeros(shape=(self.grid_length, self.grid_length, self.grid_length), dtype=int)

        # Automatically assign first element into grid
        self.grid[self.midpoint] = POLY_TO_INT[self.seq[0]]

        # Define action-observation spaces
        self.action_space = spaces.Discrete(6)
        self.observation_space = spaces.Box(low=-1, high=1,
                                            shape=(self.grid_length * self.grid_length * self.grid_length,),
                                            dtype=int)
        self.last_action = None
        
        if dp:
            # For DP Algorithms
            # P represents the transition probabilities of the environment
            # P[s][a] is a tuple (next_state, reward, done)
            # nS is the number of states
            # nA is the number of actions
            # Denote states by the actions taken to get there (left, straight, up)
            # Encode them as ternary numbers
            # Assume the first step is left
            self.nS = int((5 ** (len(self.seq) - 1) + 1) / 2)
            self.nA = 5
            self.P = [[(0, 0, False) for i in range(self.nA)] for j in range(self.nS)]

            self.states_dic = {}
            self.fill_P()
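Because P[s][a] stores deterministic (next_state, reward, done) triples, a textbook value-iteration sweep over it is straightforward. A minimal sketch (hedged: gamma and the tolerance are arbitrary choices, and fill_P is assumed to have populated P):

import numpy as np

# Hedged sketch: value iteration over the deterministic table P[s][a].
def value_iteration(P, nS, gamma=0.9, tol=1e-6):
    V = np.zeros(nS)
    while True:
        delta = 0.0
        for s in range(nS):
            # Each entry is (next_state, reward, done); terminal moves add no future value.
            q = [r + (0.0 if done else gamma * V[ns]) for ns, r, done in P[s]]
            best = max(q)
            delta = max(delta, abs(best - V[s]))
            V[s] = best
        if delta < tol:
            return V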
Example #11
    def __init__(self, p, collision_penalty=-2, trap_penalty=0.5):
        """Initializes the lattice

        Parameters
        ----------
        p : list of int
            Sequence containing the maximum length of each interpolator.
        collision_penalty : int, must be a negative value
            Penalty incurred when the agent made an invalid action.
            Default is -2.
        trap_penalty : float, must be between 0 and 1
            Penalty incurred when the agent is trapped. Actual value is
            computed as :code:`floor(length_of_sequence * trap_penalty)`.
            Default is 0.5.
        """
     
        try:
            if collision_penalty >= 0:
                raise ValueError("%r (%s) must be negative" %
                                 (collision_penalty, type(collision_penalty)))
            if not isinstance(collision_penalty, int):
                raise ValueError("%r (%s) must be of type 'int'" %
                                 (collision_penalty, type(collision_penalty)))
            self.collision_penalty = collision_penalty
        except TypeError:
            logger.error("%r (%s) must be of type 'int'" %
                         (collision_penalty, type(collision_penalty)))
            raise

        try:
            if not 0 < trap_penalty < 1:
                raise ValueError("%r (%s) must be between 0 and 1" %
                                 (trap_penalty, type(trap_penalty)))
            self.trap_penalty = trap_penalty
        except TypeError:
            logger.error("%r (%s) must be of type 'float'" %
                         (trap_penalty, type(trap_penalty)))
            raise
     
     
        self.state = [(0, 0)]
        self.master_state = [(0, 0)]
        self.actions = []
        self.origin = (0, 0)
        self.op_counts = 0
        self.is_looped = False

        # here p is a list with the number of Xs allowed for each operator
        self.p = p

        self.seq = ['x'] * p[self.op_counts]

        # Grid attributes
        self.grid_length = 2 * len(self.seq) + 1
        self.midpoint = (len(self.seq), len(self.seq))
        self.grid = np.zeros(shape=(self.grid_length, self.grid_length), dtype=int)

        # Automatically assign first element into grid
        self.grid[self.midpoint] = POLY_TO_INT[self.seq[0]]

        # Define action-observation spaces
        self.action_space = spaces.Discrete(5)
        self.observation_space = spaces.Box(low=-2, high=1,
                                            shape=(self.grid_length, self.grid_length),
                                            dtype=int)
        self.last_action = None
Example #12
                self.master_state.append(next_move)

                # Checking for loops
                start_pt = self.state.index(self.origin)
                new_loop = self.state[start_pt:]
                self.is_looped = (4 <= len(new_loop) <= self.p[self.op_counts]) and \
                    (self.state[len(self.state) - 1] in
                     self._get_adjacent_coords(self.origin).values())
            except IndexError:
                logger.error('All sites have been passed! Nowhere left to go!')
                out_of_xs = True

        grid = self._draw_grid(self.state)
        done = self.op_counts == len(self.p)
        if not done and action == 4:
            self.change_op()
        reward = self._compute_reward(is_trapped, is_collided, done,
                                      failed_jump, succ_jump, out_of_xs)
        info = {
            'chain_length': len(self.state),
Example #13
    def close(self):
        self.proc.stdin.close()
        ret = self.proc.wait()
        if ret != 0:
            logger.error("VideoRecorder encoder exited with status {}".format(ret))