Code example #1
    def forward_pass(device_type):
        env_name = 'atari_breakout'
        cfg = default_cfg(algo='appooc', env=env_name)
        cfg.actor_critic_share_weights = True
        cfg.hidden_size = 128
        cfg.use_rnn = True
        cfg.env_framestack = 4

        env = create_env(env_name, cfg=cfg)

        torch.set_num_threads(1)
        torch.backends.cudnn.benchmark = True

        actor_critic = create_actor_critic(cfg, env.observation_space, env.action_space)
        device = torch.device(device_type)
        actor_critic.to(device)

        timing = Timing()
        with timing.timeit('all'):
            batch = 128
            with timing.add_time('input'):
                # better avoid hardcoding here...
                observations = dict(obs=torch.rand([batch, 4, 84, 84]).to(device))
                rnn_states = torch.rand([batch, get_hidden_size(cfg)]).to(device)

            n = 200
            for i in range(n):
                with timing.add_time('forward'):
                    output = actor_critic(observations, rnn_states)

                log.debug('Progress %d/%d', i, n)

        log.debug('Timing: %s', timing)
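
A note on the helper used throughout these examples: `Timing` acts as a container of named timers, where `with timing.timeit('key')` measures a block once and exposes the elapsed seconds as `timing.key`, `with timing.add_time('key')` accumulates time across repeated blocks, and `str(timing)` renders a summary for logging. The sketch below is a hypothetical stand-in that only reproduces the behaviour implied by the snippets on this page; it is not the implementation that ships with sample-factory.

import time
from contextlib import contextmanager


class SimpleTiming:
    """Minimal, hypothetical stand-in for the Timing helper used in these examples."""

    def __init__(self):
        self._times = {}

    def __getattr__(self, key):
        # attribute-style access, e.g. `t.experience` after `with t.timeit('experience')`
        try:
            return self._times[key]
        except KeyError:
            raise AttributeError(key)

    def __contains__(self, key):
        return key in self._times

    @contextmanager
    def timeit(self, key):
        # measure a single block and store the elapsed seconds under `key`
        start = time.time()
        yield
        self._times[key] = time.time() - start

    @contextmanager
    def add_time(self, key):
        # accumulate elapsed seconds over repeated blocks under `key`
        start = time.time()
        yield
        self._times[key] = self._times.get(key, 0.0) + time.time() - start

    def __str__(self):
        return ', '.join(f'{key}: {value:.3f}' for key, value in self._times.items())
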
Code example #2
def test_multi_env_performance(test, env_type, num_envs, num_workers):
    t = Timing()
    with t.timeit('init'):
        multi_env = MultiEnv(num_envs,
                             num_workers,
                             test.make_env,
                             stats_episodes=100)
        total_num_frames, frames = 20000, 0

    with t.timeit('first_reset'):
        multi_env.reset()

    next_print = print_step = 10000
    with t.timeit('experience'):
        while frames < total_num_frames:
            _, _, done, info = multi_env.step([0] * num_envs)
            frames += num_env_steps(info)
            if frames > next_print:
                log.info('Collected %d frames of experience...', frames)
                next_print += print_step

    fps = total_num_frames / t.experience
    log.debug('%s performance:', env_type)
    log.debug('Took %.3f sec to collect %d frames in parallel, %.1f FPS',
              t.experience, total_num_frames, fps)
    log.debug('Timing: %s', t)

    multi_env.close()
Code example #3
def evaluate_locomotion():
    experiments = (
        # ('doom_textured_super_sparse', 'doom_textured_super_sparse-tmax_v035-64filt'),
        # ('doom_textured_super_sparse', 'doom_textured_super_sparse-tmax_v035-gamma-0998'),
        # ('doom_maze_no_goal', 'doom_maze_no_goal-tmax_v035_dist_expl'),
        # ('doom_maze_no_goal', 'doom_maze_no_goal-tmax_v035_no_spars'),

        ('doom_textured_super_sparse_v2', 'doom_textured_super_sparse_v2_trajectory'),
    )

    t = Timing()

    with t.timeit('evaluation'):
        results = {}
        for experiment in experiments:
            env_id, exp_name = experiment
            rate, speed = evaluate_experiment(env_id, exp_name)
            results[exp_name] = (rate, speed)

    log.info('Evaluation completed, took %s', t)
    rates, speeds = [], []
    for exp_name, r in results.items():
        rate, speed = r
        log.info('%s: success_rate: %.1f%%, avg_speed %.3f', exp_name, rate * 100, speed)
        rates.append(rate)
        speeds.append(speed)

    log.info('Average across experiments: success %.1f%%, speed: %.3f', np.mean(rates) * 100, np.mean(speeds))

    return 0
Code example #4
    def add_trajectory_to_dense_map(self, existing_map, traj):
        t = Timing()

        m = existing_map
        m.new_episode()  # just in case

        # index map from trajectory frame to graph node idx
        node_idx = [-1] * len(traj)
        # first observation is always the same (we start from the same initial state)
        node_idx[0] = 0

        with t.timeit('create_initial_map'):
            self._add_simple_path_to_map(m, traj, node_idx)

        # precalculate feature vectors for the distance network
        with t.timeit('cache_feature_vectors'):
            all_observations = [
                m.get_observation(node) for node in m.graph.nodes
            ]
            obs_embeddings = self._calc_embeddings(all_observations)

        # with t.add_time('pairwise_distances'):
        #     pairwise_distances = self._calc_pairwise_distances(obs_embeddings)

        # TODO: so far no shortcuts
        # with t.timeit('loop_closures'):
        #     self._add_shortcuts(m, pairwise_distances)

        log.debug('Add trajectory to map, timing: %s', t)
        return m
Code example #5
    def test_gumbel_trick(self):
        """
        We use a Gumbel noise which seems to be faster compared to using pytorch multinomial.
        Here we test that those are actually equivalent.
        """

        timing = Timing()

        torch.backends.cudnn.enabled = True
        torch.backends.cudnn.benchmark = True

        with torch.no_grad():
            action_space = gym.spaces.Discrete(8)
            num_logits = calc_num_logits(action_space)
            device_type = 'cpu'
            device = torch.device(device_type)
            logits = torch.rand(self.batch_size, num_logits,
                                device=device) * 10.0 - 5.0

            if device_type == 'cuda':
                torch.cuda.synchronize(device)

            count_gumbel = np.zeros([action_space.n])
            count_multinomial = np.zeros([action_space.n])

            # estimate probability mass by actually sampling both ways
            num_samples = 20000

            action_distribution = get_action_distribution(action_space, logits)
            sample_actions_log_probs(action_distribution)
            action_distribution.sample_gumbel()

            with timing.add_time('gumbel'):
                for i in range(num_samples):
                    action_distribution = get_action_distribution(
                        action_space, logits)
                    samples_gumbel = action_distribution.sample_gumbel()
                    count_gumbel[samples_gumbel[0]] += 1

            action_distribution = get_action_distribution(action_space, logits)
            action_distribution.sample()

            with timing.add_time('multinomial'):
                for i in range(num_samples):
                    action_distribution = get_action_distribution(
                        action_space, logits)
                    samples_multinomial = action_distribution.sample()
                    count_multinomial[samples_multinomial[0]] += 1

            estimated_probs_gumbel = count_gumbel / float(num_samples)
            estimated_probs_multinomial = count_multinomial / float(num_samples)

            log.debug('Gumbel estimated probs: %r', estimated_probs_gumbel)
            log.debug('Multinomial estimated probs: %r',
                      estimated_probs_multinomial)
            log.debug('Sampling timing: %s', timing)
            time.sleep(0.1)  # to finish logging
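
The test above relies on the Gumbel-max trick: adding independent Gumbel(0, 1) noise to the logits and taking the argmax yields samples distributed exactly as Categorical(softmax(logits)), so it can replace `torch.multinomial`. Below is a small, self-contained sketch of this equivalence; it is hypothetical illustration code, not taken from the project.

import torch


def sample_gumbel_max(logits: torch.Tensor) -> torch.Tensor:
    # Gumbel(0, 1) noise via inverse transform sampling: -log(-log(U)), U ~ Uniform(0, 1)
    u = torch.rand_like(logits)
    gumbel_noise = -torch.log(-torch.log(u + 1e-20) + 1e-20)
    # argmax over (logits + noise) is distributed as Categorical(softmax(logits))
    return torch.argmax(logits + gumbel_noise, dim=-1)


if __name__ == '__main__':
    torch.manual_seed(0)
    logits = torch.rand(1, 8) * 10.0 - 5.0
    probs = torch.softmax(logits, dim=-1)

    num_samples = 20000
    count_gumbel = torch.zeros(8)
    count_multinomial = torch.zeros(8)
    for _ in range(num_samples):
        count_gumbel[sample_gumbel_max(logits)[0]] += 1
        count_multinomial[torch.multinomial(probs, num_samples=1)[0, 0]] += 1

    # all three rows should be close to each other
    print(probs[0])
    print(count_gumbel / num_samples)
    print(count_multinomial / num_samples)
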
Code example #6
def record_trajectory(params, env_id):
    def make_env_func():
        e = create_env(env_id, skip_frames=True)
        e.seed(0)
        return e

    env = make_env_func()
    map_img, coord_limits = generate_env_map(make_env_func)

    env_obs, info = reset_with_info(env)
    obs = main_observation(env_obs)
    done = False

    m = TopologicalMap(obs,
                       directed_graph=False,
                       initial_info=info,
                       verbose=True)

    trajectory = Trajectory(env_idx=-1)
    frame = 0

    t = Timing()
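    # note: `terminate` and `current_actions` are not defined in this snippet;
    # presumably they are module-level variables updated elsewhere (e.g. by a keyboard handler)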

    while not done and not terminate:
        with t.timeit('one_frame'):
            env.render()

            if len(current_actions) > 0:
                action = current_actions[-1]
            else:
                action = 0

            trajectory.add(obs, action, info)
            m.add_landmark(obs, info, update_curr_landmark=True)

            env_obs, rew, done, info = env.step(action)
            obs = main_observation(env_obs)

        took_seconds = t.one_frame
        desired_fps = 15
        wait_seconds = (1.0 / desired_fps) - took_seconds
        wait_seconds = max(0.0, wait_seconds)
        time.sleep(wait_seconds)

        frame += 1

    env.render()
    time.sleep(0.2)

    trajectory_dir = trajectory.save(params.experiment_dir())
    m.save_checkpoint(trajectory_dir,
                      map_img=map_img,
                      coord_limits=coord_limits,
                      verbose=True)

    env.close()
    return 0
Code example #7
    def _learn_loop(self, multi_env):
        """Main training loop."""
        step, env_steps = self.session.run([self.actor_step, self.total_env_steps])

        env_obs = multi_env.reset()
        observations, goals = main_observation(env_obs), goal_observation(env_obs)
        buffer = PPOBuffer()

        def end_of_training(s, es):
            return s >= self.params.train_for_steps or es > self.params.train_for_env_steps

        while not end_of_training(step, env_steps):
            timing = Timing()
            num_steps = 0
            batch_start = time.time()

            buffer.reset()

            with timing.timeit('experience'):
                # collecting experience
                for rollout_step in range(self.params.rollout):
                    actions, action_probs, values = self.actor_critic.invoke(self.session, observations, goals=goals)

                    # wait for all the workers to complete an environment step
                    env_obs, rewards, dones, infos = multi_env.step(actions)
                    self.process_infos(infos)
                    new_observations, new_goals = main_observation(env_obs), goal_observation(env_obs)

                    # add experience from all environments to the current buffer
                    buffer.add(observations, actions, action_probs, rewards, dones, values, goals)
                    observations = new_observations
                    goals = new_goals

                    num_steps += num_env_steps(infos)

                # last step values are required for TD-return calculation
                _, _, values = self.actor_critic.invoke(self.session, observations, goals=goals)
                buffer.values.append(values)

            env_steps += num_steps

            # calculate discounted returns and GAE
            buffer.finalize_batch(self.params.gamma, self.params.gae_lambda)

            # update actor and critic
            with timing.timeit('train'):
                step = self._train(buffer, env_steps)

            avg_reward = multi_env.calc_avg_rewards(n=self.params.stats_episodes)
            avg_length = multi_env.calc_avg_episode_lengths(n=self.params.stats_episodes)
            fps = num_steps / (time.time() - batch_start)

            self._maybe_print(step, env_steps, avg_reward, avg_length, fps, timing)
            self._maybe_aux_summaries(env_steps, avg_reward, avg_length, fps)
            self._maybe_update_avg_reward(avg_reward, multi_env.stats_num_episodes())
            self._maybe_coverage_summaries(env_steps)
Code example #8
    def test_dist_training(self):
        t = Timing()

        def make_env():
            return make_doom_env(doom_env_by_name(TEST_ENV_NAME))

        params = AgentTMAX.Params('__test_dist_train__')
        params.distance_target_buffer_size = 1000

        with t.timeit('generate_data'):
            # first: generate fake random data
            buffer = Buffer()

            obs1 = np.full([84, 84, 3], 0, dtype=np.uint8)
            obs1[:, :, 1] = 255
            obs2 = np.full([84, 84, 3], 0, dtype=np.uint8)
            obs2[:, :, 2] = 255

            data_size = params.distance_target_buffer_size
            for i in range(data_size):
                same = i % 2 == 0
                if same:
                    if random.random() < 0.5:
                        obs_first = obs_second = obs1
                    else:
                        obs_first = obs_second = obs2
                else:
                    obs_first, obs_second = obs1, obs2
                    if random.random() < 0.5:
                        obs_second, obs_first = obs_first, obs_second

                buffer.add(obs_first=obs_first,
                           obs_second=obs_second,
                           labels=0 if same else 1)

        with t.timeit('init'):
            agent = AgentTMAX(make_env, params)
            agent.initialize()

            params.distance_train_epochs = 1
            params.distance_batch_size = 256
            agent.distance.train(buffer, 1, agent)

        with t.timeit('train'):
            params.distance_train_epochs = 2
            params.distance_batch_size = 64
            agent.distance.train(buffer, 1, agent, t)

        agent.finalize()

        log.info('Timing: %s', t)
        shutil.rmtree(params.experiment_dir())
Code example #9
    def episodic_memory_summary(self, env_steps, summary_writer, **kwargs):
        t = Timing()

        with t.timeit('ecr_memory'):
            time_since_last = time.time() - self._last_map_summary
            map_summary_rate_seconds = 120
            if time_since_last <= map_summary_rate_seconds:
                return
            if self.episodic_memories is None:
                return

            env_to_plot = 0
            for env_i, memory in enumerate(self.episodic_memories):
                if len(memory) > len(self.episodic_memories[env_to_plot]):
                    env_to_plot = env_i

            log.info('Visualizing episodic memory for env %d', env_to_plot)
            memory_to_plot = self.episodic_memories[env_to_plot]

            if len(memory_to_plot) <= 0:
                return

            landmark_indices = sorted(memory_to_plot.landmarks.keys())

            m = TopologicalMap(
                memory_to_plot.landmarks[landmark_indices[0]].embedding,
                directed_graph=False,
                initial_info=memory_to_plot.landmarks[landmark_indices[0]].info,
            )

            for lm_idx in landmark_indices[1:]:
                info = memory_to_plot.landmarks[lm_idx].info
                # noinspection PyProtectedMember
                m._add_new_node(
                    obs=memory_to_plot.landmarks[lm_idx].embedding,
                    pos=get_position(info),
                    angle=get_angle(info),
                )

            map_img = kwargs.get('map_img')
            coord_limits = kwargs.get('coord_limits')
            map_summaries([m],
                          env_steps,
                          summary_writer,
                          'ecr',
                          map_img,
                          coord_limits,
                          is_sparse=True)

            self._last_map_summary = time.time()
        log.info('Took %s', t)
Code example #10
    def test_buffer_performance(self):
        small_buffer = Buffer()
        small_buffer.add_many(obs=np.zeros([1000, 84, 84, 3], dtype=np.uint8))

        buffer = Buffer()

        t = Timing()

        with t.timeit('add'):
            for i in range(100):
                buffer.add_buff(small_buffer)

        huge_buffer = Buffer()
        with t.timeit('add_huge'):
            huge_buffer.add_buff(buffer)
            huge_buffer.add_buff(buffer)

        with t.timeit('single_add_small'):
            huge_buffer.add_buff(small_buffer)

        with t.timeit('clear_and_add'):
            huge_buffer.clear()
            huge_buffer.add_buff(buffer)
            huge_buffer.add_buff(buffer)

        with t.timeit('shuffle_and_add'):
            huge_buffer.clear()
            huge_buffer.add_buff(buffer)
            huge_buffer.add_buff(small_buffer)
            with t.timeit('shuffle'):
                huge_buffer.shuffle_data()

        log.debug('Timing: %s', t)
Code example #11
    def calc_test_error(self, buffer, env_steps, agent, timing=None):
        log.info('Calculating distance net test error...')

        if timing is None:
            timing = Timing()

        params = agent.params
        batch_size = params.distance_batch_size
        dist_step = self.step.eval(session=agent.session)

        with timing.timeit('dist_test_error'):
            losses = []
            obs_first, obs_second, labels = buffer.obs_first, buffer.obs_second, buffer.labels

            for i in range(0, len(obs_first) - 1, batch_size):
                start, end = i, i + batch_size

                loss = agent.session.run(
                    self.loss,
                    feed_dict={
                        self.ph_obs_first: obs_first[start:end],
                        self.ph_obs_second: obs_second[start:end],
                        self.ph_labels: labels[start:end],
                        self.ph_is_training: False,
                    },
                )

                losses.append(loss)

            avg_loss = np.mean(losses)
            log.info('Avg loss at %d steps is %.3f', dist_step, avg_loss)

            summary_obj_env_steps = tf.Summary()
            summary_obj_env_steps.value.add(tag='distance/test_loss_env_steps',
                                            simple_value=avg_loss)
            agent.summary_writer.add_summary(summary_obj_env_steps, env_steps)

            summary_obj_training_steps = tf.Summary()
            summary_obj_training_steps.value.add(
                tag='distance/test_loss_train_steps', simple_value=avg_loss)
            agent.summary_writer.add_summary(summary_obj_training_steps,
                                             dist_step)

            agent.summary_writer.flush()

        log.debug('Took %s', timing)
Code example #12
    def test_quad_env(self):
        self.assertIsNotNone(create_env('quadrotor_single'))

        env = create_env('quadrotor_single')
        obs = env.reset()

        n_frames = 10000

        timing = Timing()
        with timing.timeit('step'):
            for i in range(n_frames):
                obs, r, d, info = env.step(env.action_space.sample())
                if d:
                    env.reset()

        log.debug('Time %s, FPS %.1f', timing, n_frames / timing.step)
Code example #13
    def test_quad_multi_env(self):
        env_name = 'quadrotor_multi'
        cfg = default_cfg(env=env_name)
        self.assertIsNotNone(create_env(env_name, cfg=cfg))

        env = create_env(env_name, cfg=cfg)
        env.reset()

        n_frames = 1000

        timing = Timing()
        with timing.timeit('step'):
            for i in range(n_frames):
                obs, r, d, info = env.step(
                    [env.action_space.sample() for _ in range(env.num_agents)])

        log.debug('Time %s, FPS %.1f', timing, n_frames / timing.step)
Code example #14
def test_env_performance(make_env, env_type, verbose=False):
    t = Timing()
    with t.timeit('init'):
        env = make_env(AttrDict({'worker_index': 0, 'vector_index': 0}))
        total_num_frames, frames = 10000, 0

    with t.timeit('first_reset'):
        env.reset()

    t.reset = t.step = 1e-9
    num_resets = 0
    with t.timeit('experience'):
        while frames < total_num_frames:
            done = False

            start_reset = time.time()
            env.reset()

            t.reset += time.time() - start_reset
            num_resets += 1

            while not done and frames < total_num_frames:
                start_step = time.time()
                if verbose:
                    env.render()
                    time.sleep(1.0 / 40)

                obs, rew, done, info = env.step(env.action_space.sample())
                if verbose:
                    log.info('Received reward %.3f', rew)

                t.step += time.time() - start_step
                frames += num_env_steps([info])

    fps = total_num_frames / t.experience
    log.debug('%s performance:', env_type)
    log.debug('Took %.3f sec to collect %d frames on one CPU, %.1f FPS',
              t.experience,
              total_num_frames,
              fps)
    log.debug('Avg. reset time %.3f s', t.reset / num_resets)
    log.debug('Timing: %s', t)
    env.close()
Code example #15
File: learner.py  Project: erikwijmans/sample-factory
    def _train_loop(self):
        timing = Timing()
        self.initialize(timing)

        wait_times = deque([], maxlen=self.cfg.num_workers)
        last_cache_cleanup = time.time()
        num_batches_processed = 0

        while not self.terminate:
            with timing.timeit('train_wait'):
                data = safe_get(self.experience_buffer_queue)

            if self.terminate:
                break

            wait_stats = None
            wait_times.append(timing.train_wait)

            if len(wait_times) >= wait_times.maxlen:
                wait_times_arr = np.asarray(wait_times)
                wait_avg = np.mean(wait_times_arr)
                wait_min, wait_max = wait_times_arr.min(), wait_times_arr.max()
                # log.debug(
                #     'Training thread had to wait %.5f s for the new experience buffer (avg %.5f)',
                #     timing.train_wait, wait_avg,
                # )
                wait_stats = (wait_avg, wait_min, wait_max)

            self._process_training_data(data, timing, wait_stats)
            num_batches_processed += 1

            if time.time() - last_cache_cleanup > 300.0 or (
                    not self.cfg.benchmark and num_batches_processed < 50):
                if self.cfg.device == 'gpu':
                    torch.cuda.empty_cache()
                    torch.cuda.ipc_collect()
                last_cache_cleanup = time.time()

        time.sleep(0.3)
        log.info('Train loop timing: %s', timing)
        del self.actor_critic
        del self.device
Code example #16
def run_multi_quadrotor_env(env_name, cfg):
    env = create_env(env_name, cfg=cfg)
    env.reset()
    for i in range(100):
        obs, r, d, info = env.step(
            [env.action_space.sample() for _ in range(env.num_agents)])

    n_frames = 1000
    env = create_env(env_name, cfg=cfg)
    env.reset()

    timing = Timing()
    with timing.timeit('step'):
        for i in range(n_frames):
            obs, r, d, info = env.step(
                [env.action_space.sample() for _ in range(env.num_agents)])

    log.debug('Time %s, FPS %.1f', timing,
              n_frames * env.num_agents / timing.step)
    env.close()
Code example #17
File: model.py  Project: neevparikh/sample-factory
def create_dqn(cfg, obs_space, action_space, timing=None):
    if timing is None:
        timing = Timing()

    def make_encoder():
        return create_encoder(cfg, obs_space, timing)

    def make_core(encoder):
        return create_core(cfg, encoder.get_encoder_out_size())

    main = _SimpleDQN(make_encoder, make_core, action_space, cfg, timing)
    target = _SimpleDQN(make_encoder, make_core, action_space, cfg, timing)
    return _DQN(main, target)
Code example #18
def create_actor_critic(cfg, obs_space, action_space, timing=None):
    if timing is None:
        timing = Timing()

    def make_encoder():
        return create_encoder(cfg, obs_space, timing)

    def make_core(encoder):
        return create_core(cfg, encoder.get_encoder_out_size())

    if cfg.actor_critic_share_weights:
        return _ActorCriticSharedWeights(make_encoder, make_core, action_space, cfg, timing)
    else:
        return _ActorCriticSeparateWeights(make_encoder, make_core, action_space, cfg, timing)
Code example #19
 def __init__(self, args):
     """
     This initialises our main class; it expects
     the arguments to be passed in.
     """
     # Initialise the logger used throughout the whole script
     self.logger = init_logging()
     # String to print in case a command finishes successfully
     self.success_symbol = u"\u2713"
     # String to print in case a command fails
     self.failure_symbol = u"\u2717"
     # The passed arguments
     self.import_file = args.import_file if "import_file" in args else ""
     self.dry_run = args.dry_run if "dry_run" in args else False
     self.compare = args.compare if "compare" in args else False
     self.list_add = args.list_add if "list_add" in args else None
     # The mode to operate in
     self.mode = MODE_BATCH if "import_file" in args else MODE_INTERACTIVE
     # The Marionette instance, wrapped by our own helper class
     self.marionette = MarionetteHelper(self.logger, self.success_symbol,
                                        self.failure_symbol)
     # A set of existing bookmarks to check later if a bookmark was already saved
     self.bookmarks = set()
     # Initialise Google Maps API
     try:
         path = os.path.dirname(os.path.realpath(__file__))
         key_file = "gm-api-key.json"
         with open("{}/{}".format(path, key_file), "r") as f:
             data = json.load(f)
             self.gm = googlemaps.Client(key=data["key"])
     except IOError:
         self.logger.error(
             u" > [ERROR] Unable to open '{}', Google Maps API disabled {}".
             format(key_file, self.failure_symbol))
         self.gm = None
     # Initialise timing
     self.timing = Timing(self.logger)
Code example #20
def test_env_performance(test, env_type):
    t = Timing()
    with t.timeit('init'):
        env = test.make_env()
        total_num_frames, frames = 4000, 0
        agent = AgentRandom(test.make_env, {})

    with t.timeit('first_reset'):
        env.reset()

    t.reset = t.step = 1e-9
    num_resets = 0
    with t.timeit('experience'):
        while frames < total_num_frames:
            done = False

            start_reset = time.time()
            env.reset()

            t.reset += time.time() - start_reset
            num_resets += 1

            while not done and frames < total_num_frames:
                start_step = time.time()
                obs, rew, done, info = env.step(agent.best_action())
                t.step += time.time() - start_step
                frames += num_env_steps([info])

    fps = total_num_frames / t.experience
    log.debug('%s performance:', env_type)
    log.debug('Took %.3f sec to collect %d frames on one CPU, %.1f FPS',
              t.experience, total_num_frames, fps)
    log.debug('Avg. reset time %.3f s', t.reset / num_resets)
    log.debug('Timing: %s', t)

    env.close()
Code example #21
    def _run(self):
        """
        Main loop of the actor worker (rollout worker).
        Process tasks (mainly ROLLOUT_STEP) until we get the termination signal, which usually means end of training.
        Currently there is no mechanism to restart dead workers if something bad happens during training. We can only
        retry on the initial reset(). This is definitely something to work on.
        """
        log.info('Initializing vector env runner %d...', self.worker_idx)

        # workers should ignore Ctrl+C because the termination is handled in the event loop by a special msg
        signal.signal(signal.SIGINT, signal.SIG_IGN)

        torch.multiprocessing.set_sharing_strategy('file_system')

        timing = Timing()

        last_report = time.time()
        with torch.no_grad():
            while not self.terminate:
                try:
                    try:
                        with timing.add_time('waiting'), timing.timeit('wait_actor'):
                            tasks = self.task_queue.get_many(timeout=0.1)
                    except Empty:
                        tasks = []

                    for task in tasks:
                        task_type, data = task

                        if task_type == TaskType.INIT:
                            self._init()
                            continue

                        if task_type == TaskType.TERMINATE:
                            self._terminate()
                            break

                        # handling actual workload
                        if task_type == TaskType.ROLLOUT_STEP:
                            if 'work' not in timing:
                                timing.waiting = 0  # measure waiting only after real work has started

                            with timing.add_time('work'), timing.timeit('one_step'):
                                self._advance_rollouts(data, timing)
                        elif task_type == TaskType.RESET:
                            with timing.add_time('reset'):
                                self._handle_reset()
                        elif task_type == TaskType.PBT:
                            self._process_pbt_task(data)

                    if time.time() - last_report > 5.0 and 'one_step' in timing:
                        timing_stats = dict(wait_actor=timing.wait_actor, step_actor=timing.one_step)
                        memory_mb = memory_consumption_mb()
                        stats = dict(memory_actor=memory_mb)
                        self.report_queue.put(dict(timing=timing_stats, stats=stats))
                        last_report = time.time()

                except RuntimeError as exc:
                    log.warning('Error while processing data w: %d, exception: %s', self.worker_idx, exc)
                    log.warning('Terminate process...')
                    self.terminate = True
                    self.report_queue.put(dict(critical_error=self.worker_idx))
                except KeyboardInterrupt:
                    self.terminate = True
                except:
                    log.exception('Unknown exception in rollout worker')
                    self.terminate = True

        if self.worker_idx <= 1:
            time.sleep(0.1)
            log.info(
                'Env runner %d, CPU aff. %r, rollouts %d: timing %s',
                self.worker_idx, psutil.Process().cpu_affinity(), self.num_complete_rollouts, timing,
            )
Code example #22
File: appo.py  Project: gitter-badger/sample-factory
    def run(self):
        """
        This function contains the main loop of the algorithm, as well as initialization/cleanup code.

        :return: ExperimentStatus (SUCCESS, FAILURE, INTERRUPTED). Useful in testing.
        """

        status = ExperimentStatus.SUCCESS

        if os.path.isfile(done_filename(self.cfg)):
            log.warning(
                'Training already finished! Remove "done" file to continue training'
            )
            return status

        self.init_workers()
        self.init_pbt()
        self.finish_initialization()

        log.info('Collecting experience...')

        timing = Timing()
        with timing.timeit('experience'):
            # noinspection PyBroadException
            try:
                while not self._should_end_training():
                    try:
                        reports = self.report_queue.get_many(timeout=0.1)
                        for report in reports:
                            self.process_report(report)
                    except Empty:
                        pass

                    if time.time() - self.last_report > self.report_interval:
                        self.report()

                        now = time.time()
                        self.total_train_seconds += now - self.last_report
                        self.last_report = now

                    self.pbt.update(self.env_steps, self.policy_avg_stats)

            except Exception:
                log.exception('Exception in driver loop')
                status = ExperimentStatus.FAILURE
            except KeyboardInterrupt:
                log.warning(
                    'Keyboard interrupt detected in driver loop, exiting...')
                status = ExperimentStatus.INTERRUPTED

        for learner in self.learner_workers.values():
            # timeout is needed here because some environments may crash on KeyboardInterrupt (e.g. VizDoom).
            # Therefore the learner train loop will never do another iteration and will never save the model.
            # This is not an issue with normal exit, e.g. due to desired number of frames reached.
            learner.save_model(timeout=5.0)

        all_workers = self.actor_workers
        for workers in self.policy_workers.values():
            all_workers.extend(workers)
        all_workers.extend(self.learner_workers.values())

        child_processes = list_child_processes()

        time.sleep(0.1)
        log.debug('Closing workers...')
        for i, w in enumerate(all_workers):
            w.close()
            time.sleep(0.01)
        for i, w in enumerate(all_workers):
            w.join()
        log.debug('Workers joined!')

        # VizDoom processes often refuse to die for an unidentified reason, so we're force killing them with a hack
        kill_processes(child_processes)

        fps = self.total_env_steps_since_resume / timing.experience
        log.info('Collected %r, FPS: %.1f', self.env_steps, fps)
        log.info('Timing: %s', timing)

        if self._should_end_training():
            with open(done_filename(self.cfg), 'w') as fobj:
                fobj.write(f'{self.env_steps}')

        time.sleep(0.5)
        log.info('Done!')

        return status
Code example #23
def evaluate_locomotion_agent(agent, multi_env):
    num_envs = multi_env.num_envs

    observations = main_observation(multi_env.reset())
    obs_prev = observations
    infos = multi_env.info()

    agent.tmax_mgr.initialize(observations, infos, 1)
    m = agent.tmax_mgr.dense_persistent_maps[-1]

    navigator = Navigator(agent)
    for env_i in range(num_envs):
        navigator.reset(env_i, m)

    # sample final goals
    all_targets = list(m.graph.nodes)
    if len(all_targets) > 0:
        all_targets.remove(0)

    final_goal_idx = random.sample(all_targets, num_envs)
    log.info('Goals: %r', final_goal_idx)

    # noinspection PyProtectedMember
    navigator._ensure_paths_to_goal_calculated([m] * num_envs, final_goal_idx)
    path_lengths = [0] * num_envs
    for env_i in range(num_envs):
        location, path_length = 0, 0
        while location != final_goal_idx[env_i]:
            location = navigator.paths[env_i][location]
            path_length += 1
        path_lengths[env_i] = path_length

    frames = 0
    next_target, next_target_d = navigator.get_next_target(
        [m] * num_envs, observations, final_goal_idx, [frames] * num_envs,
    )
    next_target_obs = [m.get_observation(t) for t in next_target]

    avg_speed = [-1] * num_envs
    success = [False] * num_envs

    t = Timing()
    while True:
        with t.timeit('frame'):
            with t.timeit('policy'):
                actions = policy_step(agent, obs_prev, observations, next_target_obs, final_goal_idx)

            with t.timeit('step'):
                env_obs, rew, done, info = multi_env.step(actions)

            obs_prev = observations
            observations = main_observation(env_obs)

            with t.timeit('navigator'):
                next_target, next_target_d = navigator.get_next_target(
                    [m] * num_envs, observations, final_goal_idx, [frames] * num_envs,
                )

            for env_i in range(num_envs):
                if final_goal_idx[env_i] is None:
                    continue

                if next_target[env_i] is None:
                    log.warning(
                        'Agent %d got lost in %d steps trying to reach %d', env_i, frames, final_goal_idx[env_i],
                    )
                    final_goal_idx[env_i] = None
                else:
                    if next_target[env_i] == final_goal_idx[env_i] and next_target_d[env_i] < 0.1:
                        success[env_i] = True
                        avg_speed[env_i] = path_lengths[env_i] / (frames + 1)
                        log.debug(
                            'Agent %d reached goal %d in %d steps, avg. speed %.3f',
                            env_i, final_goal_idx[env_i], frames, avg_speed[env_i],
                        )
                        final_goal_idx[env_i] = None

                    next_target_obs[env_i] = m.get_observation(next_target[env_i])

            frames += 1
            if frames > 5000:
                log.error('Timeout! 5000 frames was not enough to finish locomotion!')
                break

        finished = [g is None for g in final_goal_idx]
        if all(finished):
            log.info('Done!')
            break
        else:
            if frames % 10 == 0:
                frame_repeat = 4
                fps = (1.0 / t.frame) * frame_repeat * num_envs
                log.info('%d agents remaining, fps %.3f, time %s', num_envs - sum(finished), fps, t)

    return success, avg_speed
Code example #24
    def sample(self, proc_idx):
        # workers should ignore Ctrl+C because the termination is handled in the event loop by a special msg
        signal.signal(signal.SIGINT, signal.SIG_IGN)

        timing = Timing()

        from threadpoolctl import threadpool_limits
        with threadpool_limits(limits=1, user_api=None):
            if self.cfg.set_workers_cpu_affinity:
                set_process_cpu_affinity(proc_idx, self.cfg.num_workers)

            initial_cpu_affinity = psutil.Process().cpu_affinity() if platform != 'darwin' else None
            psutil.Process().nice(10)

            with timing.timeit('env_init'):
                envs = []
                env_key = ['env' for _ in range(self.cfg.num_envs_per_worker)]

                for env_idx in range(self.cfg.num_envs_per_worker):
                    global_env_id = proc_idx * self.cfg.num_envs_per_worker + env_idx
                    env_config = AttrDict(worker_index=proc_idx,
                                          vector_index=env_idx,
                                          env_id=global_env_id)
                    env = create_env(self.cfg.env,
                                     cfg=self.cfg,
                                     env_config=env_config)
                    log.debug(
                        'CPU affinity after create_env: %r',
                        psutil.Process().cpu_affinity()
                        if platform != 'darwin' else 'MacOS - None')
                    env.seed(global_env_id)
                    envs.append(env)

                    # this is to track the performance for individual DMLab levels
                    if hasattr(env.unwrapped, 'level_name'):
                        env_key[env_idx] = env.unwrapped.level_name

                episode_length = [0 for _ in envs]
                episode_lengths = [deque([], maxlen=20) for _ in envs]

            try:
                with timing.timeit('first_reset'):
                    for env_idx, env in enumerate(envs):
                        env.reset()
                        log.info('Process %d finished resetting %d/%d envs',
                                 proc_idx, env_idx + 1, len(envs))

                    self.report_queue.put(
                        dict(proc_idx=proc_idx, finished_reset=True))

                self.start_event.wait()

                with timing.timeit('work'):
                    last_report = last_report_frames = total_env_frames = 0
                    while not self.terminate.value and total_env_frames < self.cfg.sample_env_frames_per_worker:
                        for env_idx, env in enumerate(envs):
                            action = env.action_space.sample()
                            with timing.add_time(f'{env_key[env_idx]}.step'):
                                obs, reward, done, info = env.step(action)

                            num_frames = info.get('num_frames', 1)
                            total_env_frames += num_frames
                            episode_length[env_idx] += num_frames

                            if done:
                                with timing.add_time(
                                        f'{env_key[env_idx]}.reset'):
                                    env.reset()

                                episode_lengths[env_idx].append(
                                    episode_length[env_idx])
                                episode_length[env_idx] = 0

                        with timing.add_time('report'):
                            now = time.time()
                            if now - last_report > self.report_every_sec:
                                last_report = now
                                frames_since_last_report = total_env_frames - last_report_frames
                                last_report_frames = total_env_frames
                                self.report_queue.put(
                                    dict(proc_idx=proc_idx,
                                         env_frames=frames_since_last_report))

                # Extra check to make sure cpu affinity is preserved throughout the execution.
                # I observed a weird effect where some environments tried to alter the affinity of the current
                # process, leading to decreased performance.
                # This can be caused by some interactions between deep learning libs, OpenCV, MKL, OpenMP, etc.
                # At least the user should know about it if this is happening.
                cpu_affinity = psutil.Process().cpu_affinity() if platform != 'darwin' else None
                assert initial_cpu_affinity == cpu_affinity, \
                    f'Worker CPU affinity was changed from {initial_cpu_affinity} to {cpu_affinity}! ' \
                    f'This can significantly affect performance!'

            except:
                log.exception('Unknown exception')
                log.error('Unknown exception in worker %d, terminating...',
                          proc_idx)
                self.report_queue.put(dict(proc_idx=proc_idx, crash=True))

            time.sleep(proc_idx * 0.01 + 0.01)
            log.info('Process %d finished sampling. Timing: %s', proc_idx,
                     timing)

            for env_idx, env in enumerate(envs):
                if len(episode_lengths[env_idx]) > 0:
                    log.warning('Level %s avg episode len %d',
                                env_key[env_idx],
                                np.mean(episode_lengths[env_idx]))

            for env in envs:
                env.close()
Code example #25
File: learner.py  Project: erikwijmans/sample-factory
    def _run(self):
        # workers should ignore Ctrl+C because the termination is handled in the event loop by a special msg
        signal.signal(signal.SIGINT, signal.SIG_IGN)

        try:
            psutil.Process().nice(self.cfg.default_niceness)
        except psutil.AccessDenied:
            log.error('Low niceness requires sudo!')

        if self.cfg.device == 'gpu':
            cuda_envvars(self.policy_id)

        torch.multiprocessing.set_sharing_strategy('file_system')
        torch.set_num_threads(self.cfg.learner_main_loop_num_cores)

        timing = Timing()

        rollouts = []

        if self.train_in_background:
            self.training_thread.start()
        else:
            self.initialize(timing)
            log.error(
                'train_in_background set to False on learner %d! This is slow, use only for testing!',
                self.policy_id,
            )

        while not self.terminate:
            while True:
                try:
                    tasks = self.task_queue.get_many(timeout=0.005)

                    for task_type, data in tasks:
                        if task_type == TaskType.TRAIN:
                            with timing.add_time('extract'):
                                rollouts.extend(self._extract_rollouts(data))
                                # log.debug('Learner %d has %d rollouts', self.policy_id, len(rollouts))
                        elif task_type == TaskType.INIT:
                            self._init()
                        elif task_type == TaskType.TERMINATE:
                            time.sleep(0.3)
                            log.info('GPU learner timing: %s', timing)
                            self._terminate()
                            break
                        elif task_type == TaskType.PBT:
                            self._process_pbt_task(data)
                except Empty:
                    break

            if self._accumulated_too_much_experience(rollouts):
                # if we accumulated too much experience, signal the policy workers to stop experience collection
                if not self.stop_experience_collection[self.policy_id]:
                    log.debug(
                        'Learner %d accumulated too much experience, stop experience collection!',
                        self.policy_id)
                self.stop_experience_collection[self.policy_id] = True
            elif self.stop_experience_collection[self.policy_id]:
                # otherwise, resume the experience collection if it was stopped
                self.stop_experience_collection[self.policy_id] = False
                with self.resume_experience_collection_cv:
                    log.debug('Learner %d is resuming experience collection!',
                              self.policy_id)
                    self.resume_experience_collection_cv.notify_all()

            with torch.no_grad():
                rollouts = self._process_rollouts(rollouts, timing)

            if not self.train_in_background:
                while not self.experience_buffer_queue.empty():
                    training_data = self.experience_buffer_queue.get()
                    self._process_training_data(training_data, timing)

            self._experience_collection_rate_stats()

        if self.train_in_background:
            self.experience_buffer_queue.put(None)
            self.training_thread.join()
Code example #26
    def _run(self):
        # workers should ignore Ctrl+C because the termination is handled in the event loop by a special msg
        signal.signal(signal.SIGINT, signal.SIG_IGN)

        psutil.Process().nice(min(self.cfg.default_niceness + 2, 20))

        cuda_envvars(self.policy_id)
        torch.multiprocessing.set_sharing_strategy('file_system')

        timing = Timing()

        with timing.timeit('init'):
            # initialize the Torch modules
            log.info('Initializing model on the policy worker %d-%d...',
                     self.policy_id, self.worker_idx)

            torch.set_num_threads(1)

            if self.cfg.device == 'gpu':
                # we should already see only one CUDA device, because of env vars
                assert torch.cuda.device_count() == 1
                self.device = torch.device('cuda', index=0)
            else:
                self.device = torch.device('cpu')

            self.actor_critic = create_actor_critic(self.cfg, self.obs_space,
                                                    self.action_space, timing)
            self.actor_critic.model_to_device(self.device)
            for p in self.actor_critic.parameters():
                p.requires_grad = False  # we don't train anything here

            log.info('Initialized model on the policy worker %d-%d!',
                     self.policy_id, self.worker_idx)

        last_report = last_cache_cleanup = time.time()
        last_report_samples = 0
        request_count = deque(maxlen=50)

        # very conservative limit on the minimum number of requests to wait for
        # this will almost guarantee that the system will continue collecting experience
        # at max rate even when 2/3 of workers are stuck for some reason (e.g. doing a long env reset)
        # Although if your workflow involves very lengthy operations that often freeze workers, it can be beneficial
        # to set min_num_requests to 1 (at a cost of potential inefficiency, i.e. policy worker will use very small
        # batches)
        min_num_requests = self.cfg.num_workers // (
            self.cfg.num_policies * self.cfg.policy_workers_per_policy)
        min_num_requests //= 3
        min_num_requests = max(1, min_num_requests)

        # Again, very conservative timer. Only wait a little bit, then continue operation.
        wait_for_min_requests = 0.025

        while not self.terminate:
            try:
                while self.stop_experience_collection[self.policy_id]:
                    with self.resume_experience_collection_cv:
                        self.resume_experience_collection_cv.wait(timeout=0.05)

                waiting_started = time.time()
                while (len(self.requests) < min_num_requests
                       and time.time() - waiting_started < wait_for_min_requests):
                    try:
                        with timing.timeit('wait_policy'), timing.add_time(
                                'wait_policy_total'):
                            policy_requests = self.policy_queue.get_many(
                                timeout=0.005)
                        self.requests.extend(policy_requests)
                    except Empty:
                        pass

                self._update_weights(timing)

                with timing.timeit('one_step'), timing.add_time(
                        'handle_policy_step'):
                    if self.initialized:
                        if len(self.requests) > 0:
                            request_count.append(len(self.requests))
                            self._handle_policy_steps(timing)

                try:
                    task_type, data = self.task_queue.get_nowait()

                    # task from the task_queue
                    if task_type == TaskType.INIT:
                        self._init()
                    elif task_type == TaskType.TERMINATE:
                        self.terminate = True
                        break
                    elif task_type == TaskType.INIT_MODEL:
                        self._init_model(data)

                    self.task_queue.task_done()
                except Empty:
                    pass

                if time.time() - last_report > 3.0 and 'one_step' in timing:
                    timing_stats = dict(wait_policy=timing.wait_policy,
                                        step_policy=timing.one_step)
                    samples_since_last_report = self.total_num_samples - last_report_samples

                    stats = memory_stats('policy_worker', self.device)
                    if len(request_count) > 0:
                        stats['avg_request_count'] = np.mean(request_count)

                    self.report_queue.put(
                        dict(
                            timing=timing_stats,
                            samples=samples_since_last_report,
                            policy_id=self.policy_id,
                            stats=stats,
                        ))
                    last_report = time.time()
                    last_report_samples = self.total_num_samples

                if time.time() - last_cache_cleanup > 300.0 or (
                        not self.cfg.benchmark
                        and self.total_num_samples < 1000):
                    if self.cfg.device == 'gpu':
                        torch.cuda.empty_cache()
                    last_cache_cleanup = time.time()

            except KeyboardInterrupt:
                log.warning('Keyboard interrupt detected on worker %d-%d',
                            self.policy_id, self.worker_idx)
                self.terminate = True
            except:
                log.exception('Unknown exception on policy worker')
                self.terminate = True

        time.sleep(0.2)
        log.info('Policy worker avg. requests %.2f, timing: %s',
                 np.mean(request_count), timing)
Code example #27
    def extract_data(self, trajectories):
        timing = Timing()

        if len(self.buffer) > self.params.distance_target_buffer_size:
            # already enough data
            return

        close, far = self.params.close_threshold, self.params.far_threshold

        num_close, num_far = 0, 0
        data_added = 0

        with timing.timeit('trajectories'):
            for trajectory in trajectories:
                check_tmax = isinstance(trajectory, TmaxTrajectory)

                obs = trajectory.obs

                indices = list(range(len(trajectory)))
                np.random.shuffle(indices)

                for i in indices:
                    if len(self.buffer) > self.params.distance_target_buffer_size // 2:
                        if data_added > self.params.distance_target_buffer_size // 4:  # to limit memory usage
                            break

                    if len(self.buffer) > self.params.distance_target_buffer_size:
                        break

                    close_i = min(i + close, len(trajectory))
                    far_i = min(i + far, len(trajectory))

                    # sample close observation pair
                    first_idx = i
                    second_idx = np.random.randint(i, close_i)

                    # in TMAX we do some additional checks
                    add_close = True
                    if check_tmax:
                        both_frames_random = trajectory.is_random[first_idx] and trajectory.is_random[second_idx]
                        first_exploration = trajectory.mode[first_idx] == TmaxMode.EXPLORATION
                        second_exploration = trajectory.mode[second_idx] == TmaxMode.EXPLORATION
                        if both_frames_random or (first_exploration and second_exploration):
                            add_close = True
                        else:
                            add_close = False

                    if add_close:
                        if self.params.distance_symmetric and random.random() < 0.5:
                            first_idx, second_idx = second_idx, first_idx

                        self.buffer.add(obs_first=obs[first_idx],
                                        obs_second=obs[second_idx],
                                        labels=0)
                        data_added += 1
                        num_close += 1

                    # sample far observation pair
                    if far_i < len(trajectory):
                        first_idx = i
                        second_idx = np.random.randint(far_i, len(trajectory))

                        add_far = True
                        if check_tmax:
                            both_frames_random = trajectory.is_random[first_idx] and trajectory.is_random[second_idx]
                            first_exploration = trajectory.mode[first_idx] == TmaxMode.EXPLORATION
                            second_exploration = trajectory.mode[second_idx] == TmaxMode.EXPLORATION
                            if both_frames_random or (first_exploration and second_exploration):
                                add_far = True
                            else:
                                add_far = False

                        if add_far:
                            if self.params.distance_symmetric and random.random() < 0.5:
                                first_idx, second_idx = second_idx, first_idx

                            self.buffer.add(obs_first=obs[first_idx],
                                            obs_second=obs[second_idx],
                                            labels=1)
                            data_added += 1
                            num_far += 1

        with timing.timeit('finalize'):
            self.buffer.trim_at(self.params.distance_target_buffer_size)

        if self.batch_num % 20 == 0:
            with timing.timeit('visualize'):
                self._visualize_data()

        self.batch_num += 1
        log.info('num close %d, num far %d, distance net timing %s', num_close,
                 num_far, timing)
Code example #28
    def extract_data(self, trajectories):
        timing = Timing()

        if len(trajectories) <= 0:
            return

        if len(self.buffer) > self.params.locomotion_experience_replay_buffer:
            return

        with timing.timeit('trajectories'):
            max_trajectory = self.params.locomotion_max_trajectory

            data_so_far = 0

            trajectories = [t for t in trajectories if len(t) > max_trajectory]

            # train only on random frames
            random_frames = [[i for i, is_random in enumerate(t.is_random) if is_random] for t in trajectories]

            total_experience = sum(len(frames) for frames in random_frames)
            max_total_experience = 0.75 * total_experience  # max fraction of experience to use
            max_num_segments = int(max_total_experience / max_trajectory)

            log.info(
                'Using up to %d frames of experience from %d trajectories (~%d segments)',
                max_total_experience, len(trajectories), max_num_segments,
            )

            attempts = 0

            while data_so_far < max_total_experience:
                attempts += 1
                if attempts > 100 * max_total_experience:  # just in case
                    break

                trajectory_idx = random.randrange(len(trajectories))
                trajectory = trajectories[trajectory_idx]
                if len(random_frames[trajectory_idx]) <= max_trajectory:
                    continue

                first_random_frame = random_frames[trajectory_idx][0]
                if len(trajectory) - first_random_frame < max_trajectory:
                    continue

                # sample random interval in trajectory, treat the last frame as "imaginary" goal, use actions as
                # ground truth
                start_idx = random.randint(first_random_frame, len(trajectory) - 2)
                goal_idx = min(start_idx + max_trajectory, len(trajectory) - 1)
                assert start_idx < goal_idx

                if not trajectory.is_random[start_idx]:
                    continue
                if not trajectory.is_random[goal_idx]:
                    continue

                for i in range(start_idx, goal_idx):
                    if not trajectory.is_random[i]:
                        continue

                    assert 0 < goal_idx - i <= max_trajectory
                    self.buffer.add(
                        obs_prev=trajectory.obs[max(0, i - 1)],
                        obs_curr=trajectory.obs[i],
                        obs_goal=trajectory.obs[goal_idx],
                        actions=trajectory.actions[i],
                        mode=trajectory.mode[i],
                        diff=goal_idx - i,
                    )
                    data_so_far += 1

                if len(self.buffer) > self.params.locomotion_experience_replay_buffer:
                    break

        # if self.batch_num % 10 == 0:
        #     with timing.timeit('vis'):
        #         self._visualize_data(training_data)

        # with timing.timeit('finalize'):
        #     for traj_buffer in training_data:
        #         self.buffer.add_buff(traj_buffer)

        # self.shuffle_data()
        # self.buffer.trim_at(self.params.locomotion_experience_replay_buffer)

        self.batch_num += 1
        log.info('Locomotion, buffer size: %d, timing: %s', len(self.buffer), timing)
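
The extractor above builds goal-conditioned training data for the locomotion network: segments of at most locomotion_max_trajectory frames are cut out of the random-exploration parts of trajectories, the last frame of a segment is treated as an "imaginary" goal, and the recorded actions are used as ground truth. A rough sketch of that idea, assuming plain Python lists for observations and actions (the function name and segment length are illustrative):

import random


def extract_locomotion_tuples(observations, actions, max_segment=30):
    """Sketch: turn one trajectory into (obs_prev, obs_curr, obs_goal, action, dist) tuples."""
    tuples = []
    if len(observations) < 2:
        return tuples

    # pick a random segment and treat its last frame as the goal
    start = random.randint(0, len(observations) - 2)
    goal = min(start + max_segment, len(observations) - 1)

    for i in range(start, goal):
        tuples.append((
            observations[max(0, i - 1)],  # previous observation
            observations[i],              # current observation
            observations[goal],           # goal observation
            actions[i],                   # action actually taken (ground truth)
            goal - i,                     # remaining number of steps to the goal
        ))
    return tuples


# usage with dummy data
obs = list(range(50))
acts = [0] * 50
print(len(extract_locomotion_tuples(obs, acts)))
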
Code Example #29
0
    def train(self, buffer, env_steps, agent, timing=None):
        if timing is None:
            timing = Timing()

        params = agent.params

        batch_size = params.distance_batch_size
        summary = None
        dist_step = self.step.eval(session=agent.session)

        prev_loss = 1e10
        num_epochs = params.distance_train_epochs

        log.info('Train distance net %d pairs, batch %d, epochs %d', len(buffer), batch_size, num_epochs)

        with timing.timeit('dist_epochs'):
            for epoch in range(num_epochs):
                losses = []

                with timing.add_time('shuffle'):
                    buffer.shuffle_data()

                obs_first, obs_second, labels = buffer.obs_first, buffer.obs_second, buffer.labels

                with timing.add_time('batch'):
                    for i in range(0, len(obs_first) - 1, batch_size):
                        # noinspection PyProtectedMember
                        with_summaries = agent._should_write_summaries(dist_step) and summary is None
                        summaries = [self.summaries] if with_summaries else []

                        start, end = i, i + batch_size

                        result = agent.session.run(
                            [self.loss, self.train_op] + summaries,
                            feed_dict={
                                self.ph_obs_first: obs_first[start:end],
                                self.ph_obs_second: obs_second[start:end],
                                self.ph_labels: labels[start:end],
                                self.ph_is_training: True,
                            },
                        )

                        dist_step += 1
                        # noinspection PyProtectedMember
                        agent._maybe_save(dist_step, env_steps)
                        losses.append(result[0])

                        if with_summaries:
                            summary = result[-1]
                            agent.summary_writer.add_summary(summary, global_step=env_steps)

                    # check loss improvement at the end of each epoch, early stop if necessary
                    avg_loss = np.mean(losses)
                    if avg_loss >= prev_loss:
                        log.info('Early stopping after %d epochs because distance net did not improve', epoch + 1)
                        log.info('Was %.4f now %.4f, ratio %.3f', prev_loss, avg_loss, avg_loss / prev_loss)
                        break
                    prev_loss = avg_loss

        return dist_step
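
The training loop above runs up to distance_train_epochs epochs over the pair buffer, but stops early as soon as the average epoch loss stops improving. A small self-contained sketch of just that early-stopping rule; the run_epoch callable and the fake losses are assumptions for illustration.

import numpy as np


def train_with_early_stopping(run_epoch, num_epochs=10):
    """Sketch: stop training when the average epoch loss stops improving."""
    prev_loss = float('inf')
    for epoch in range(num_epochs):
        losses = run_epoch()  # assumed to return the list of batch losses for one epoch
        avg_loss = float(np.mean(losses))
        if avg_loss >= prev_loss:
            print(f'Early stopping after {epoch + 1} epochs (was {prev_loss:.4f}, now {avg_loss:.4f})')
            break
        prev_loss = avg_loss


# usage with fake per-epoch losses that stop improving on the 4th epoch
fake_losses = iter([[1.0], [0.5], [0.4], [0.45], [0.3]])
train_with_early_stopping(lambda: next(fake_losses))
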
Code Example #30
0
    def sample(self, proc_idx):
        # workers should ignore Ctrl+C because the termination is handled in the event loop by a special msg
        signal.signal(signal.SIGINT, signal.SIG_IGN)

        timing = Timing()

        psutil.Process().nice(10)

        num_envs = len(DMLAB30_LEVELS_THAT_USE_LEVEL_CACHE)
        assert self.cfg.num_workers % num_envs == 0, f'should have an integer number of workers per env, e.g. {1 * num_envs}, {2 * num_envs}, etc...'
        assert self.cfg.num_envs_per_worker == 1, 'use populate_cache with 1 env per worker'

        with timing.timeit('env_init'):
            env_key = 'env'
            env_desired_num_levels = 0
            env_num_levels_generated = 0  # initialized here so it is always defined in the sampling loop below

            global_env_id = proc_idx * self.cfg.num_envs_per_worker
            env_config = AttrDict(worker_index=proc_idx, vector_index=0, env_id=global_env_id)
            env = create_env(self.cfg.env, cfg=self.cfg, env_config=env_config)
            env.seed(global_env_id)

            # this is to track the performance for individual DMLab levels
            if hasattr(env.unwrapped, 'level_name'):
                env_key = env.unwrapped.level_name
                env_level = env.unwrapped.level

                approx_num_episodes_per_1b_frames = DMLAB30_APPROX_NUM_EPISODES_PER_BILLION_FRAMES[env_key]
                num_billions = DESIRED_TRAINING_LENGTH / int(1e9)
                num_workers_for_env = self.cfg.num_workers // num_envs
                env_desired_num_levels = int((approx_num_episodes_per_1b_frames * num_billions) / num_workers_for_env)

                env_num_levels_generated = len(
                    dmlab_level_cache.DMLAB_GLOBAL_LEVEL_CACHE[0].all_seeds[env_level]
                ) // num_workers_for_env

                log.warning('Worker %d (env %s) generated %d/%d levels!',
                            proc_idx, env_key, env_num_levels_generated, env_desired_num_levels)
                time.sleep(4)

            env.reset()
            env_uses_level_cache = env.unwrapped.env_uses_level_cache

            self.report_queue.put(dict(proc_idx=proc_idx, finished_reset=True))

        self.start_event.wait()

        try:
            with timing.timeit('work'):
                last_report = last_report_frames = total_env_frames = 0
                while not self.terminate.value and total_env_frames < self.cfg.sample_env_frames_per_worker:
                    action = env.action_space.sample()
                    with timing.add_time(f'{env_key}.step'):
                        env.step(action)

                    total_env_frames += 1

                    with timing.add_time(f'{env_key}.reset'):
                        env.reset()
                        env_num_levels_generated += 1
                        log.debug('Env %s done %d/%d resets', env_key, env_num_levels_generated, env_desired_num_levels)

                    if env_num_levels_generated >= env_desired_num_levels:
                        log.debug('%s finished %d/%d resets, sleeping...', env_key, env_num_levels_generated, env_desired_num_levels)
                        time.sleep(30)  # free up CPU time for other envs

                    # if env does not use level cache, there is no need to run it
                    # let other workers proceed
                    if not env_uses_level_cache:
                        log.debug('Env %s does not require cache, sleeping...', env_key)
                        time.sleep(200)

                    with timing.add_time('report'):
                        now = time.time()
                        if now - last_report > self.report_every_sec:
                            last_report = now
                            frames_since_last_report = total_env_frames - last_report_frames
                            last_report_frames = total_env_frames
                            self.report_queue.put(
                                dict(proc_idx=proc_idx, env_frames=frames_since_last_report))

                            free_disk_mb = get_free_disk_space_mb(self.cfg)
                            if free_disk_mb < 3 * 1024:
                                log.error('Not enough disk space! %d MB left', free_disk_mb)
                                time.sleep(200)
        except Exception:
            log.exception('Unknown exception')
            log.error('Unknown exception in worker %d, terminating...', proc_idx)
            self.report_queue.put(dict(proc_idx=proc_idx, crash=True))

        time.sleep(proc_idx * 0.1 + 0.1)
        log.info('Process %d finished sampling. Timing: %s', proc_idx, timing)

        env.close()
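
The worker above exists only to pre-populate the DMLab level cache: it keeps resetting its environment until it has generated its share of unique levels, then mostly sleeps. The number of levels one worker should generate follows from the approximate number of episodes per billion frames of training; below is a hedged sketch of that arithmetic with made-up sample numbers (none of the constants come from the code above).

def desired_levels_per_worker(approx_episodes_per_1b_frames, desired_training_frames, num_workers, num_cached_envs):
    """Sketch: how many unique levels one worker should pre-generate for a full training run."""
    num_billions = desired_training_frames / 1e9
    num_workers_for_env = num_workers // num_cached_envs
    return int(approx_episodes_per_1b_frames * num_billions / num_workers_for_env)


# e.g. ~10k episodes per 1B frames, a 10B-frame run, 90 workers shared across 30 cached envs
print(desired_levels_per_worker(10_000, 10e9, 90, 30))  # -> 33333
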