Example #1
    def trainer(self, policy, fifos, shared_buffer, slopes, start_timestep,
                num_timesteps, logdir):
        proc_name = multiprocessing.current_process().name
        logger.info("Trainer %s started" % proc_name)

        # TensorFlow must be imported inside the process, otherwise it can conflict with multiprocessing
        from common.tensorboard_utils import create_summary_writer, add_summary
        writer = create_summary_writer(logdir)

        timestep = start_timestep
        total_episodes = 0
        total_timesteps = 0
        total_updates = 0
        total_rewards = []
        episode_rewards = []
        episode_lengths = []
        task_rewards = [[] for _ in range(len(slopes))]
        task_steps = [[] for _ in range(len(slopes))]
        task_scores = [[] for _ in range(len(slopes))]
        stats_start = time.time()
        stats_timesteps = 0
        stats_updates = 0
        while timestep < num_timesteps:
            batch_observations = []
            batch_preds = []
            batch_rewards = []
            batch_terminals = []
            batch_timesteps = 0
            mean_infos = defaultdict(list)
            queue_sizes = []

            # loop over fifos from all runners
            for q, fifo in enumerate(fifos):
                try:
                    # Queue.qsize() is not implemented on macOS; ignore, as it is only used for diagnostics
                    try:
                        queue_sizes.append(fifo.qsize())
                    except NotImplementedError:
                        pass

                    # wait for a new trajectory and statistics
                    observations, preds, rewards, terminals, episode_reward, episode_length, episode_tasks, episode_steps, mean_info = \
                        fifo.get(timeout=self.args.queue_timeout)

                    #print("TRAINER EPISODE REWARDS:", episode_reward)
                    #print("TRAINER EPISODE TASKS:", episode_tasks)

                    # add to batch
                    batch_observations.append(observations)
                    batch_preds.append(preds)
                    batch_rewards.append(rewards)
                    batch_terminals.append(terminals)

                    # log statistics
                    total_rewards += episode_reward
                    episode_rewards += episode_reward
                    episode_lengths += episode_length
                    batch_timesteps += len(observations)
                    for task_id, step, reward in zip(episode_tasks,
                                                     episode_steps,
                                                     episode_reward):
                        task_rewards[task_id].append(reward)
                        task_steps[task_id].append(step)
                        task_scores[task_id].append(reward)

                    for key, val in mean_info.items():
                        mean_infos[key].append(val)

                except Empty:
                    # just ignore empty fifos, batch will be smaller
                    pass

            # estimate learning curve slope for each task
            for task_id, (scores,
                          steps) in enumerate(zip(task_scores, task_steps)):
                if len(scores) > 1:
                    #print("BEFORE TASK %d scores:" % task_id, scores)
                    #print("BEFORE TASK %d steps:" % task_id, steps)
                    # use episodes from last curriculum_steps to estimate slope
                    idx = np.where(
                        np.array(steps) > (steps[-1] -
                                           self.args.curriculum_steps))[0]
                    #print("TASK %d idx:" % task_id, idx)
                    scores = np.array(scores)
                    steps = np.array(steps)
                    # if fewer than 2 episodes are in the window, add back the preceding episode
                    if len(idx) == 1:
                        # add one episode before the first
                        idx = np.concatenate([[idx[0] - 1], idx])
                        print("INSERTED ONE:", idx)
                    scores = scores[idx]
                    steps = steps[idx]
                    #print("AFTER TASK %d scores:" % task_id, scores)
                    #print("AFTER TASK %d steps:" % task_id, steps)
                    slope = estimate_slope(steps, scores)
                    if self.args.curriculum_abs:
                        slope = np.abs(slope)
                    print("TASK %d slope:" % task_id, slope)
                    slopes[task_id] = slope

            # if any of the runners produced trajectories
            if len(batch_observations) > 0:
                timestep += batch_timesteps

                # reorder dimensions for preds
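                # (the double zip presumably regroups predictions from per-runner, per-step tuples
                #  into one group per prediction type across all runners)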
                batch_preds = [list(zip(*p)) for p in batch_preds]
                batch_preds = list(zip(*batch_preds))

                # train model
                policy.train(batch_observations, batch_preds, batch_rewards,
                             batch_terminals, timestep, writer)

                # share model parameters
                shared_buffer.raw = pickle.dumps(policy.get_weights(),
                                                 pickle.HIGHEST_PROTOCOL)

                total_timesteps += batch_timesteps
                total_updates += self.args.repeat_updates
                stats_timesteps += batch_timesteps
                stats_updates += self.args.repeat_updates

                for key, val in mean_infos.items():
                    add_summary(writer, "diagnostics/" + key, np.mean(val),
                                timestep)

                if timestep % self.args.stats_interval == 0:
                    total_episodes += len(episode_rewards)
                    stats_time = time.time() - stats_start
                    add_summary(writer, "game_stats/episodes",
                                len(episode_rewards), timestep)
                    add_summary(writer, "game_stats/episode_reward_mean",
                                np.mean(episode_rewards), timestep)
                    #add_summary(writer, "game_stats/episode_reward_stddev", np.std(episode_rewards), timestep)
                    add_summary(writer, "game_stats/episode_length_mean",
                                np.mean(episode_lengths), timestep)
                    #add_summary(writer, "game_stats/episode_length_stddev", np.std(episode_lengths), timestep)

                    add_summary(writer, "game_stats/total_episodes",
                                total_episodes, timestep)
                    add_summary(writer, "game_stats/total_timesteps",
                                total_timesteps, timestep)
                    add_summary(writer, "game_stats/total_updates",
                                total_updates, timestep)

                    add_summary(writer, "performance/updates_per_second",
                                stats_updates / stats_time, timestep)
                    add_summary(writer, "performance/timesteps_per_second",
                                stats_timesteps / stats_time, timestep)
                    add_summary(
                        writer, "performance/estimated_runner_fps",
                        stats_timesteps / self.args.num_runners / stats_time,
                        timestep)
                    add_summary(writer, "performance/mean_queue_length",
                                np.mean(queue_sizes), timestep)

                    for i, rewards in enumerate(task_rewards):
                        add_summary(
                            writer,
                            "curriculum_rewards/task%d_reward_mean" % i,
                            np.mean(rewards), timestep)
                        add_summary(writer,
                                    "curriculum_episodes/task%d_episodes" % i,
                                    len(rewards), timestep)

                    for i, slope in enumerate(slopes):
                        add_summary(writer,
                                    "curriculum_slopes/task%d_slope" % i,
                                    slope, timestep)

                    logger.info(
                        "Step %d/%d: episodes %d, mean episode reward %.2f, mean episode length %.2f, timesteps/sec %.2f."
                        % (timestep, num_timesteps, len(episode_rewards),
                           np.mean(episode_rewards), np.mean(episode_lengths),
                           stats_timesteps / stats_time))
                    episode_rewards = []
                    episode_lengths = []
                    task_rewards = [[] for _ in range(len(slopes))]

                    stats_start = time.time()
                    stats_timesteps = 0
                    stats_updates = 0

                if timestep % self.args.save_interval == 0:
                    policy.save_weights(
                        os.path.join(logdir, "weights_%d.hdf5" % timestep))

            #else:
            #logger.warn("Empty batch, runners are falling behind!")

        # save final weights
        policy.save_weights(os.path.join(logdir, "weights_%d.hdf5" % timestep))

        if self.args.csv_file:
            # save command-line parameters and the most important performance metrics to a CSV file
            data = vars(self.args)
            data['episode_reward_mean'] = np.mean(total_rewards)
            data['total_episodes'] = total_episodes
            data['total_timesteps'] = total_timesteps
            data['total_updates'] = total_updates
            header = sorted(data.keys())

            # write the CSV file one directory above the experiment directory
            csv_file = os.path.join(os.path.dirname(logdir),
                                    self.args.csv_file)
            file_exists = os.path.isfile(csv_file)
            with open(csv_file, 'a') as file:
                csv_writer = csv.DictWriter(file, delimiter=',', fieldnames=header)
                if not file_exists:
                    csv_writer.writeheader()
                csv_writer.writerow(data)

        # collect child processes
        while len(multiprocessing.active_children()) > 0:
            for fifo in fifos:
                # drain the fifos in case runners are blocked waiting to put into them
                try:
                    fifo.get(timeout=1)
                except Empty:
                    pass

        logger.info("Trainer %s finished" % proc_name)
Example #2
    def trainer(self, policy, fifo, shared_buffer, start_timestep,
                num_timesteps, logdir):
        proc_name = multiprocessing.current_process().name
        logger.info("Trainer %s started" % proc_name)

        # TensorFlow must be imported inside the process, otherwise it can conflict with multiprocessing
        from common.tensorboard_utils import create_summary_writer, add_summary
        writer = create_summary_writer(logdir)

        timestep = start_timestep
        total_episodes = 0
        total_timesteps = 0
        total_updates = 0
        total_rewards = []
        episode_rewards = []
        episode_lengths = []
        stats_start = time.time()
        stats_timesteps = 0
        stats_updates = 0
        queue_sizes = []
        while timestep < num_timesteps:
            mean_infos = defaultdict(list)
            # Queue.qsize() is not implemented on macOS; ignore, as it is only used for diagnostics
            try:
                queue_sizes.append(fifo.qsize())
            except NotImplementedError:
                pass

            # wait for a new trajectory and statistics
            batch_observations, batch_preds, batch_rewards, batch_terminals, episode_reward, episode_length, mean_info = \
                fifo.get()

            # log statistics
            total_rewards += episode_reward
            episode_rewards += episode_reward
            episode_lengths += episode_length
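            # batch_observations is presumably shaped (num_envs, num_steps, ...), so the product
            # of the first two dimensions is the number of environment timesteps in this batch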
            batch_timesteps = np.prod(batch_observations.shape[:2])

            for key, val in mean_info.items():
                mean_infos[key].append(val)

            timestep += batch_timesteps

            # train model
            policy.train(batch_observations, batch_preds, batch_rewards,
                         batch_terminals, timestep, writer)

            # share model parameters
            shared_buffer.raw = pickle.dumps(policy.get_weights(),
                                             pickle.HIGHEST_PROTOCOL)

            total_timesteps += batch_timesteps
            total_updates += self.args.repeat_updates
            stats_timesteps += batch_timesteps
            stats_updates += self.args.repeat_updates

            for key, val in mean_infos.items():
                add_summary(writer, "diagnostics/" + key, np.mean(val),
                            timestep)

            if timestep % self.args.stats_interval == 0:
                total_episodes += len(episode_rewards)
                stats_time = time.time() - stats_start
                add_summary(writer, "game_stats/episodes",
                            len(episode_rewards), timestep)
                add_summary(writer, "game_stats/episode_reward_mean",
                            np.mean(episode_rewards), timestep)
                #add_summary(writer, "game_stats/episode_reward_stddev", np.std(episode_rewards), timestep)
                add_summary(writer, "game_stats/episode_length_mean",
                            np.mean(episode_lengths), timestep)
                #add_summary(writer, "game_stats/episode_length_stddev", np.std(episode_lengths), timestep)

                add_summary(writer, "game_stats/total_episodes",
                            total_episodes, timestep)
                add_summary(writer, "game_stats/total_timesteps",
                            int(total_timesteps), timestep)
                add_summary(writer, "game_stats/total_updates", total_updates,
                            timestep)

                add_summary(writer, "performance/updates_per_second",
                            stats_updates / stats_time, timestep)
                add_summary(writer, "performance/timesteps_per_second",
                            stats_timesteps / stats_time, timestep)
                add_summary(
                    writer, "performance/estimated_runner_fps",
                    stats_timesteps / self.args.num_runners / stats_time,
                    timestep)
                add_summary(writer, "performance/mean_queue_length",
                            np.mean(queue_sizes), timestep)

                logger.info(
                    "Step %d/%d: episodes %d, mean episode reward %.2f, mean episode length %.2f, timesteps/sec %.2f."
                    % (timestep, num_timesteps, len(episode_rewards),
                       np.mean(episode_rewards), np.mean(episode_lengths),
                       stats_timesteps / stats_time))
                episode_rewards = []
                episode_lengths = []
                stats_start = time.time()
                stats_timesteps = 0
                stats_updates = 0
                queue_sizes = []

            if timestep % self.args.save_interval == 0:
                policy.save_weights(
                    os.path.join(logdir, "weights_%d.hdf5" % timestep))

        # save final weights
        policy.save_weights(os.path.join(logdir, "weights_%d.hdf5" % timestep))

        if self.args.csv_file:
            # save command-line parameters and the most important performance metrics to a CSV file
            data = vars(self.args)
            data['episode_reward_mean'] = np.mean(total_rewards)
            data['total_episodes'] = total_episodes
            data['total_timesteps'] = total_timesteps
            data['total_updates'] = total_updates
            header = sorted(data.keys())

            # write the CSV file one directory above the experiment directory
            csv_file = os.path.join(os.path.dirname(logdir),
                                    self.args.csv_file)
            file_exists = os.path.isfile(csv_file)
            with open(csv_file, 'a') as file:
                csv_writer = csv.DictWriter(file, delimiter=',', fieldnames=header)
                if not file_exists:
                    csv_writer.writeheader()
                csv_writer.writerow(data)

        # collect child processes
        while len(multiprocessing.active_children()) > 0:
            # drain the fifo in case runners are blocked waiting to put into it
            try:
                fifo.get(timeout=1)
            except Empty:
                pass

        logger.info("Trainer %s finished" % proc_name)