Code example #1
                    type=str)
parser.add_argument('--title',
                    help='Plot title',
                    default='Learning Curve',
                    type=str)
parser.add_argument('--smooth',
                    action='store_true',
                    default=False,
                    help='Smooth Learning Curve')
args = parser.parse_args()

results = []
algos = []

for folder in args.log_dirs:
    timesteps = load_results(folder)
    results.append(timesteps)
    if folder.endswith('/'):
        folder = folder[:-1]
    algos.append(folder.split('/')[-1])

min_timesteps = np.inf

# 'walltime_hrs', 'episodes'
for plot_type in ['timesteps']:
    xy_list = []
    for result in results:
        x, y = ts2xy(result, plot_type)
        if args.smooth:
            x, y = smooth((x, y), window=50)
        n_timesteps = x[-1]
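
The fragments above (and most of the examples below) assume that training was logged through a Monitor wrapper so that load_results / ts2xy can read per-episode statistics back from disk. A minimal, self-contained sketch of that setup, assuming stable-baselines 2.x; the CartPole-v1 env, the ./logs/ directory and the PPO2 algorithm are placeholder choices:

import os
import gym
from stable_baselines import PPO2
from stable_baselines.bench import Monitor
from stable_baselines.results_plotter import load_results, ts2xy

log_dir = "./logs/"
os.makedirs(log_dir, exist_ok=True)

# Monitor writes a monitor.csv (episode reward, length, time) into log_dir
env = Monitor(gym.make("CartPole-v1"), log_dir)

model = PPO2("MlpPolicy", env, verbose=0)
model.learn(total_timesteps=10000)

# ts2xy turns the episode log into (cumulative timesteps, episode rewards)
x, y = ts2xy(load_results(log_dir), 'timesteps')
print(x[-1], "timesteps,", len(y), "episodes")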
Code example #2
def callback(_locals, _globals):
    """
    Callback called at each step (for DQN and others) or after n steps (see ACER or PPO2)
    :param _locals: (dict)
    :param _globals: (dict)
    """

    global n_steps, best_mean_reward, best_eval_mean_reward, prev_dt_eval
    # Print stats every 1000 calls

    total_reward = 0
    mean_reward = 0
    if (n_steps + 1) % 1000 == 0:
        for i in range(100):
            dones = False
            timesteps = 0
            obs = test_env.reset()
            while not dones:
                action, _states = model.predict(obs)
                if model.use_action_repeat:
                    for _ in range(1):
                        obs, rewards, dones, info = test_env.step(action)
                        total_reward += rewards
                        timesteps += 1
                        if timesteps == max_eval_timesteps:
                            dones = True
                        if dones:
                            break
                else:
                    timesteps += 1
                    obs, rewards, dones, info = test_env.step(action)
                    total_reward += rewards
                    if timesteps == max_eval_timesteps:
                        dones = True

                if dones:
                    break
        mean_reward = total_reward / 100.0
        dt_reward = mean_reward - prev_dt_eval
        dt_agent.update(dt_reward / 100.0)
        print("Mean reward last dt: {}".format(dt_reward))

        prev_dt_eval = mean_reward
        new_dt = int(dt_agent.play()) + 1
        model.action_repetition = new_dt
        print("Action repetition is :{}".format(model.action_repetition))

        print("Steps: {} 100 Episode eval: {} Best eval {} ".format(
            n_steps, mean_reward, best_eval_mean_reward))
        f.write("Steps: {} 100 Episode eval: {} Best eval {}\n".format(
            n_steps, mean_reward, best_eval_mean_reward))
        if mean_reward > best_eval_mean_reward:
            best_eval_mean_reward = mean_reward
            # Example for saving best model
            print("Saving new best model")
            _locals['self'].save(log_dir + 'best_model_eval.pkl')

        log_data['dt'].append(model.action_repetition)
        log_data['eval'].append(mean_reward)
        log_data['timesteps'].append(model.num_timesteps)

        # Evaluate policy training performance

    if (n_steps + 1) % 1000 == 0:
        x, y = ts2xy(load_results(log_dir), 'timesteps')
        if len(x) > 0:
            mean_reward = np.mean(y[-100:])
            print(x[-1], 'timesteps')
            print(
                "Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}"
                .format(best_mean_reward, mean_reward))

            # New best model, you could save the agent here
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                # Example for saving best model
                print("Saving new best model")
                _locals['self'].save(log_dir + 'best_model.pkl')
            log_data['train'].append(mean_reward)

    n_steps += 1
    # Returning False will stop training early
    return True
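
This style of callback receives (_locals, _globals) and returns a bool, and in stable-baselines 2.x it is passed directly to learn(). A hedged wiring sketch; DQN, CartPole-v1 and the step counts are placeholders, and the evaluation body is elided:

import os
import gym
from stable_baselines import DQN
from stable_baselines.bench import Monitor

log_dir = "./logs/"
os.makedirs(log_dir, exist_ok=True)
env = Monitor(gym.make("CartPole-v1"), log_dir)

n_steps = 0
best_mean_reward = -float("inf")

def eval_callback(_locals, _globals):
    global n_steps, best_mean_reward
    # ... periodic evaluation / checkpointing as in the example above ...
    n_steps += 1
    return True  # returning False stops training early

model = DQN("MlpPolicy", env, verbose=0)
model.learn(total_timesteps=50000, callback=eval_callback)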
Code example #3
if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('-f',
                        '--folder',
                        help='Log folder',
                        type=str,
                        default='trained_agents')
    args = parser.parse_args()

    log_dir = args.folder

    timesteps = 1e10
    window_size = 50

    W = load_results(log_dir)

    print("results: ", W)

    # save walltime to stats.csv
    df = pd.read_csv(log_dir + 'stats.csv')
    df["Train walltime (s)"] = W["t"].max()
    df.to_csv(log_dir + "stats.csv", index=False)
    print(df)

    # plot all training rewards

    results_plotter.plot_results([log_dir], timesteps,
                                 results_plotter.X_TIMESTEPS, "")
    plt.savefig(log_dir + "reward_vs_timesteps.png")
    # plt.show()
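
For reference, load_results returns a pandas DataFrame with one row per episode, including reward 'r', length 'l' and wall-clock time 't' since training start, which is where the W["t"].max() walltime above comes from. A small hedged sketch; the log directory is a placeholder:

from stable_baselines.results_plotter import load_results

W = load_results("./logs/")
print(W[['r', 'l', 't']].describe())        # per-episode reward, length, elapsed seconds
print("train walltime (s):", W['t'].max())  # the value written to stats.csv above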
Code example #4
shifted_reward = np.nan * np.ones((int(2e6 / 51), 8))
shifted_reward[int(1e6 / 51):int(1e6 / 51) + ssac_size] = rewards
fig, ax = smooth_bounded_curve(
    shifted_reward,
    time_steps=[51 * i for i in range(shifted_reward.shape[0])])

color_iter = iter(['b', 'g', 'y', 'm', 'c'])
log_dir = script_path + '../rl-baselines-zoo/baseline_log2/'
for algo in os.scandir(log_dir):
    try:
        df_list = []
        min_length = float('inf')

        for entry in os.scandir(algo.path):
            df = load_results(entry.path)

            if len(df['r']) < min_length:
                min_length = len(df['r'])

            df_list.append(df)

        min_length = int(min_length)
        rewards = np.zeros((min_length, len(df_list)))

        for i, df in enumerate(df_list):
            rewards[:, i] = np.array(df['r'][:min_length])

        print(algo.path, rewards[-1, :].mean(), rewards[-1, :].std())
        smooth_bounded_curve(rewards[:min_length],
                             time_steps=[51 * i for i in range(min_length)],
Code example #5
        '-f', args.log_dir,
        '--algo', algo,
        '--env', env_id,
        '--no-render',
        '--seed', str(args.seed),
        '--verbose', '0',
        '--reward-log', reward_log
    ]
    if args.verbose >= 1:
        print('{}/{}'.format(idx + 1, n_experiments))
        print("Evaluating {} on {}...".format(algo, env_id))

    skip_eval = False
    if os.path.isdir(reward_log):
        try:
            x, y = ts2xy(load_results(reward_log), 'timesteps')
            skip_eval = len(x) > 0
        except Exception:
            pass

    if skip_eval:
        print("Skipping eval...")
    else:
        return_code = subprocess.call(['python', 'enjoy.py'] + arguments)
        x, y = ts2xy(load_results(reward_log), 'timesteps')

    if len(x) > 0:
        mean_reward = np.mean(y)
        std_reward = np.std(y)
        results['algo'].append(algo)
        results['env_id'].append(env_id)
Code example #6
#           'NovelGridworld-v2_8beams0filled40range3items_in_360degrees']
agents = [
    'NovelGridworld-Bow-v0_8beams0filled11hypotenuserange3items_in_360degrees'
]

# log_dir = r"C:\Users\GyanT\Documents\GitHub\Reinforcement-Learning\5_DQN\experiments\\"
# agents = ['NovelGridworld-v0_1_DQN', 'NovelGridworld-v0_2_Dueling_DQN', 'NovelGridworld-v0_3_Dueling_Double_DQN',
#           'NovelGridworld-v0_4_Double_PER_DQN', 'NovelGridworld-v0_5_Dueling_Double_PER_DQN']
# agents = ['NovelGridworld-v0_3.1_Double_PER_DQN']

plot_after_steps = 1  # 1 for all points

for agent in agents:
    print("agent: ", agent)

    x, y = ts2xy(load_results(log_dir + os.sep + agent), 'timesteps')

    print("# of Episodes: ", len(y))

    # plt.plot(x, y, label=agent)
    plt.plot(x[0::plot_after_steps],
             y[0::plot_after_steps],
             label=agent + ' (' + str(len(y)) + ' eps)')

plt.title('Learning Curve')
plt.ylabel("Episodes Rewards")
plt.xlabel("Timesteps")
plt.legend()
plt.savefig(log_dir + os.sep + "learning_curve.png",
            bbox_inches='tight',
            dpi=100)
Code example #7
model = A2C("MlpPolicy", env, verbose=1).learn(int(timesteps))

# Plot results
results_plotter.plot_results([log_dir], timesteps, results_plotter.X_TIMESTEPS,
                             "My title")
plt.show()

results_plotter.plot_results([log_dir], timesteps, results_plotter.X_EPISODES,
                             "My title")
plt.show()

results_plotter.plot_results([log_dir], timesteps, results_plotter.X_WALLTIME,
                             "My title")
plt.show()

print(load_results(log_dir))

#### hand-made plotting ####


def moving_average(values, window):
    """
    Smooth values by doing a moving average
    :param values: (numpy array)
    :param window: (int)
    :return: (numpy array)
    """
    weights = np.repeat(1.0, window) / window
    return np.convolve(values, weights, 'valid')
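
A hedged usage sketch for the helper above: read the Monitor log back, smooth the episode rewards, and plot them. It assumes log_dir points at the Monitor output directory and that load_results, ts2xy and matplotlib.pyplot (as plt) are imported as elsewhere in this example:

x, y = ts2xy(load_results(log_dir), 'timesteps')
y_smooth = moving_average(y, window=50)
# the 'valid' convolution shortens the series, so trim x to match
x_smooth = x[len(x) - len(y_smooth):]
plt.plot(x_smooth, y_smooth)
plt.xlabel('Timesteps')
plt.ylabel('Smoothed episode reward')
plt.title('Learning Curve (moving average, window=50)')
plt.show()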

Code example #8
# plot_data(pd_data, X_EPISODES, 'block_pos_dist', window=500, max_idx=20000, label="PD")
# plot_data(id_data, X_EPISODES, 'block_pos_dist', window=500, max_idx=20000 ,label="ID")
# plot_data(impedance_data, X_EPISODES, 'block_pos_dist', window=500, max_idx=20000, label="impedance")
# # plt.legend(fontsize=24)
# plt.xlabel('Episodes', fontsize=24)
# plt.ylabel('Goal Distance (m)', fontsize=24)
# plt.title('Pushing (PPO)', fontsize=30)
# plt.tight_layout()

# Load the SAC data.
# Torque
directory = os.path.join(os.environ['HOME'], 'Google Drive', 'DriveSync',
                         'DirectTorqueController_PushingEnv_SAC_0')
directory = os.path.abspath(directory)
experiment_dir = get_latest_experiment_dir(directory)
torque_data = load_results(experiment_dir)
print(experiment_dir)

# PD
directory = os.path.join(os.environ['HOME'], 'Google Drive', 'DriveSync',
                         'RelativePDController_PushingEnv_SAC_0')
directory = os.path.abspath(directory)
experiment_dir = get_latest_experiment_dir(directory)
print(experiment_dir)
pd_data = load_results(experiment_dir)

# ID
directory = os.path.join(os.environ['HOME'], 'Google Drive', 'DriveSync',
                         'RelativeInverseDynamicsController_PushingEnv_SAC_0')
directory = os.path.abspath(directory)
experiment_dir = get_latest_experiment_dir(directory)
Code example #9
def callback(_locals, _globals, data_dir, freq=None, low_level_data_dir=None, checkpoint_freq=None):
    """
    Callback called at each step (for DQN and others) or after n steps (see ACER or PPO2)
    :param _locals: (dict)
    :param _globals: (dict)
    """
    if not freq:
        freq = 100000
    global n_steps, best_mean_reward
    # Print stats every freq calls
    if (n_steps + 1) % freq == 0:
        if low_level_data_dir:
            x, y = ts2xy(load_results(data_dir), 'timesteps')
            if len(x) > 0:
                mean_reward = np.mean(y[-200:])
                print(x[-1], 'timesteps')
                print("Best 200 mean reward: {:.2f} - Last 2000 mean reward per episode: {:.2f}".format(best_mean_reward, mean_reward))

                # New best model, you could save the agent here
                if mean_reward > best_mean_reward:
                    best_mean_reward = mean_reward
                    # Example for saving best model
                    print("Saving new best model.")
                    _locals['self'].save(low_level_data_dir + '/best_model', data_dir + '/best_model')
        else:
            params = ModelParams.load(data_dir)
            env = get_env(params)
            ep_rewards = list()
            for _ in range(4):
                rewards = list()
                obs = env.reset()
                while True:
                    ac = _locals['self'].predict(obs)
                    obs, reward, done, _ = env.step(ac[0])
                    rewards.append(reward)
                    if done:
                        break
                ep_rewards.append(sum(rewards))
            
            mean_reward = sum(ep_rewards) / float(len(ep_rewards))
            print("Best mean eval reward: {:.2f} - Last mean reward over {} eval episodes: {:.2f}".format(best_mean_reward, len(ep_rewards), mean_reward))
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                # Example for saving best model
                print("Saving new best model.")
                _locals['self'].save(data_dir + '/best_model')
            del env

        '''
        # Evaluate policy training performance
        x, y = ts2xy(load_results(data_dir), 'timesteps')
        if len(x) > 0:
            mean_reward = np.mean(y[-200:])
            print(x[-1], 'timesteps')
            print("Best 200 mean reward: {:.2f} - Last 2000 mean reward per episode: {:.2f}".format(best_mean_reward, mean_reward))

            # New best model, you could save the agent here
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                # Example for saving best model
                print("Saving new best model")
                if low_level_data_dir:
                    _locals['self'].save(low_level_data_dir + '/best_model', data_dir + '/best_model')
                else:
                    _locals['self'].save(data_dir + '/best_model')
        '''
    if checkpoint_freq is not None and (n_steps + 1) % checkpoint_freq == 0:
        print("Saving Model Checkpoint")
        name = "/checkpoint_" + str(n_steps + 1)
        if low_level_data_dir:
            _locals['self'].save(low_level_data_dir + name, data_dir + name)
        else:
            _locals['self'].save(data_dir + name)

    n_steps += 1
    return True
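
Unlike the two-argument callbacks above, this callback takes extra parameters (data_dir, freq, low_level_data_dir, checkpoint_freq). One hedged way to bind them before handing the callback to learn() is functools.partial; the paths and frequencies below are placeholders, and model is assumed to be an already-constructed stable-baselines model:

from functools import partial

bound_callback = partial(callback,
                         data_dir='./logs/',
                         freq=100000,
                         checkpoint_freq=500000)
model.learn(total_timesteps=int(1e6), callback=bound_callback)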
Code example #10
    def eval_save_plot(self):
        eval_start_time = time.time()

        returns, lengths, scores, idle_steps, ghosts, levels, n_wins, eval_eps = evaluate_policy(self.model,
                                                                                                 self.eval_env,
                                                                                                 10,  # not used
                                                                                                 self.deterministic)

        eval_elapsed_time = get_formated_time(time.time() - eval_start_time)

        returns_columns = get_results_columns(returns)
        lengths_columns = get_results_columns(lengths)
        scores_columns = get_results_columns(scores)
        self.evals_idle_mean.append(np.mean(idle_steps))
        self.evals_idle_eps.append(np.sum(np.asarray(idle_steps) > 0))
        self.evals_ghosts_mean.append(np.mean(ghosts))
        self.evals_levels_mean.append(np.mean(levels))
        self.evals_n_wins.append(n_wins)
        self.evals_eps.append(eval_eps)

        mean_score = scores_columns[0]

        if mean_score > self.best_mean_score:
            self.model.save(os.path.join(self.log_dir, 'best_model'))
            self.best_mean_score = mean_score
            self.best_train_step = self.num_timesteps

        self.train_steps.append(self.num_timesteps)
        self.returns_columns.append(returns_columns)
        self.lengths_columns.append(lengths_columns)
        self.scores_columns.append(scores_columns)
        self.evals_elapsed_time.append(eval_elapsed_time)

        mean_returns, std_returns, max_returns, min_returns = map(list, zip(*self.returns_columns))
        mean_lengths, std_lengths, max_lengths, min_lengths = map(list, zip(*self.lengths_columns))
        mean_scores, std_scores, max_scores, min_scores = map(list, zip(*self.scores_columns))

        train_results = load_results(self.log_dir)

        train_ghosts = train_results['ghosts'].tolist()

        self.train_ghosts.append(np.mean(train_ghosts[self.last_n_episodes:]))

        train_levels = train_results['level'].tolist()

        self.train_levels.append(np.mean(train_levels[self.last_n_episodes:]))

        self.last_n_episodes = len(train_ghosts)

        rows = zip(self.train_steps, mean_scores, std_scores, max_scores, min_scores,
                   mean_returns, std_returns, max_returns, min_returns,
                   mean_lengths, std_lengths, max_lengths, min_lengths,
                   self.evals_n_wins, self.evals_eps,
                   self.evals_idle_mean, self.evals_idle_eps,
                   self.evals_ghosts_mean, self.evals_levels_mean, self.evals_elapsed_time,
                   self.train_ghosts, self.train_levels)

        write_rows(os.path.join(self.log_dir, 'evaluations.csv'), rows, EVAL_HEADER)

        plot_error_bar(self.train_steps, mean_scores, std_scores, max_scores, min_scores,
                       'Evaluations Mean Score on {} Episodes | Best: {:.1f}'.format(
                           eval_eps, self.best_mean_score),
                       'Training Step', 'Evaluation Mean Score',
                       os.path.join(self.log_dir, 'eval_scores.png'))

        plot_error_bar(self.train_steps, mean_returns, std_returns, max_returns, min_returns,
                       'Evaluations Mean Return on {} Episodes'.format(eval_eps),
                       'Training Step', 'Evaluation Mean Return',
                       os.path.join(self.log_dir, 'eval_returns.png'))

        plot_error_bar(self.train_steps, mean_lengths, std_lengths, max_lengths, min_lengths,
                       'Evaluations Mean Length on {} Episodes'.format(eval_eps),
                       'Training Step', 'Evaluation Mean Length',
                       os.path.join(self.log_dir, 'eval_lengths.png'))

        # Train returns
        x, train_returns = ts2xy(train_results, 'timesteps')

        plot_line(x, train_returns, 'Training Episodes Return | Total Episodes: {}'.format(len(train_returns)),
                  'Training Step', 'Episode Return',
                  os.path.join(self.log_dir, 'train_returns.png'))

        moving_n = 100
        if len(train_returns) < moving_n * 2:
            moving_n = 10

        moving_returns = moving_average(train_returns, n=moving_n)

        plot_line(x[moving_n-1:], moving_returns,
                  'Training Episodes Return Moving Mean | Total Episodes: {}'.format(len(train_returns)),
                  'Training Step', '{} Last Episodes Mean Return'.format(moving_n),
                  os.path.join(self.log_dir, 'train_returns_MM.png'))

        # Train scores
        train_scores = train_results['score'].tolist()

        plot_line(x, train_scores, 'Training Episodes Score | Total Episodes: {}'.format(len(train_scores)),
                  'Training Step', 'Episode Score',
                  os.path.join(self.log_dir, 'train_scores.png'))

        moving_scores = moving_average(train_scores, n=moving_n)

        plot_line(x[moving_n-1:], moving_scores,
                  'Training Episodes Score Moving Mean | Total Episodes: {}'.format(len(train_scores)),
                  'Training Step', '{} Last Episodes Mean Score'.format(moving_n),
                  os.path.join(self.log_dir, 'train_scores_MM.png'))

        # Train ghosts
        plot_line(x, train_ghosts, 'Training Episodes N Ghosts | Total Episodes: {}'.format(len(train_ghosts)),
                  'Training Step', 'Episode N Ghosts',
                  os.path.join(self.log_dir, 'train_ghosts.png'))

        moving_ghosts = moving_average(train_ghosts, n=moving_n)

        plot_line(x[moving_n-1:], moving_ghosts,
                  'Training Episodes Ghosts Moving Mean | Total Episodes: {}'.format(len(train_ghosts)),
                  'Training Step', '{} Last Episodes Mean Ghosts'.format(moving_n),
                  os.path.join(self.log_dir, 'train_ghosts_MM.png'))

        # Train difficulty
        train_diff = train_results['d'].tolist()

        plot_line(x, train_diff, 'Training Episodes Difficulty | Total Episodes: {}'.format(len(train_diff)),
                  'Training Step', 'Episode Difficulty',
                  os.path.join(self.log_dir, 'train_diff.png'))
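
The method above relies on a moving_average(values, n=...) helper whose output is n - 1 elements shorter than its input (hence the x[moving_n-1:] trimming). A hedged sketch of one such helper, mirroring the window-based version in code example #7:

import numpy as np

def moving_average(values, n=100):
    # simple moving mean; output length is len(values) - n + 1
    weights = np.ones(n) / n
    return np.convolve(values, weights, 'valid')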
Code example #11
    for path in Path(log_dir).rglob(env_id + '_*'):
        res_file_list.append(path)

    res_file_list = sorted(res_file_list)
    # print(res_file_list)

    df_list = []
    col_list = []
    count = 1

    for filename in res_file_list:
        # print(filename)
        filename = str(filename)  # convert from Posixpath to string

        W = load_results(filename)
        print(W['r'])

        df_list.append(W['r'])
        col_list.append("seed " + str(count))
        count += 1

    #     plot_results(filename, 'timesteps', "seed nb "+str(count))
    # #     plot_results(filename, 'episodes')
    # #     plot_results(filename, 'walltime_hrs')

    all_rewards = pd.concat(df_list, axis=1)
    all_rewards.columns = col_list

    all_rewards_copy = all_rewards.copy()
    all_rewards["mean_reward"] = all_rewards_copy.mean(axis=1)
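
A hedged continuation sketch for the aggregation above: once the per-seed reward columns are concatenated, a mean curve with a one-standard-deviation band can be plotted. Variable names follow the snippet, and numpy/matplotlib are assumed imported as np/plt:

mean_r = all_rewards_copy.mean(axis=1)
std_r = all_rewards_copy.std(axis=1)
episodes = np.arange(len(mean_r))

plt.plot(episodes, mean_r, label='mean over seeds')
plt.fill_between(episodes, mean_r - std_r, mean_r + std_r, alpha=0.3)
plt.xlabel('Episode')
plt.ylabel('Episode reward')
plt.legend()
plt.show()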
Code example #12
#model_created = False

counter = 0
all_x = []
all_y = []
vert_x = []

counter += 1
env = gym.make(env_name)
env = Monitor(env, log_dir, allow_early_resets=True)
model = DDPG(MlpPolicy, env, verbose=1)
start = time.time()
model.learn(total_timesteps=step_total)
model_loc = os.path.join(models_dir, 'hand')

x, y = ts2xy(load_results(log_dir), 'timesteps')

y = moving_average(y, window=50)
x = x[len(x) - len(y):]
all_x.extend(x)
vert_x.append(x[-1])
all_y.extend(y)
#os.remove(os.path.join(log_dir, "monitor.csv"))

model.save(model_loc)
env.close()
model_created = True
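
A hedged follow-up sketch: reload the saved agent and run a single evaluation episode. env_name and model_loc follow the snippet above; DDPG.load and predict are standard stable-baselines 2.x calls:

loaded_model = DDPG.load(model_loc)
eval_env = gym.make(env_name)
obs = eval_env.reset()
done = False
episode_return = 0.0
while not done:
    action, _ = loaded_model.predict(obs)
    obs, reward, done, _ = eval_env.step(action)
    episode_return += reward
print("evaluation return:", episode_return)
eval_env.close()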