def main(args=None):
    if args is None:
        args = readParser()

    # Initial environment
    env = gym.make(args.env_name)

    job_name = 'MBPO_{}_{}_{}'.format(args.env_name, args.model_type,
                                      args.seed)
    writer = SummaryWriter("tensorboard/{}".format(job_name))
    writer.add_text(
        'hyperparameters', "|param|value|\n|-|-|\n%s" %
        ('\n'.join([f"|{key}|{value}|" for key, value in vars(args).items()])))

    # Set random seed
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)

    # Initialize the SAC agent
    agent = SAC(env.observation_space.shape[0], env.action_space, args)

    # Initial ensemble model
    state_size = np.prod(env.observation_space.shape)
    action_size = np.prod(env.action_space.shape)
    if args.model_type == 'pytorch':
        env_model = EnsembleDynamicsModel(args.num_networks,
                                          args.num_elites,
                                          state_size,
                                          action_size,
                                          args.reward_size,
                                          args.pred_hidden_size,
                                          use_decay=args.use_decay)
    else:
        env_model = construct_model(obs_dim=state_size,
                                    act_dim=action_size,
                                    hidden_dim=args.pred_hidden_size,
                                    num_networks=args.num_networks,
                                    num_elites=args.num_elites)

    # Prediction environment that wraps the learned dynamics model
    predict_env = PredictEnv(env_model, args.env_name, args.model_type)

    # Replay pool for transitions collected in the real environment
    env_pool = ReplayMemory(args.replay_size)
    # Replay pool for model-generated rollouts: one batch of rollout_batch_size
    # rollouts is produced every model_train_freq environment steps, and the
    # pool is sized to retain the last model_retain_epochs epochs of imagined data.
    rollouts_per_epoch = args.rollout_batch_size * args.epoch_length / args.model_train_freq
    model_steps_per_epoch = int(1 * rollouts_per_epoch)
    new_pool_size = args.model_retain_epochs * model_steps_per_epoch
    model_pool = ReplayMemory(new_pool_size)

    # Sampler of environment
    env_sampler = EnvSampler(env)

    train(args, env_sampler, predict_env, agent, env_pool, model_pool, writer)
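# Note: every example on this page assumes a readParser() helper that is not
# shown. The sketch below is a minimal argparse-based stand-in covering the
# fields read in the example above; the default values are illustrative
# assumptions, not the authors' settings. Later examples additionally read
# fields such as num_agents, save_dir, save_model_freq, num_epoch and
# num_eval_episode, which would be declared the same way.
import argparse


def readParser():
    parser = argparse.ArgumentParser(description='MBPO (sketch)')
    parser.add_argument('--env_name', default='Hopper-v2')
    parser.add_argument('--seed', type=int, default=123456)
    parser.add_argument('--model_type', default='pytorch')
    parser.add_argument('--num_networks', type=int, default=7)
    parser.add_argument('--num_elites', type=int, default=5)
    parser.add_argument('--reward_size', type=int, default=1)
    parser.add_argument('--pred_hidden_size', type=int, default=200)
    parser.add_argument('--use_decay', action='store_true')
    parser.add_argument('--replay_size', type=int, default=1000000)
    parser.add_argument('--rollout_batch_size', type=int, default=100000)
    parser.add_argument('--epoch_length', type=int, default=1000)
    parser.add_argument('--model_train_freq', type=int, default=250)
    parser.add_argument('--model_retain_epochs', type=int, default=1)
    return parser.parse_args()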
Example #2
def main():
    logging.basicConfig(filename=time.strftime("%Y%m%d-%H%M%S") + '_train.log',
                        level=logging.INFO)

    args = readParser()

    # Initial environment
    env = gym.make(args.env_name)

    # Set random seed
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)

    # Initialize the ensemble of SAC agents
    agents = []
    for _ in range(args.num_agents):
        agent = SAC(env.observation_space.shape[0], env.action_space, args)
        agents.append(agent)

    # Initial ensemble model
    state_size = np.prod(env.observation_space.shape)
    action_size = np.prod(env.action_space.shape)
    if args.model_type == 'pytorch':
        env_model = Ensemble_Model(args.num_networks, args.num_elites,
                                   state_size, action_size, args.reward_size,
                                   args.pred_hidden_size)
    else:
        env_model = construct_model(obs_dim=state_size,
                                    act_dim=action_size,
                                    hidden_dim=args.pred_hidden_size,
                                    num_networks=args.num_networks,
                                    num_elites=args.num_elites)

    # Prediction environment that wraps the learned dynamics model
    predict_env = PredictEnv(env_model, args.env_name, args.model_type)

    # Initial pool for env
    env_pool = ModelReplayMemory(args.replay_size)
    # Initial pool for model
    rollouts_per_epoch = args.rollout_batch_size * args.epoch_length / args.model_train_freq
    model_steps_per_epoch = int(1 * rollouts_per_epoch)
    new_pool_size = args.model_retain_epochs * model_steps_per_epoch
    model_pool = ModelReplayMemory(new_pool_size)

    # Sampler of environment
    env_sampler = EnvSampler(env)

    train(args, env_sampler, predict_env, agents, env_pool, model_pool)
Example #3
def main(args=None):
    if args is None:
        args = readParser()

    # Initial environment
    env = gym.make(args.env_name)

    # Set random seed
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)

    # Initialize the SAC agent
    agent = SAC(env.observation_space.shape[0], env.action_space, args)

    # Initial ensemble model
    state_size = np.prod(env.observation_space.shape)
    action_size = np.prod(env.action_space.shape)
    if args.model_type == 'pytorch':
        env_model = EnsembleDynamicsModel(args.num_networks,
                                          args.num_elites,
                                          state_size,
                                          action_size,
                                          args.reward_size,
                                          args.pred_hidden_size,
                                          use_decay=args.use_decay)
    else:
        env_model = construct_model(obs_dim=state_size,
                                    act_dim=action_size,
                                    hidden_dim=args.pred_hidden_size,
                                    num_networks=args.num_networks,
                                    num_elites=args.num_elites)

    # Prediction environment that wraps the learned dynamics model
    predict_env = PredictEnv(env_model, args.env_name, args.model_type)

    # Initial pool for env
    env_pool = ReplayMemory(args.replay_size)
    # Initial pool for model
    rollouts_per_epoch = args.rollout_batch_size * args.epoch_length / args.model_train_freq
    model_steps_per_epoch = int(1 * rollouts_per_epoch)
    new_pool_size = args.model_retain_epochs * model_steps_per_epoch
    model_pool = ReplayMemory(new_pool_size)

    # Sampler of environment
    env_sampler = EnvSampler(env)

    train(args, env_sampler, predict_env, agent, env_pool, model_pool)
Example #4
def main(args=None):
    if args is None:
        args = readParser()

    save_model_dir = os.path.join(args.save_dir, args.env_name,
                                  'dynamics_model')
    save_policy_dir = os.path.join(args.save_dir, args.env_name,
                                   'policy_network')
    save_env_buffer_dir = os.path.join(args.save_dir, args.env_name,
                                       'env_buffer')
    save_dynamics_buffer_dir = os.path.join(args.save_dir, args.env_name,
                                            'dynamics_buffer')
    if not os.path.exists(save_model_dir):
        os.makedirs(save_model_dir)
    if not os.path.exists(save_policy_dir):
        os.makedirs(save_policy_dir)
    if not os.path.exists(save_env_buffer_dir):
        os.makedirs(save_env_buffer_dir)
    if not os.path.exists(save_dynamics_buffer_dir):
        os.makedirs(save_dynamics_buffer_dir)

    # Initial environment
    if 'Ant' in args.env_name:
        args.env_name = new_env.register_mbpo_environments()[0]
        print('Loaded TruncatedObs-version of the Ant environment: {}'.format(
            args.env_name))
    # else:
    #     env_name = args.env_name
    env = gym.make(args.env_name)

    job_name = 'MBPO_test_policy_dependent_models_{}_{}_{}'.format(
        args.env_name, args.model_type, args.seed)
    writer = SummaryWriter(
        str(os.path.join(args.save_dir, 'tensorboard', job_name)))
    writer.add_text(
        'hyperparameters', "|param|value|\n|-|-|\n%s" %
        ('\n'.join([f"|{key}|{value}|" for key, value in vars(args).items()])))

    # Set random seed
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)

    # Initialize the SAC agent
    agent = SAC(env.observation_space.shape[0], env.action_space, args)

    # Initial ensemble model
    state_size = np.prod(env.observation_space.shape)
    action_size = np.prod(env.action_space.shape)
    if args.model_type == 'pytorch':
        env_model = EnsembleDynamicsModel(args.num_networks,
                                          args.num_elites,
                                          state_size,
                                          action_size,
                                          args.reward_size,
                                          args.pred_hidden_size,
                                          use_decay=args.use_decay)
    else:
        env_model = construct_model(obs_dim=state_size,
                                    act_dim=action_size,
                                    hidden_dim=args.pred_hidden_size,
                                    num_networks=args.num_networks,
                                    num_elites=args.num_elites)

    # Prediction environment that wraps the learned dynamics model
    predict_env = PredictEnv(env_model, args.env_name, args.model_type)

    # Initial pool for env
    env_pool = ReplayMemory(args.replay_size)
    # Initial pool for model
    rollouts_per_epoch = args.rollout_batch_size * args.epoch_length / args.model_train_freq
    model_steps_per_epoch = int(1 * rollouts_per_epoch)
    new_pool_size = args.model_retain_epochs * model_steps_per_epoch
    model_pool = ReplayMemory(new_pool_size)

    # Sampler of environment
    env_sampler = EnvSampler(env)

    train(args, env_sampler, predict_env, agent, env_pool, model_pool, writer,
          save_model_dir, save_policy_dir, save_env_buffer_dir,
          save_dynamics_buffer_dir)

    print('Training complete!')
    print(
        '---------------------------------------------------------------------'
    )
    print(
        'Start evaluating different policies at different model checkpoints...'
    )
    print(
        '---------------------------------------------------------------------'
    )
    test_policy_dependent_models(args, env, state_size, action_size,
                                 args.save_model_freq,
                                 args.save_model_freq * 6, save_model_dir,
                                 save_policy_dir)
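# For concreteness: test_policy_dependent_models (Example #5 below) evaluates
# every (model checkpoint, policy checkpoint) pair on the grid
# np.arange(start_eval, end_eval, save_model_freq). With an assumed
# save_model_freq of 5, the call above scans epochs 5 through 25:
#     >>> np.arange(5, 5 * 6, 5)
#     array([ 5, 10, 15, 20, 25])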
Example #5
def test_policy_dependent_models(args, env, state_size, action_size,
                                 start_eval, end_eval, save_model_dir,
                                 save_policy_dir):
    save_freq = args.save_model_freq
    checkpoint_epochs = np.arange(start_eval, end_eval, save_freq)
    # checkpoint_epochs = np.arange(20, 40, 2)
    # checkpoint_epochs = [20, 26, 32, 38]
    # checkpoint_epochs = np.append(checkpoint_epochs, args.num_epoch-1)
    model_policy_return_dict = {}
    state_error_dict = {}
    reward_error_dict = {}
    with open(
            os.path.join(
                args.save_dir,
                'scaler_mu_std_{}.pkl'.format(str(int(args.num_epoch - 1)))),
            'rb') as f:
        mean, std = pickle.load(f)
    for model_epoch in checkpoint_epochs:
        dynamics_model_checkpoint = torch.load(
            str(
                os.path.join(
                    save_model_dir,
                    'EnsembleDynamicsModel_{}.pt'.format(model_epoch))))
        env_model = EnsembleDynamicsModel(args.num_networks,
                                          args.num_elites,
                                          state_size,
                                          action_size,
                                          args.reward_size,
                                          args.pred_hidden_size,
                                          use_decay=args.use_decay)
        env_model.ensemble_model.load_state_dict(
            dynamics_model_checkpoint['dynamics_model_state_dict'])
        env_model.scaler.mu = mean
        env_model.scaler.std = std
        print('dynamics_model_{} loaded'.format(model_epoch))
        predict_env = PredictEnv(env_model, args.env_name, args.model_type)
        predict_env_sampler = Predict_EnvSample(env, predict_env)
        for policy_epoch in checkpoint_epochs:
            # Load the policy checkpoint corresponding to this policy_epoch
            policy_network_checkpoint = torch.load(
                str(
                    os.path.join(save_policy_dir,
                                 'PolicyNetwork_{}.pt'.format(policy_epoch))))
            agent = SAC(env.observation_space.shape[0], env.action_space, args)
            agent.policy.load_state_dict(
                policy_network_checkpoint['policy_model_state_dict'])
            avg_episode_reward = []
            for i in range(args.num_eval_episode):
                predict_env_sampler.current_state = None
                sum_reward = 0
                done = False
                counter = 0
                state_error = []
                reward_error = []
                while not done and counter < args.epoch_length:
                    cur_state, action, next_state, reward, done, info, model_error = predict_env_sampler.sample(
                        agent, eval_t=True, ret_true_reward=False)
                    sum_reward += reward
                    counter += 1
                    state_error.append(model_error[0])
                    reward_error.append(model_error[1])
                # logging.info('Policy epoch{} | DynamicsModel epoch{} | number of steps: {} | inner eval num: {} | sum reward: {} | model_error: {}'.format(policy_epoch, model_epoch, counter, i, sum_reward, np.sum(model_error_list)))
                avg_episode_reward.append(sum_reward)
                # writer.add_scalar('returns/mean_eval_return_model_{}_policy_{}'.format(model_epoch, policy_epoch), sum_reward, i)
            mean_episode_reward = torch.mean(
                torch.tensor(avg_episode_reward) * 1.)
            std_episode_reward = torch.std(
                torch.tensor(avg_episode_reward) * 1.)
            model_policy_return_dict['model_{}_policy_{}'.format(
                model_epoch, policy_epoch)] = [
                    mean_episode_reward.item(),
                    std_episode_reward.item()
                ]
            state_error_dict['model_{}_policy_{}'.format(
                model_epoch, policy_epoch)] = state_error
            reward_error_dict['model_{}_policy_{}'.format(
                model_epoch, policy_epoch)] = reward_error
            print(
                'model epoch: {} | policy epoch: {} | mean return: {:.3f} | state error: {:.2f} | reward error: {:.2f} | total steps: {} | Done'
                .format(model_epoch, policy_epoch, mean_episode_reward,
                        np.mean(state_error), np.mean(reward_error), counter))
    with open(
            str(
                os.path.join(
                    args.save_dir, args.env_name,
                    'model_policy_return_dict_{}_{}_{}'.format(
                        start_eval, save_freq, end_eval))), 'w') as f:
        json.dump(model_policy_return_dict, f)
    with open(
            str(
                os.path.join(
                    args.save_dir, args.env_name,
                    'state_error_dict_{}_{}_{}.json'.format(
                        start_eval, save_freq, end_eval))), 'w') as f:
        json.dump(state_error_dict, f)
    with open(
            str(
                os.path.join(
                    args.save_dir, args.env_name,
                    'reward_error_dict_{}_{}_{}.json'.format(
                        start_eval, save_freq, end_eval))), 'w') as f:
        json.dump(
            {
                k: np.array(v).astype(np.float64).tolist()
                for k, v in reward_error_dict.items()
            }, f)
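# The torch.load / pickle.load calls in the function above expect (i) model and
# policy checkpoints saved as dicts keyed by 'dynamics_model_state_dict' and
# 'policy_model_state_dict', and (ii) a pickled (mean, std) pair for the input
# scaler. A minimal sketch of how such checkpoints could be written during
# training follows; the save_checkpoints helper and its exact call site are
# assumptions, not part of the original code.
import os
import pickle
import torch


def save_checkpoints(epoch, env_model, agent, save_model_dir, save_policy_dir,
                     save_dir):
    # Ensemble dynamics model weights, stored under the key read back above.
    torch.save(
        {'dynamics_model_state_dict': env_model.ensemble_model.state_dict()},
        os.path.join(save_model_dir,
                     'EnsembleDynamicsModel_{}.pt'.format(epoch)))
    # SAC policy network weights.
    torch.save({'policy_model_state_dict': agent.policy.state_dict()},
               os.path.join(save_policy_dir,
                            'PolicyNetwork_{}.pt'.format(epoch)))
    # Input scaler statistics, reloaded as (mean, std) before evaluation.
    with open(
            os.path.join(save_dir, 'scaler_mu_std_{}.pkl'.format(epoch)),
            'wb') as f:
        pickle.dump((env_model.scaler.mu, env_model.scaler.std), f)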
Example #6
def main(args=None):
    if args is None:
        args = readParser()

    # if not os.path.exists(args.save_model_path):
    #     os.makedirs(args.save_model_path)
    # if not os.path.exists(args.save_policy_path):
    #     os.makedirs(args.save_policy_path)

    # Initial environment
    env = gym.make(args.env_name)

    # job_name = 'MBPO_test_policy_dependent_models_{}_{}_{}'.format(args.env_name, args.model_type, args.seed)
    # writer = SummaryWriter("test_policy_dependent_results_2/tensorboard/{}".format(job_name))
    # writer.add_text('hyperparameters', "|param|value|\n|-|-|\n%s" % (
    #     '\n'.join([f"|{key}|{value}|" for key, value in vars(args).items()])))

    # Set random seed
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)

    # Initialize the SAC agent
    agent = SAC(env.observation_space.shape[0], env.action_space, args)
    policy_network_checkpoint = torch.load(
        './test_policy_dependent_results_2/policy/PolicyNetwork_20.pt')
    agent.policy.load_state_dict(
        policy_network_checkpoint['policy_model_state_dict'])

    # Initial ensemble model
    state_size = np.prod(env.observation_space.shape)
    action_size = np.prod(env.action_space.shape)
    if args.model_type == 'pytorch':
        env_model = EnsembleDynamicsModel(args.num_networks,
                                          args.num_elites,
                                          state_size,
                                          action_size,
                                          args.reward_size,
                                          args.pred_hidden_size,
                                          use_decay=args.use_decay)
    else:
        env_model = construct_model(obs_dim=state_size,
                                    act_dim=action_size,
                                    hidden_dim=args.pred_hidden_size,
                                    num_networks=args.num_networks,
                                    num_elites=args.num_elites)
    dynamics_model_checkpoint = torch.load(
        './test_policy_dependent_results_2/dynamics_model/EnsembleDynamicsModel_20.pt'
    )
    env_model.ensemble_model.load_state_dict(
        dynamics_model_checkpoint['dynamics_model_state_dict'])

    # Prediction environment that wraps the learned dynamics model
    predict_env = PredictEnv(env_model, args.env_name, args.model_type)

    # Initial pool for env
    env_pool = ReplayMemory(args.replay_size)
    env_pool.load(
        './test_policy_dependent_results_2/env_buffer/env_buffer_20.pkl')
    env_pool.position = len(env_pool.buffer)
    # env_pool.buffer = np.array(env_pool.buffer)[~np.where(np.array(env_pool.buffer)==None)[0]]
    # Initial pool for model
    rollouts_per_epoch = args.rollout_batch_size * args.epoch_length / args.model_train_freq
    model_steps_per_epoch = int(1 * rollouts_per_epoch)
    new_pool_size = args.model_retain_epochs * model_steps_per_epoch
    model_pool = ReplayMemory(new_pool_size)
    model_pool.load(
        './test_policy_dependent_results_2/model_buffer/model_buffer_20.pkl')
    model_pool.position = len(model_pool.buffer)
    # model_pool.buffer = np.array(model_pool.buffer)[~np.where(np.array(model_pool.buffer)==None)[0]]

    # Sampler of environment
    env_sampler = EnvSampler(env)

    train(args, env_sampler, predict_env, agent, env_pool, model_pool)
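# Like the other examples, these scripts would typically be run directly; a
# standard entry-point guard (omitted from the excerpts above) is assumed:
if __name__ == '__main__':
    main()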