def main():

    # create Environment
    env = iCubPushGymEnv(urdfRoot=robot_data.getDataPath(), renders=False, useIK=1,
                        isDiscrete=0, rnd_obj_pose=0, maxSteps=2000, reward_type=0)

    # set seed
    seed = 1
    tf.reset_default_graph()
    set_global_seed(seed)
    env.seed(seed)

    # set log
    monitor_dir = os.path.join(log_dir,'log')
    os.makedirs(monitor_dir, exist_ok=True)
    env = Monitor(env, monitor_dir+'/', allow_early_resets=True)

    # create agent model
    nb_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(nb_actions), sigma=float(0.5373) * np.ones(nb_actions))

    model = DDPG('LnMlpPolicy', env, action_noise=action_noise, gamma=0.99, batch_size=16,
                 normalize_observations=True, normalize_returns=False, memory_limit=100000,
                 verbose=1, tensorboard_log=os.path.join(log_dir, 'tb'), full_tensorboard_log=False)

    #start learning
    model.learn(total_timesteps=500000, seed=seed, callback=callback)

    # save model
    print("Saving final_model.pkl to ", log_dir)
    model.save(log_dir + "/final_model.pkl")
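
After training, the saved agent can be reloaded for a quick sanity-check rollout. The snippet below is a minimal sketch, assuming the final_model.pkl written above and an environment constructed with the same settings as in main():

from stable_baselines import DDPG

# Reload the trained agent and run a short deterministic rollout
model = DDPG.load(os.path.join(log_dir, "final_model.pkl"))
obs = env.reset()
for _ in range(1000):
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    if done:
        obs = env.reset()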
Example 2
def td3(env_id,
        timesteps,
        policy="MlpPolicy",
        log_interval=None,
        tensorboard_log=None,
        seed=None,
        load_weights=None):
    from stable_baselines.ddpg.noise import NormalActionNoise
    env = gym.make(env_id)

    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))

    if load_weights is not None:
        model = TD3.load(load_weights, env, verbose=0)
    else:
        model = TD3(policy,
                    env,
                    action_noise=action_noise,
                    verbose=1,
                    tensorboard_log=tensorboard_log)

    callback = WandbRenderEnvCallback(model_name="td3", env_name=env_id)

    model.learn(total_timesteps=timesteps,
                log_interval=log_interval,
                callback=callback)
Example 3
def run_experiment(verbose, tensorboard_log, learning_rate):
    pdb.set_trace()
    env = make_vec_env(
        'PointMassDense-%d-v1' % num_objs,
        1,
        wrapper_class=FlattenDictWrapper,
        wrapper_env_kwargs=['observation', 'achieved_goal', 'desired_goal'])
    env = VecVideoRecorder(
        env,
        osp.join(logger, "videos"),
        record_video_trigger=lambda x: x % save_video_interval == 0,
        video_length=save_video_length)

    n_actions = env.action_space.shape[-1]
    stddev = 0.2
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))

    model = SAC(
        MlpPolicy,
        env,
        verbose=verbose,
        tensorboard_log=logger,
        learning_rate=learning_rate,
        action_noise=action_noise,
    )
    model.learn(total_timesteps=int(nIter), log_interval=100)
    model.save(expDir + "/%s/%s_%s" %
               (name, np.format_float_scientific(nIter),
                np.format_float_scientific(learning_rate)))
    env.close()
Example 4
def run_stable(num_steps, save_dir):
    env = make_vec_env(BBall3Env,
                       n_envs=1,
                       monitor_dir=save_dir,
                       env_kwargs=env_config)
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.5 * np.ones(n_actions))

    model = TD3(
        MlpPolicy,
        env,
        action_noise=action_noise,
        verbose=1,
        gamma=0.99,
        buffer_size=1000000,
        learning_starts=10000,
        batch_size=100,
        learning_rate=1e-3,
        train_freq=1000,
        gradient_steps=1000,
        policy_kwargs={"layers": [64, 64]},
        n_cpu_tf_sess=1,
    )

    num_epochs = 1
    total_steps = 5e5

    for epoch in range(num_epochs):
        model.learn(total_timesteps=int(total_steps / num_epochs))
        model.save(save_dir + "/model.zip")
Example 5
def get_TD3_model(model_settings, model_path, ckpt_path, ckpt_step, tb_path):
    policy_kwargs = dict(layers=model_settings['NET_LAYERS'])
    env = get_single_process_env(model_settings, model_path, ckpt_step)
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))
    if ckpt_path is not None:
        print("Loading model from checkpoint '{}'".format(ckpt_path))
        model = TD3.load(ckpt_path,
                         env=env,
                         _init_setup_model=True,
                         policy_kwargs=policy_kwargs,
                         **model_settings['train_configs'],
                         action_noise=action_noise,
                         verbose=1,
                         tensorboard_log=tb_path)
        model.num_timesteps = ckpt_step
    else:
        model = TD3(TD3MlpPolicy,
                    env,
                    _init_setup_model=True,
                    policy_kwargs=policy_kwargs,
                    action_noise=action_noise,
                    **model_settings['train_configs'],
                    verbose=1,
                    tensorboard_log=tb_path)

    return model, env
Example 6
def create_action_noise(env, noise_type):
    action_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                             sigma=float(stddev) *
                                             np.ones(nb_actions))
            # action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mean=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))
    return action_noise
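
The Normal and Ornstein-Uhlenbeck noise objects built above are stateful callables: each call returns one noise sample per action dimension. A minimal sketch contrasting the two, assuming a 2-dimensional action space:

import numpy as np
from stable_baselines.ddpg.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise

nb_actions = 2  # assumed action dimensionality, for illustration only
normal_noise = NormalActionNoise(mean=np.zeros(nb_actions), sigma=0.1 * np.ones(nb_actions))
ou_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(nb_actions), sigma=0.1 * np.ones(nb_actions))

print(normal_noise())  # independent Gaussian sample at every call
print(ou_noise())      # temporally correlated: each sample depends on the previous one
ou_noise.reset()       # OU noise keeps internal state; reset it between episodes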
Example 7
def parse_noise_types(noise_type, nb_actions):
    """
    Parse noise types for policies
    """
    action_noise = None
    param_noise = None
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                             sigma=float(stddev) *
                                             np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mean=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))
    return action_noise, param_noise
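
For reference, a hypothetical call showing how a comma-separated specification is split across the two return values (the string and the action dimensionality are illustrative):

# 'adaptive-param_0.2' configures parameter noise, 'normal_0.1' configures action noise
action_noise, param_noise = parse_noise_types('adaptive-param_0.2,normal_0.1', nb_actions=2)
# action_noise is a NormalActionNoise with sigma 0.1 on both dimensions,
# param_noise is an AdaptiveParamNoiseSpec with stddev 0.2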
Example 8
def train_TD3(env, out_dir, seed=None, **kwargs):

    # Logs will be saved in log_dir/monitor.csv
    global output_dir,log_dir
    output_dir = out_dir
    log_dir = os.path.join(out_dir, 'log')
    os.makedirs(log_dir, exist_ok=True)
    env = Monitor(env, log_dir+'/', allow_early_resets=True)

    policy = kwargs['policy']
    n_timesteps = kwargs['n_timesteps']
    noise_type = kwargs['noise_type']
    del kwargs['policy']
    del kwargs['n_timesteps']
    del kwargs['noise_type']

    ''' Parameter space noise:
    injects randomness directly into the parameters of the agent, altering the types of decisions it makes
    such that they always fully depend on what the agent currently senses.
    Note: stable-baselines' TD3 supports action noise only, so the noise string below is parsed
    into a Normal or Ornstein-Uhlenbeck action noise object. '''

    # the noise objects for TD3
    nb_actions = env.action_space.shape[-1]
    action_noise = None
    if noise_type is not None:

        for current_noise_type in noise_type.split(','):

            current_noise_type = current_noise_type.strip()

            if 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mean=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))

            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(nb_actions),
                                                            sigma=float(stddev) * np.ones(nb_actions))

            else:
                raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    if 'continue' in kwargs and kwargs['continue'] is True:
        # Continue training: the policy comes from the saved model
        # ('policy' was already removed from kwargs above)
        print("Loading pretrained agent")
        del kwargs['continue']
        model = TD3.load(os.path.join(out_dir, 'final_model.pkl'), env=env,
                         tensorboard_log=os.path.join(log_dir, 'tb'), verbose=1, **kwargs)
    else:
        if 'continue' in kwargs:
            del kwargs['continue']
        model = TD3(policy, env, action_noise=action_noise, seed=seed,
                    verbose=1, tensorboard_log=os.path.join(log_dir, 'tb'),
                    full_tensorboard_log=False, **kwargs)

    model.learn(total_timesteps=n_timesteps, callback=log_callback)

    return model
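
TD3 in stable-baselines accepts action noise only, so the parameter-space noise described in the comment above is not used here; it is available for DDPG. A minimal sketch of that variant, assuming env is any continuous-action Gym environment:

from stable_baselines import DDPG
from stable_baselines.ddpg.noise import AdaptiveParamNoiseSpec

# Perturb the policy weights instead of the actions (supported by DDPG, not TD3)
param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.2, desired_action_stddev=0.2)
model = DDPG('MlpPolicy', env, param_noise=param_noise, verbose=1)
model.learn(total_timesteps=10000)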
Example 9
def train():
    set_gpu()
    expDir = '/home/shivanik/lab/pointExp/state/'
    num_objs = 1

    verbose = 1
    name = 'sac_%d_0.5' % num_objs
    nIter = 1e8

    save_video_length = 200
    save_video_interval = 1000000
    file = open('sac_done.txt', 'w+')
    env = make_vec_env(
        'PointMassDense-%d-v1' % num_objs,
        1,
        wrapper_class=FlattenDictWrapper,
        wrapper_env_kwargs=['observation', 'achieved_goal', 'desired_goal'])
    n_actions = env.action_space.shape[-1]
    stddev = 0.2

    pool = multiprocessing.Pool(processes=4)
    for lr in [1e-5]:  #, 5e-4, 1e-5
        logger = osp.join(
            expDir, name, 'logs%s_%s' % (np.format_float_scientific(nIter),
                                         np.format_float_scientific(lr)))
        env = VecVideoRecorder(
            env,
            osp.join(logger, "videos"),
            record_video_trigger=lambda x: x % save_video_interval == 0,
            video_length=save_video_length)
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=0.1 * np.ones(n_actions))

        # boo = pool.apply_async(func_run, args=(env, logger, lr, action_noise, file))
        model = SAC(
            MlpPolicy,
            env,
            verbose=verbose,
            tensorboard_log=logger,
            learning_rate=lr,
            action_noise=action_noise,
        )
        model.learn(total_timesteps=int(nIter), log_interval=100)
        exp_name = expDir + "/%s/%s_%s" % (name,
                                           np.format_float_scientific(nIter),
                                           np.format_float_scientific(lr))
        model.save(exp_name)
        file.write(exp_name + '\n')
        env.close()
    file.close()
    pool.close()
    pool.join()
Example 10
    def run(self):
        self._init()

        env = self.env
        model = self.model
        objective = self.objective

        if objective == "infogain":
            wenv = InfogainEnv(env, model)
        elif objective == "prederr":
            wenv = PrederrEnv(env, model)
        else:
            raise AttributeError(
                "Objective '{}' is unknown. Needs to be 'infogain' or 'prederr'"
                .format(objective))

        wenv.max_episode_len = self.horizon
        wenv.end_episode_callback = self._end_episode
        dvenv = DummyVecEnv([lambda: wenv])

        if self.rl_algo == "ddpg":
            self.logger.info("Setting up DDPG as model-free RL algorithm.")
            pn = AdaptiveParamNoiseSpec()
            an = NormalActionNoise(np.array([0]), np.array([1]))
            rl_model = DDPG(DDPGMlpPolicy,
                            dvenv,
                            verbose=1,
                            render=False,
                            action_noise=an,
                            param_noise=pn,
                            nb_rollout_steps=self.horizon,
                            nb_train_steps=self.horizon)
        elif self.rl_algo == "sac":
            self.logger.info("Setting up SAC as model-free RL algorithm.")
            rl_model = SAC(SACMlpPolicy,
                           dvenv,
                           verbose=1,
                           learning_starts=self.horizon)
        else:
            raise AttributeError(
                "Model-free RL algorithm '{}' is unknown.".format(
                    self.rl_algo))

        # Train the agent
        max_steps_total = self.horizon * self.n_episodes * 100
        try:
            self.logger.info("Start the agent")
            rl_model.learn(total_timesteps=max_steps_total, seed=self.seed)
        except MaxEpisodesReachedException:
            print("Exploration finished.")
Example 11
def td3(env_id,
        timesteps,
        policy="MlpPolicy",
        log_interval=None,
        tensorboard_log=None,
        seed=None):
    from stable_baselines.ddpg.noise import NormalActionNoise
    env = gym.make(env_id)

    # The noise objects for TD3
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))

    model = TD3(policy,
                env,
                action_noise=action_noise,
                verbose=1,
                tensorboard_log=tensorboard_log)
    model.learn(total_timesteps=timesteps, log_interval=log_interval)

    save_model_weights(model, "td3", env_id, policy, seed)
def train_initial_policy(model_name,
                         algo=ALGO,
                         env_name=ENV_NAME,
                         time_steps=TIME_STEPS):
    """Uses the specified algorithm on the target environment"""
    print("Using algorithm : ", algo.__name__)
    print(
        "Model saved as : ", "data/models/" + algo.__name__ +
        "_initial_policy_" + env_name + "_.pkl")
    constrained = False

    # define the environment here
    env = gym.make(env_name)
    if NOISE_VALUE > 0: env = NoisyRealEnv(env, noise_value=NOISE_VALUE)

    if MUJOCO_NORMALIZE:
        env = MujocoNormalized(env)

    print('~~ ENV Obs RANGE : ', env.observation_space.low,
          env.observation_space.high)
    print('~~~ ENV Action RANGE : ', env.action_space.low,
          env.action_space.high)

    if TIMEWRAPPER:
        # env = TimeFeatureWrapper(env)
        env = TimeLimit(env, 1000)

    if algo.__name__ == "ACKTR":
        print('Using SubprocVecEnv')
        env = SubprocVecEnv([lambda: env for i in range(8)])
    elif algo.__name__ == "SAC":
        print('Using standard gym environment')
        env = env
    else:
        print('Using Dummy Vec Env')
        env = DummyVecEnv([lambda: env])

    if NORMALIZE:
        env = VecNormalize(
            env,
            training=True,
            norm_obs=True,
            norm_reward=False,
            clip_reward=1e6,
        )

    with open('data/target_policy_params.yaml') as file:
        args = yaml.load(file, Loader=yaml.FullLoader)
    args = args[algo.__name__][PARAMS_ENV]
    print('~~ Loaded args file ~~')

    if algo.__name__ == "SAC":
        print('Initializing SAC with RLBaselinesZoo hyperparameters .. ')
        print('using 256 node architecture as in the paper')

        class CustomPolicy(ffp_sac):
            def __init__(self, *args, **kwargs):
                super(CustomPolicy, self).__init__(*args,
                                                   **kwargs,
                                                   feature_extraction="mlp",
                                                   layers=[256, 256])

        model = SAC(
            CustomPolicy,
            env,
            verbose=1,
            tensorboard_log='data/TBlogs/initial_policy_training',
            batch_size=args['batch_size'],
            buffer_size=args['buffer_size'],
            ent_coef=args['ent_coef'],
            learning_starts=args['learning_starts'],
            learning_rate=args['learning_rate'],
            train_freq=args['train_freq'],
        )
    elif algo.__name__ == "TD3":
        print('Initializing TD3 with RLBaselinesZoo hyperparameters .. ')
        # hyperparameters suggestions from :
        # https://github.com/araffin/rl-baselines-zoo/blob/master/trained_agents/td3/HopperBulletEnv-v0/config.yml
        n_actions = env.action_space.shape[-1]
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=float(args['noise_std']) *
                                         np.ones(n_actions))

        class CustomPolicy2(ffp_td3):
            def __init__(self, *args, **kwargs):
                super(CustomPolicy2, self).__init__(*args,
                                                    **kwargs,
                                                    feature_extraction="mlp",
                                                    layers=[400, 300])

        model = TD3(
            CustomPolicy2,
            env,
            verbose=1,
            tensorboard_log='data/TBlogs/initial_policy_training',
            batch_size=args['batch_size'],
            buffer_size=args['buffer_size'],
            gamma=args['gamma'],
            gradient_steps=args['gradient_steps'],
            learning_rate=args['learning_rate'],
            learning_starts=args['learning_starts'],
            action_noise=action_noise,
            train_freq=args['train_freq'],
        )

    elif algo.__name__ == "TRPO":
        print('Initializing TRPO with RLBaselinesZoo hyperparameters .. ')
        # hyperparameters suggestions from :
        # https://github.com/araffin/rl-baselines-zoo/blob/master/trained_agents/sac/HopperBulletEnv-v0/config.yml
        model = TRPO(mlp_standard,
                     env,
                     verbose=1,
                     tensorboard_log='data/TBlogs/initial_policy_training',
                     timesteps_per_batch=args['timesteps_per_batch'],
                     lam=args['lam'],
                     max_kl=args['max_kl'],
                     gamma=args['gamma'],
                     vf_iters=args['vf_iters'],
                     vf_stepsize=args['vf_stepsize'],
                     entcoeff=args['entcoeff'],
                     cg_damping=args['cg_damping'],
                     cg_iters=args['cg_iters'])

    elif algo.__name__ == "ACKTR":
        print('Initializing ACKTR')
        model = ACKTR(mlp_standard,
                      env,
                      verbose=1,
                      n_steps=128,
                      ent_coef=0.01,
                      lr_schedule='constant',
                      learning_rate=0.0217,
                      max_grad_norm=0.5,
                      gamma=0.99,
                      vf_coef=0.946)

    elif algo.__name__ == "PPO2":
        print('Initializing PPO2')
        print('Num envs : ', env.num_envs)
        model = PPO2(
            mlp_standard,
            env,
            n_steps=int(args['n_steps'] / env.num_envs),
            nminibatches=args['nminibatches'],
            lam=args['lam'],
            gamma=args['gamma'],
            ent_coef=args['ent_coef'],
            noptepochs=args['noptepochs'],
            learning_rate=args['learning_rate'],
            cliprange=args['cliprange'],
            verbose=1,
            tensorboard_log='data/TBlogs/initial_policy_training',
        )

    elif algo.__name__ == "TRPO_lagrangian":
        print(
            'Initializing TRPO-lagrangian with safety-starter-agents hyperparameters .. '
        )

        model = TRPO_lagrangian(
            MLPWithSafeValue,
            env,
            verbose=1,
            tensorboard_log='data/TBlogs/initial_policy_training',
            timesteps_per_batch=args['timesteps_per_batch'],
            lam=args['lam'],
            max_kl=args['max_kl'],
            gamma=args['gamma'],
            vf_iters=args['vf_iters'],
            vf_stepsize=args['vf_stepsize'],
            entcoeff=args['entcoeff'],
            cg_damping=args['cg_damping'],
            cg_iters=args['cg_iters'],
            cost_lim=args['cost_lim'],
            penalty_init=args['penalty_init'],
            penalty_lr=args['penalty_lr'])
        constrained = True
    else:
        print('No algorithm matched. Using SAC .. ')
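        # NOTE: CustomPolicy is defined only inside the SAC branch above, so reaching
        # this fallback with a different algorithm raises a NameError.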
        model = SAC(
            CustomPolicy,
            env,
            verbose=1,
            batch_size=args['batch_size'],
            buffer_size=args['buffer_size'],
            ent_coef=args['ent_coef'],
            learning_starts=args['learning_starts'],
            learning_rate=args['learning_rate'],
            train_freq=args['train_freq'],
        )

    # change model name if using normalization
    if NORMALIZE:
        model_name = model_name.replace('.pkl', 'normalized_.pkl')

    elif MUJOCO_NORMALIZE:
        model_name = model_name.replace('.pkl', 'mujoco_norm_.pkl')

    if SAVE_BEST_FOR_20:
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name,
                    log_interval=10,
                    callback=eval_callback)
        save_the_model()
        model_name = model_name.replace('best_', '')
        model.save(model_name)

    else:
        model.learn(
            total_timesteps=time_steps,
            tb_log_name=model_name.split('/')[-1],
            log_interval=10,
        )
        model.save(model_name)
        evaluate_policy_on_env(env,
                               model,
                               render=False,
                               iters=10,
                               constrained=constrained)

    # save the environment params
    if NORMALIZE:
        # env.save(model_name.replace('.pkl', 'stats_.pkl'))
        env.save('data/models/env_stats/' + env_name + '.pkl')

    print('done :: ', model_name)
    exit()
def train_initial_policy(
        model_name,
        algo=ALGO,
        env_name=ENV_NAME,
        time_steps=TIME_STEPS):
    """Uses the specified algorithm on the target environment"""
    print("Using algorithm : ", algo.__name__)
    print("Model saved as : ", "data/models/" +algo.__name__+"_initial_policy_"+env_name+"_.pkl")

    # define the environment here
    env = gym.make(env_name)
    env.seed(SEED)
    if NOISE_VALUE > 0: env = NoisyRealEnv(env, noise_value=NOISE_VALUE)

    if MUJOCO_NORMALIZE:
        env = MujocoNormalized(env)

    print('~~ ENV Obs RANGE : ', env.observation_space.low, env.observation_space.high)
    print('~~~ ENV Action RANGE : ', env.action_space.low, env.action_space.high)

    if algo.__name__ == "ACKTR":
        print('Using SubprocVecEnv')
        env = SubprocVecEnv([lambda: env for i in range(8)])
    elif algo.__name__ == "SAC":
        print('Using standard gym environment')
        env = env
    else:
        print('Using Dummy Vec Env')
        env = DummyVecEnv([lambda : env])

    if NORMALIZE :
        env = VecNormalize(env,
                           training=True,
                           norm_obs=True,
                           norm_reward=False,
                           clip_reward=1e6,
                           )


    with open('data/target_policy_params.yaml') as file:
        args = yaml.load(file, Loader=yaml.FullLoader)
    args = args[algo.__name__][PARAMS_ENV]
    print('~~ Loaded args file ~~')

    if algo.__name__ == "SAC":
        print('Initializing SAC with RLBaselinesZoo hyperparameters .. ')
        print('using 256 node architecture as in the paper')

        class CustomPolicy(ffp_sac):
            def __init__(self, *args, **kwargs):
                super(CustomPolicy, self).__init__(*args, **kwargs,
                                                   feature_extraction="mlp", layers=[256, 256])

        model = SAC(CustomPolicy, env,
                    verbose=1,
                    tensorboard_log='data/TBlogs/initial_policy_training',
                    batch_size=args['batch_size'],
                    buffer_size=args['buffer_size'],
                    ent_coef=args['ent_coef'],
                    learning_starts=args['learning_starts'],
                    learning_rate=args['learning_rate'],
                    train_freq=args['train_freq'],
                    seed=SEED,
                    )
    elif algo.__name__ == "TD3":
        print('Initializing TD3 with RLBaselinesZoo hyperparameters .. ')
        # hyperparameters suggestions from :
        # https://github.com/araffin/rl-baselines-zoo/blob/master/trained_agents/td3/HopperBulletEnv-v0/config.yml
        n_actions = env.action_space.shape[-1]
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=float(args['noise_std']) * np.ones(n_actions))
        class CustomPolicy2(ffp_td3):
            def __init__(self, *args, **kwargs):
                super(CustomPolicy2, self).__init__(*args, **kwargs,
                                                   feature_extraction="mlp", layers=[400, 300])
        model = TD3(CustomPolicy2, env,
                    verbose=1,
                    tensorboard_log='data/TBlogs/initial_policy_training',
                    batch_size=args['batch_size'],
                    buffer_size=args['buffer_size'],
                    gamma=args['gamma'],
                    gradient_steps=args['gradient_steps'],
                    learning_rate=args['learning_rate'],
                    learning_starts=args['learning_starts'],
                    action_noise=action_noise,
                    train_freq=args['train_freq'],
                    seed=SEED,
                    )

    elif algo.__name__ == "TRPO":
        print('Initializing TRPO with RLBaselinesZoo hyperparameters .. ')
        # hyperparameters suggestions from :
        # https://github.com/araffin/rl-baselines-zoo/blob/master/trained_agents/sac/HopperBulletEnv-v0/config.yml
        model = TRPO(mlp_standard, env,
                    verbose=1,
                    tensorboard_log='data/TBlogs/initial_policy_training',
                    timesteps_per_batch=args['timesteps_per_batch'],
                    lam=args['lam'],
                    max_kl=args['max_kl'],
                    gamma=args['gamma'],
                    vf_iters=args['vf_iters'],
                    vf_stepsize=args['vf_stepsize'],
                    entcoeff=args['entcoeff'],
                    cg_damping=args['cg_damping'],
                    cg_iters=args['cg_iters'],
                     seed=SEED,
                    )

    elif algo.__name__ == "ACKTR":
        print('Initializing ACKTR')
        model = ACKTR(mlp_standard,
                      env,
                      verbose=1,
                      n_steps=128,
                      ent_coef=0.01,
                      lr_schedule='constant',
                      learning_rate=0.0217,
                      max_grad_norm=0.5,
                      gamma=0.99,
                      vf_coef=0.946,
                      seed=SEED)

    elif algo.__name__ == "PPO2":
        print('Initializing PPO2')
        print('Num envs : ', env.num_envs)
        model = PPO2(mlp_standard,
                     env,
                     n_steps=int(args['n_steps']/env.num_envs),
                     nminibatches=args['nminibatches'],
                     lam=args['lam'],
                     gamma=args['gamma'],
                     ent_coef=args['ent_coef'],
                     noptepochs=args['noptepochs'],
                     learning_rate=args['learning_rate'],
                     cliprange=args['cliprange'],
                     verbose=1,
                     tensorboard_log='data/TBlogs/initial_policy_training',
                     seed=SEED,
                     )

    else:
        print('No algorithm matched. Using SAC .. ')
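        # NOTE: CustomPolicy is defined only inside the SAC branch above, so reaching
        # this fallback with a different algorithm raises a NameError.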
        model = SAC(CustomPolicy, env,
                    verbose=1,
                    batch_size=args['batch_size'],
                    buffer_size=args['buffer_size'],
                    ent_coef=args['ent_coef'],
                    learning_starts=args['learning_starts'],
                    learning_rate=args['learning_rate'],
                    train_freq=args['train_freq'],
                    seed=SEED,
                    )

    # change model name if using normalization
    if NORMALIZE:
        model_name = model_name.replace('.pkl', 'normalized_.pkl')

    elif MUJOCO_NORMALIZE:
        model_name = model_name.replace('.pkl', 'mujoco_norm_.pkl')

    if SAVE_BEST_FOR_20:
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name,
                    log_interval=10,
                    callback=eval_callback)
        save_the_model()
        model_name = model_name.replace('best_', '')
        model.save(model_name)
    elif SAVE_INTERMEDIATE:
        check_callback = CheckpointCallback(save_freq=SAVE_FREQ,
                                            save_path=model_name[:-4],
                                            name_prefix=ENV_NAME + '_' + str(SEED),
                                            verbose=1,
                                            )
        eval_env = DummyVecEnv([lambda: gym.make(ENV_NAME)])
        eval_env.seed(SEED)
        eval_callback = EvalCallback(eval_env,
                                     n_eval_episodes=10,
                                     eval_freq=SAVE_FREQ,
                                     log_path=model_name[:-4],
                                     deterministic=False,
                                     render=False,
                                     verbose=1)

        callbacks = CallbackList([check_callback, eval_callback])
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name.split('/')[-1],
                    log_interval=10,
                    callback=callbacks)
        model.save(model_name)
        npzfile = np.load(model_name[:-4] + '/evaluations.npz')
        average_rewards = np.mean(npzfile['results'], axis=1)[:, 0]
        with open(model_name[:-4] + "/eval_results.txt", "a") as f:
            for i in range(np.shape(average_rewards)[0]):
                f.write("{}, {}\n".format(npzfile['timesteps'][i], average_rewards[i]))
        evaluate_policy_on_env(env, model, render=False, iters=50)
    else:
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name.split('/')[-1],
                    log_interval=10,)
        model.save(model_name)
        evaluate_policy_on_env(env, model, render=False, iters=50)

    # save the environment params
    if NORMALIZE:
        # env.save(model_name.replace('.pkl', 'stats_.pkl'))
        env.save('data/models/env_stats/'+env_name+'.pkl')

    print('done :: ', model_name)
    exit()
Example 14
import matplotlib.pyplot as plt
from stable_baselines.common.env_checker import check_env

from stable_baselines.ddpg.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
from stable_baselines.td3.policies import MlpPolicy

from NormalizedActions import NormalizeActionWrapper

from LearningRocket import LearningRocket

env = LearningRocket(VISUALIZE=False)
env = NormalizeActionWrapper(env)
check_env(env, warn=True)

n_actions = env.action_space.shape[-1]
action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                 sigma=0.1 * np.ones(n_actions))

model = TD3(MlpPolicy, env, action_noise=action_noise, verbose=1)
#model = SAC('MlpPolicy', env, verbose=1)
# load() is a classmethod that returns a new model; assign the result so the
# saved weights are actually used (otherwise the freshly initialised model runs)
model = TD3.load("sac_rocket", env=env)

obs = env.reset()
env.sim.VISUALIZE = True
done = False
actionList = []
obsList = []
rewardList = []
rewardSum = []
X = []
Y = []
Z = []
Example 15
def RocketTrainer():
    #env = SubprocVecEnv([make_env(LearningRocket, 'E:\Tobi\LearningRocket\TestHoverTD3\LearningRocketHover.py', i) for i in range(72)])

    # multiprocess environment
    env = make_vec_env(LearningRocket, n_envs=1)
    #env = LearningRocket(visualize=False)
    eval_env = make_vec_env(lambda: LearningRocket(visualize=True), n_envs=1)
    #env = VecNormalize(env)
    #eval_env = VecNormalize(eval_env)

    #env = VecNormalize.load("TestHoverTD3_env",env)
    #eval_env = VecNormalize.load("TestHoverTD3_env",eval_env)

    eval_callback = EvalCallback(eval_env,
                                 best_model_save_path='Agent007',
                                 log_path='./logs/',
                                 eval_freq=10000,
                                 deterministic=True,
                                 render=False,
                                 n_eval_episodes=1)

    #model = PPO2(MlpPolicy, env, n_steps=1500, nminibatches=144, lam=0.98, gamma=0.999, learning_rate=2.5e-4,
    #                                  noptepochs=4,ent_coef=0.01,verbose=1, tensorboard_log="./rocket_tensorboard/",
    #                                  policy_kwargs = dict(layers=[400, 300]))

    #model = PPO1(MlpPolicy, env, lam=0.98, gamma=0.999,verbose=1, tensorboard_log="./rocket_tensorboard/",
    #                                  policy_kwargs = dict(layers=[400, 300]))

    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.2 * np.ones(n_actions))

    model = TD3(MlpPolicy,
                env,
                action_noise=action_noise,
                batch_size=256,
                gamma=0.95,
                target_policy_noise=0.01,
                target_noise_clip=0.02,
                train_freq=10,
                gradient_steps=10,
                learning_rate=1e-3,
                learning_starts=7500,
                verbose=1,
                tensorboard_log="./rocket_tensorboard/",
                policy_kwargs=dict(layers=[400, 300]),
                buffer_size=100000)
    #model = TD3(MlpPolicy,env,verbose=1)

    start = t.time()

    #model = PPO2.load("TestHoverTD3", env=env, tensorboard_log="./rocket_tensorboard/")
    #model = TD3.load("TestHoverTD3", env=env, tensorboard_log="./rocket_tensorboard/")
    #while True:
    #model.learning_rate = 2.5e-3
    model.learn(total_timesteps=200000, callback=eval_callback)
    model.save("TestHoverTD3")
    #env.save("TestHoverTD3_env")
    del model  # remove to demonstrate saving and loading

    duration = t.time() - start

    model = TD3.load("TestHoverTD3", env=eval_env)
    #model = PPO2.load("TestHoverTD3", env=eval_env)

    # Enjoy trained agent
    obs = eval_env.reset()
    data = []
    time = []
    actions = []
    alt_reward = []
    mix_reward = []
    temp_reward = []
    valveChange = []
    speedPunishes = []
    total_reward = []
    alt_cumu = []
    mix_cumu = []
    temp_cumu = []
    total_cumu = []
    start = True
    modifiers = [1000, 1000, 200, 1, 200, 2000, 10, 1000, 1500, 1]

    for i in range(10):
        data.append([])
    for i in range(3):
        actions.append([])
    lastValves = [0.15, 0.2, 0.15]

    for i in range(600):
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, dones, info = eval_env.step(action)
        # Or_obs = eval_env.get_original_obs()

        time.append(i)
        for j in range(10):
            data[j].append(obs[0][j] * modifiers[j])
        data[2][i] -= 100
        for j in range(3):
            actions[j].append(action[0][j])
        offset = abs(data[0][i] - data[1][i])
        # if offset < 10:
        #    alt_reward.append(1-offset/10)
        # else:
        alt_reward.append((offset / 2) / 1000)

        mixError = abs(data[6][i] - 5.5)
        mix_reward.append((mixError / 0.2) / 1000)
        if mixError > 0.3:
            mix_reward[i] -= 1

        tempError = abs(data[5][i] - 900)
        temp_reward.append((tempError / 30) / 1000)
        if tempError > 50:
            temp_reward[i] -= 1

        total_reward.append(alt_reward[i] + mix_reward[i] + temp_reward[i])

        if start is True:
            alt_cumu.append(alt_reward[i])
            mix_cumu.append(mix_reward[i])
            temp_cumu.append(temp_reward[i])
            total_cumu.append(total_reward[i])
            start = False
        else:
            alt_cumu.append(alt_reward[i] + alt_cumu[i - 1])
            mix_cumu.append(mix_reward[i] + mix_cumu[i - 1])
            temp_cumu.append(temp_reward[i] + temp_cumu[i - 1])
            total_cumu.append(total_reward[i] + total_cumu[i - 1])

    plt.figure(figsize=(11, 8))
    plt.subplot(4, 2, 1)
    plt.xlabel('Time(s)')
    plt.ylabel('Offset (m)')
    plt.plot(time, data[0], label='Z Position')
    plt.plot(time, data[1], label='Z Speed')

    plt.subplot(4, 2, 2)
    plt.xlabel('Time(s)')
    plt.ylabel('Actions')

    plt.plot(time, actions[0], 'b', label='LOX Command')
    plt.plot(time, actions[1], 'r', label='LH2 Command')
    plt.plot(time, actions[2], 'y', label='Mix Command')
    plt.legend(loc='best')

    plt.subplot(4, 2, 3)
    plt.xlabel('Time(s)')
    plt.ylabel('Engine State')
    plt.plot(time, data[5], label='Temp')
    plt.legend(loc='best')

    plt.subplot(4, 2, 5)
    plt.xlabel('Time(s)')
    plt.ylabel('Engine State')
    plt.plot(time, data[4], label='Pressure')
    plt.legend(loc='best')

    plt.subplot(4, 2, 4)
    plt.xlabel('Time(s)')
    plt.ylabel('Mixture')
    plt.plot(time, data[6], label='Mixture')
    plt.legend(loc='best')

    plt.subplot(4, 2, 6)
    plt.xlabel('Time(s)')
    plt.ylabel('Reward values. Valve Error REAL valves')
    plt.plot(time, alt_reward, label='Altitude Error')
    plt.plot(time, mix_reward, label='Mixture Error')
    plt.plot(time, temp_reward, label='Temperature Error')
    plt.plot(time, total_reward, label='Total Reward')

    plt.subplot(4, 2, 8)
    plt.xlabel('Time(s)')
    plt.ylabel('Reward values cumulative')
    plt.plot(time, alt_cumu, label='Altitude Error')
    plt.plot(time, mix_cumu, label='Mixture Error')
    plt.plot(time, temp_cumu, label='Temperature Error')
    plt.plot(time, total_cumu, label='Total Reward')

    plt.legend(loc='best')

    print(duration)
    plt.show()
Example 16
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    """
    run the training of DDPG

    :param env_id: (str) the environment ID
    :param seed: (int) the initial random seed
    :param noise_type: (str) the desired noise types ('adaptive-param', 'normal' or 'ou'); multiple types can be
        combined by separating them with commas
    :param layer_norm: (bool) use layer normalization
    :param evaluation: (bool) enable evaluation of DDPG training
    :param kwargs: (dict) extra keywords for the training.train function
    """

    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)
    env = bench.Monitor(
        env,
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env,
                                 os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                             sigma=float(stddev) *
                                             np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mean=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed,
                                                     logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    start_time = 0
    if rank == 0:
        start_time = time.time()
    model = DDPG(policy=MlpPolicy,
                 env=env,
                 memory_policy=Memory,
                 eval_env=eval_env,
                 param_noise=param_noise,
                 action_noise=action_noise,
                 memory_limit=int(1e6),
                 layer_norm=layer_norm,
                 verbose=2,
                 **kwargs)
    model.learn(total_timesteps=10000)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
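
A hypothetical invocation of run (environment id, seed, and noise specification are illustrative, not taken from the original script):

run(env_id='Pendulum-v0', seed=0, noise_type='adaptive-param_0.2,ou_0.3',
    layer_norm=True, evaluation=False)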
Example 17
    def __init__(self,
                 sim_env_name='Hopper-v2',
                 real_env_name='HopperModified-v2',
                 frames=NUM_FRAMES_INPUT,
                 num_cores=NUM_CORES,
                 num_rl_threads=NUM_RL_THREADS,
                 load_policy=None,
                 algo=None):
        self.env_name = sim_env_name
        self.real_env_name = real_env_name
        self.frames = frames
        self.num_cores = num_cores
        self.fwd_norms_x = (0., 1.)
        self.fwd_norms_y = (0., 1.)
        self.inv_norms_x = (0., 1.)
        self.inv_norms_y = (0., 1.)
        self.num_rl_threads = num_rl_threads
        self.real_env = SubprocVecEnv([
            lambda: gym.make(self.real_env_name) for i in range(self.num_cores)
        ])
        print('MODIFIED ENV BODY_MASS : ',
              gym.make(self.real_env_name).model.body_mass)
        self.sim_env = SubprocVecEnv(
            [lambda: gym.make(self.env_name) for i in range(self.num_cores)])
        print('SIMULATED ENV BODY_MASS : ',
              gym.make(self.env_name).model.body_mass)

        # lists to reuse experience from previous grounding steps
        self.fwd_model_x_list = []
        self.fwd_model_y_list = []
        self.inv_model_x_list = []
        self.inv_model_y_list = []

        # initialize target policy
        if load_policy is None:
            print('LOADING -RANDOM- INITIAL POLICY')
            self.target_policy = PPO2(MlpPolicy,
                                      env=self.sim_env,
                                      verbose=1,
                                      tensorboard_log='data/TBlogs/' +
                                      self.env_name)
        else:
            print('LOADING -PRETRAINED- INITIAL POLICY')
            # self.target_policy = SAC.load(
            #     load_policy,
            #     env=SubprocVecEnv([lambda: gym.make(self.env_name)]),
            #     tensorboard_log='data/TBlogs/'+self.env_name,
            #     verbose=1,
            #     batch_size=256,
            #     buffer_size=1000000,
            # )
            # TODO: write easy way to switch algorithms
            # self.target_policy = PPO2.load(
            #         load_policy,
            #         env=SubprocVecEnv([lambda: gym.make(self.env_name)]),
            #         tensorboard_log='TBlogs/'+self.env_name,
            #         verbose=1,
            #         n_steps=256,
            #         # buffer_size=1000000,
            #     )

            n_actions = self.sim_env.action_space.shape[-1]
            action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                             sigma=0.1 * np.ones(n_actions))
            self.target_policy = TD3.load(
                load_policy,
                env=DummyVecEnv([lambda: gym.make(self.env_name)]),
                tensorboard_log='data/TBlogs/' + self.env_name,
                verbose=1,
                batch_size=128,
                gamma=0.99,
                learning_rate=0.001,
                action_noise=action_noise,
                buffer_size=1000000,
            )

        # define the Grounded Action Transformer models here
        self._init_gat_models()
        self.grounded_sim_env = None
def main(args):
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)

    if args.env == 'ant_dir':
        ant_dir_tasks = pickle.load(open(f"{args.task_path}/ant_dir_tasks", "rb"))
        env = AntDirEnv(tasks=ant_dir_tasks, include_goal=args.include_goal)
    elif args.env == 'ant_goal':
        env = AntGoalEnv(include_goal = args.include_goal)
    elif args.env == 'cheetah_dir':
        cheetah_dir_tasks = pickle.load(open(f"{args.task_path}/cheetah_dir_tasks", "rb"))
        env = HalfCheetahDirEnv(tasks = cheetah_dir_tasks, include_goal = args.include_goal)
    elif args.env == 'cheetah_vel':
        cheetah_vel_tasks = pickle.load(open(f"{args.task_path}/cheetah_vel_tasks", "rb"))
        env = HalfCheetahVelEnv(tasks = cheetah_vel_tasks, include_goal = args.include_goal)
    elif args.env == 'humanoid_dir':
        env = HumanoidDirEnv(include_goal = args.include_goal)
    elif args.env == 'walker_param':
        walker_tasks = pickle.load(open(f"{args.task_path}/walker_params_tasks", "rb"))
        env = WalkerRandParamsWrappedEnv(tasks = walker_tasks, include_goal = args.include_goal)
    elif args.env == 'ml45':
        from metaworld.benchmarks.base import Benchmark
        from metaworld.envs.mujoco.multitask_env import MultiClassMultiTaskEnv
        from metaworld.envs.mujoco.env_dict import HARD_MODE_ARGS_KWARGS, HARD_MODE_CLS_DICT

        args.type = 'train'
        if args.task is None:
            args.task = list(HARD_MODE_ARGS_KWARGS[args.type].keys())[args.task_idx]
        args_kwargs = HARD_MODE_ARGS_KWARGS[args.type][args.task]
        args_kwargs['kwargs']['obs_type'] = 'with_goal'
        args_kwargs['task'] = args.task
        env = HARD_MODE_CLS_DICT[args.type][args.task](*args_kwargs['args'], **args_kwargs['kwargs'])

    if args.env == 'ml45':
        env = TimeLimit(env, max_episode_steps = 150)
        pickle.dump(args_kwargs, open(args.log_dir + '/env_{}_{}_task{}.pkl'.format(args.env, args.type, args.task_idx), "wb" ))
    else:
        env.observation_space = gym.spaces.box.Box(env.observation_space.low, env.observation_space.high)
        env.action_space = gym.spaces.box.Box(env.action_space.low, env.action_space.high)
        env = TimeLimit(env, max_episode_steps = 200)
        pickle.dump(env.unwrapped.tasks, open(args.log_dir + '/env_{}_task{}.pkl'.format(args.env, args.task_idx), "wb" ))

    if args.alg == 'td3':
        from stable_baselines.td3.policies import MlpPolicy
        from stable_baselines.ddpg.noise import NormalActionNoise
        from src.td3 import TD3

        n_actions = env.action_space.shape[-1]
        action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

        model = TD3(MlpPolicy, env, action_noise=action_noise, verbose=1,
                    tensorboard_log = args.log_dir + '/tensorboard/log_{}_task_{}'.format(args.env, args.task_idx),
                    buffer_log = args.log_dir + '/buffers_{}_{}_'.format(args.env, args.task_idx),
                    full_size = args.full_buffer_size,
                    buffer_size = args.replay_buffer_size,
                    batch_size = args.batch_size,
                    policy_kwargs={'layers': [400, 300]},
                    learning_rate=args.outer_policy_lr
        )
        print('###################################')
        print('###################################')
        print('## Running *TD3* data collection ##')
        print('###################################')
        print('###################################')
        model.learn(total_timesteps=args.full_buffer_size, log_interval=10)
    else:
        from stable_baselines.sac.policies import MlpPolicy
        from stable_baselines.sac.policies import FeedForwardPolicy
        from src.sac2 import SAC

        env.set_task_idx(args.task_idx)
        model = SAC(MlpPolicy,
                    env,
                    log_dir=args.log_dir,
                    verbose=1,
                    tensorboard_log = args.log_dir + '/tensorboard/log_{}_task_{}'.format(args.env, args.task_idx),
                    buffer_log = args.log_dir + '/buffers_{}_{}_'.format(args.env, args.task_idx),
                    buffer_size = args.replay_buffer_size,
                    full_size = args.full_buffer_size,
                    batch_size = args.batch_size,
                    policy_kwargs={'layers': [300,300,300]},
                    learning_rate = 3e-4,
                    gamma = 0.99)
        print('###################################')
        print('###################################')
        print('## Running *SAC* data collection ##')
        print('###################################')
        print('###################################')
        model.learn(total_timesteps = args.full_buffer_size, log_interval = 1)

    model.save(args.log_dir + '/model_{}_{}'.format(args.env, args.task_idx))
Example 19
File: ddpg.py Project: s206283/gcrl
    def train(self, args, callback, env_kwargs=None, train_kwargs=None):
        env = self.makeEnv(args, env_kwargs=env_kwargs)

        if train_kwargs is None:
            train_kwargs = {}

        # Parse noise_type
        action_noise = None
        param_noise = None
        n_actions = env.action_space.shape[-1]
        if args.noise_param:
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=args.noise_param_sigma,
                                                 desired_action_stddev=args.noise_param_sigma)

        if train_kwargs.get("noise_action", args.noise_action) == 'normal':
            action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                             sigma=args.noise_action_sigma * np.ones(n_actions))
        elif train_kwargs.get("noise_action", args.noise_action) == 'ou':
            action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                        sigma=args.noise_action_sigma * np.ones(n_actions))

        # filter the hyperparam, and set default values in case no hyperparam
        train_kwargs = {k: v for k, v in train_kwargs.items() if k not in ["noise_action_sigma", "noise_action"]}

        # get the associated policy for the architecture requested
        if args.srl_model == "raw_pixels":
            args.policy = "cnn"
        else:
            args.policy = "mlp"

        self.policy = args.policy
        self.ob_space = env.observation_space
        self.ac_space = env.action_space

        policy_fn = {'cnn': CnnPolicy,
                     'mlp': MlpPolicy}[args.policy]

        param_kwargs = {
            "verbose": 1,
            "render_eval": False,
            "render": False,
            "reward_scale": 1.,
            "param_noise": param_noise,
            "normalize_returns": False,
            "normalize_observations": (args.srl_model == "raw_pixels"),
            "critic_l2_reg": 1e-2,
            "actor_lr": 1e-4,
            "critic_lr": 1e-3,
            "action_noise": action_noise,
            "enable_popart": False,
            "gamma": 0.99,
            "clip_norm": None,
            "nb_train_steps": 100,
            "nb_rollout_steps": 100,
            "nb_eval_steps": 50,
            "batch_size": args.batch_size
        }

        self.model = self.model_class(policy_fn, env, **{**param_kwargs, **train_kwargs})
        self.model.learn(total_timesteps=args.num_timesteps, seed=args.seed, callback=callback)
        env.close()
Example 20
def train(
        task,
        alg,
        logdir,
        domain_name,
        *,
        random_seed=None,
        num_steps=int(2e3),
        log_every=int(10e3),
        num_parallel=8,
        load_policy=False,
        load_policy_dir="",
        **kwargs
):
    """Train and evaluate an agent

    Args:
        task (str): Jitterbug task to train on
        alg (str): Algorithm to train, one of;
            - 'ddpg': DDPG Algorithm
            - 'ppo2': PPO2 Algorithm
            - 'sac': SAC Algorithm
        logdir (str): Logging directory
        domain_name (str): Name of the DMC domain

        random_seed (int): Random seed to use, or None
        num_steps (int): Number of training steps to train for
        log_every (int): Save and log progress every this many timesteps
        num_parallel (int): Number of parallel environments to run. Only used
            for A2C and PPO2.
        load_policy (bool): Whether to load an existing policy. If True, the
            policy is loaded from load_policy_dir.
        load_policy_dir (str): Directory to load the existing policy from.
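
    Example (hypothetical values; the task name is illustrative and must be
    registered for the chosen domain):
        train(task="move_from_origin", alg="sac", logdir="./logs/sac_jitterbug",
              domain_name="jitterbug", random_seed=42, num_steps=int(1e6))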
    """

    assert alg in ('ddpg', 'sac', 'ppo2', 'td3'), "Invalid alg: {}".format(alg)
    assert domain_name in ('jitterbug', 'augmented_jitterbug'), "Invalid domain_name: {}".format(domain_name)

    # Cast args to types
    if random_seed is not None:
        random_seed = int(random_seed)
    else:
        random_seed = int(time.time())

    # Fix random seed
    random.seed(random_seed)
    np.random.seed(random_seed)

    # Prepare the logging directory
    os.makedirs(logdir, exist_ok=True)

    print("Training {} on {} with seed {} for {} steps "
          "(log every {}), saving to {}".format(
        alg,
        task,
        random_seed,
        num_steps,
        log_every,
        logdir
    ))

    if domain_name == "augmented_jitterbug":
        augmented_jitterbug.augment_Jitterbug(modify_legs=True,
                                              modify_mass=True,
                                              modify_coreBody1=False,
                                              modify_coreBody2=False,
                                              modify_global_density=False,
                                              modify_gear=False,
                                              )
    # Construct DMC env
    env_dmc = suite.load(
        domain_name=domain_name,
        task_name=task,
        task_kwargs=dict(random=random_seed, norm_obs=True),
        environment_kwargs=dict(flat_observation=True)
    )

    # Wrap gym env in a dummy parallel vector
    if alg == 'ppo2':

        if num_parallel > multiprocessing.cpu_count():
            warnings.warn("Number of parallel workers "
                          "({}) > CPU count ({}), setting to # CPUs - 1".format(
                num_parallel,
                multiprocessing.cpu_count()
            ))
            num_parallel = max(
                1,
                multiprocessing.cpu_count() - 1
            )

        print("Using {} parallel environments".format(num_parallel))
        # XXX ajs 13/Sep/19 Hack to create multiple monitors that don't write to the same file
        env_vec = SubprocVecEnv([
            lambda: Monitor(
                gym.wrappers.FlattenDictWrapper(
                    jitterbug_dmc.JitterbugGymEnv(env_dmc),
                    dict_keys=["observations"]
                ),
                os.path.join(logdir, str(random.randint(0, 99999999))),
                allow_early_resets=True
            )
            for n in range(num_parallel)
        ])

    else:

        num_parallel = 1
        env_vec = DummyVecEnv([
            lambda: Monitor(
                gym.wrappers.FlattenDictWrapper(
                    jitterbug_dmc.JitterbugGymEnv(env_dmc),
                    dict_keys=["observations"]
                ),
                logdir,
                allow_early_resets=True
            )
        ])

    # Record start time
    start_time = datetime.datetime.now()

    def _cb(_locals, _globals):
        """Callback for during training"""

        if 'last_num_eps' not in _cb.__dict__:
            _cb.last_num_eps = 0

        # Extract episode reward history based on model type
        if isinstance(_locals['self'], DDPG):
            ep_r_hist = list(_locals['episode_rewards_history'])
        elif isinstance(_locals['self'], PPO2):
            ep_r_hist = [d['r'] for d in _locals['ep_info_buf']]
        elif isinstance(_locals['self'], SAC):
            ep_r_hist = [d['r'] for d in _locals['ep_info_buf']]
        elif isinstance(_locals['self'], TD3):
            ep_r_hist = [d['r'] for d in _locals['ep_info_buf']]
        else:
            raise ValueError("Invalid algorithm: {}".format(
                _locals['self']
            ))

        # Compute # elapsed steps based on # elapsed episodes
        ep_size = int(
            jitterbug_dmc.jitterbug.DEFAULT_TIME_LIMIT /
            jitterbug_dmc.jitterbug.DEFAULT_CONTROL_TIMESTEP
        )
        num_eps = len(ep_r_hist)
        elapsed_steps = ep_size * num_eps

        # Compute elapsed time in seconds
        elapsed_time = (datetime.datetime.now() - start_time).total_seconds()

        # Log some info
        if num_eps != _cb.last_num_eps:
            _cb.last_num_eps = num_eps

            print("{:.2f}s | {}ep | {}#: episode reward = "
                  "{:.2f}, last 5 episode reward = {:.2f}".format(
                elapsed_time,
                num_eps,
                elapsed_steps,
                ep_r_hist[-1],
                np.mean(ep_r_hist[-5:])
            ))

            # Save model checkpoint
            model_path = os.path.join(logdir, "model.pkl")
            _locals['self'].save(model_path)
            print("Saved checkpoint to {}".format(model_path))

        return True

    if alg == 'ddpg':

        # Default parameters for DDPG
        # kwargs.setdefault("normalize_returns", True)
        # kwargs.setdefault("return_range", (0., 1.))
        # kwargs.setdefault("normalize_observations", True)
        # kwargs.setdefault("observation_range", (-1., 1.))

        kwargs.setdefault("batch_size", 256)

        kwargs.setdefault("actor_lr", 1e-4)
        kwargs.setdefault("critic_lr", 1e-4)

        kwargs.setdefault("buffer_size", 1000000)

        kwargs.setdefault("action_noise", OrnsteinUhlenbeckActionNoise(
            mean=np.array([0.3]),
            sigma=0.3,
            theta=0.15
        ))
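        # The Ornstein-Uhlenbeck noise above produces temporally correlated
        # exploration noise: at each step it drifts back towards `mean` at rate
        # `theta` and adds Gaussian noise scaled by `sigma`, roughly
        # x_{t+1} = x_t + theta * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, 1).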

        print("Constructing DDPG agent with settings:")
        pprint.pprint(kwargs)

        # Construct the agent
        if load_policy:
            print("Load DDPG agent from ", load_policy_dir)
            agent = DDPG.load(load_path=os.path.join(load_policy_dir, "model.final.pkl"),
                              policy=CustomPolicyDDPG,
                              env=env_vec,
                              verbose=1,
                              tensorboard_log=logdir,
                              **kwargs
                              )
        else:
            agent = DDPG(
                policy=CustomPolicyDDPG,
                env=env_vec,
                verbose=1,
                tensorboard_log=logdir,
                **kwargs
            )

        # Train for a while (logging and saving checkpoints as we go)
        agent.learn(
            total_timesteps=num_steps,
            callback=_cb
        )

    elif alg == 'ppo2':

        kwargs.setdefault("learning_rate", 1e-4)
        kwargs.setdefault("n_steps", 256 // num_parallel)
        kwargs.setdefault("ent_coef", 0.01)
        kwargs.setdefault("cliprange", 0.1)

        print("Constructing PPO2 agent with settings:")
        pprint.pprint(kwargs)

        if load_policy:
            print("Load PPO2 agent from ", load_policy_dir)
            agent = PPO2.load(load_path=os.path.join(load_policy_dir, "model.final.pkl"),
                              policy=CustomPolicyGeneral,
                              env=env_vec,
                              verbose=1,
                              tensorboard_log=logdir,
                              **kwargs
                              )
        else:
            agent = PPO2(
                policy=CustomPolicyGeneral,
                env=env_vec,
                verbose=1,
                tensorboard_log=logdir,
                **kwargs
            )

        # Train for a while (logging and saving checkpoints as we go)
        agent.learn(
            total_timesteps=num_steps,
            callback=_cb,
            log_interval=10
        )

    elif alg == 'sac':

        # Default parameters for SAC
        kwargs.setdefault("learning_rate", 1e-4)
        kwargs.setdefault("buffer_size", 1000000)
        kwargs.setdefault("batch_size", 256)
        kwargs.setdefault("ent_coef", 'auto')
        # kwargs.setdefault("ent_coef", 'auto_0.1')

        kwargs.setdefault("action_noise", NormalActionNoise(
            mean=0,
            sigma=0.2,
        ))
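        # Note: with a scalar mean/sigma a single Gaussian sample is drawn each
        # step and broadcast across all action dimensions; per-dimension arrays
        # (e.g. mean=np.zeros(dim), sigma=0.2 * np.ones(dim)) would give
        # independent noise per dimension instead.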

        print("Constructing SAC agent with settings:")
        pprint.pprint(kwargs)

        # Construct the agent
        # XXX ajs 14/Sep/19 SAC in stable_baselines uses outdated policy
        # classes so we just use MlpPolicy and pass policy_kwargs

        if load_policy:
            print("Load SAC agent from ", load_policy_dir)
            kwargs.setdefault("policy_kwargs", dict(layers=[350, 250], act_fun=tf.nn.relu))
            agent = SAC.load(load_path=os.path.join(load_policy_dir, "model.final.pkl"),
                             env=env_vec,
                             verbose=1,
                             tensorboard_log=logdir,
                             **kwargs
                             )
        else:
            agent = SAC(
                policy='MlpPolicy',
                env=env_vec,
                verbose=1,
                tensorboard_log=logdir,
                policy_kwargs=dict(layers=[350, 250], act_fun=tf.nn.relu),
                **kwargs
            )

        # Train for a while (logging and saving checkpoints as we go)
        agent.learn(
            total_timesteps=num_steps,
            callback=_cb
        )

    elif alg == 'td3':

        # Default parameters for TD3
        kwargs.setdefault("learning_rate", 1e-4)
        kwargs.setdefault("buffer_size", 1000000)
        kwargs.setdefault("batch_size", 256)
        kwargs.setdefault("gradient_steps", 1000)
        kwargs.setdefault("learning_starts", 10000)
        kwargs.setdefault("train_freq", 1000)

        # kwargs.setdefault("ent_coef", 'auto_0.1')

        kwargs.setdefault("action_noise", NormalActionNoise(
            mean=0,
            sigma=0.2,
        ))

        print("Constructing TD3 agent with settings:")
        pprint.pprint(kwargs)

        # Construct the agent
        # XXX ajs 14/Sep/19 TD3 in stable_baselines uses outdated policy
        # classes so we just use MlpPolicy and pass policy_kwargs
        if load_policy:
            print("Load TD3 agent from ", load_policy_dir)
            kwargs.setdefault("policy_kwargs", dict(layers=[350, 250], act_fun=tf.nn.relu))
            agent = TD3.load(load_path=os.path.join(load_policy_dir, "model.final.pkl"),
                             env=env_vec,
                             verbose=1,
                             tensorboard_log=logdir,
                             **kwargs
                             )
        else:
            agent = TD3(
                policy='MlpPolicy',
                env=env_vec,
                verbose=1,
                tensorboard_log=logdir,
                policy_kwargs=dict(layers=[350, 250], act_fun=tf.nn.relu),
                **kwargs
            )

        # Train for a while (logging and saving checkpoints as we go)
        agent.learn(
            total_timesteps=num_steps,
            callback=_cb
        )

    else:
        raise ValueError("Invalid alg: {}".format(alg))

    # Save final model
    agent.save(os.path.join(logdir, 'model.final.pkl'))

    print("Done")
Esempio n. 21
from stable_baselines.td3 import TD3, LnCnnPolicy, LnMlpPolicy
#from stable_baselines.ddpg import DDPG, LnMlpPolicy
from env import *
import tensorflow as tf
from config import config
from stable_baselines.ddpg.noise import NormalActionNoise
from stable_baselines.common.vec_env import DummyVecEnv

policy = LnMlpPolicy
action_noise = NormalActionNoise(mean=np.zeros(config['ACTION_DIM']),
                                 sigma=0.1 * np.ones(config['ACTION_DIM']))
#env = SketchDesigner(SketchDiscriminator(config['SAVED_GAN']))
env = SketchDesigner(SketchClassifier(config['SAVED_CNN']))

#env = DummyVecEnv([lambda: env])

agent = TD3(
    policy,
    env,
    random_exploration=0.2,
    #action_noise=action_noise,
    #tensorboard_log='./log/',
    verbose=1)
#agent.get_env().env_method('get_policy', agent.policy_tf)
agent.get_env().get_policy(agent.policy_tf)

for _ in range(400):
    agent.learn(1000, reset_num_timesteps=False)
    agent.save('./save/4/model')
Esempio n. 22
def train_SAC(env, eval_env, out_dir, seed=None, **kwargs):

    # Delete keys so the dict can be pass to the model constructor
    policy = kwargs['policy']
    n_timesteps = kwargs['n_timesteps']
    noise_type = None
    if 'noise_type' in kwargs:
        noise_type = kwargs['noise_type']
        del kwargs['noise_type']
    del kwargs['policy']
    del kwargs['n_timesteps']

    save_frequency = 10000
    eval_frequency = 50000
    eval_episodes = 1000
    if 'save_freq' in kwargs:
        save_frequency = kwargs['save_freq']
        del kwargs['save_freq']

    if 'eval_freq' in kwargs:
        eval_frequency = kwargs['eval_freq']
        del kwargs['eval_freq']

    if 'eval_episides' in kwargs:
        eval_episodes = kwargs['eval_episides']
        del kwargs['eval_episides']

    # the noise objects - usually not necessary for SAC but can help for hard exploration tasks
    nb_actions = env.action_space.shape[-1]
    action_noise = None
    if noise_type:
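        # Expected format (inferred from the parsing below): a comma-separated
        # string such as "normal_0.1" or "ou_0.2", where the suffix after the
        # underscore is the noise standard deviation.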

        for current_noise_type in noise_type.split(','):

            current_noise_type = current_noise_type.strip()

            if 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                                 sigma=float(stddev) *
                                                 np.ones(nb_actions))

            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mean=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))

            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

    # Create learning rate schedule
    for key in ['learning_rate', 'learning_rate_pi', 'cliprange']:
        if key in kwargs:
            if isinstance(kwargs[key], str):
                schedule, initial_value = kwargs[key].split('_')
                initial_value = float(initial_value)
                kwargs[key] = linear_schedule(initial_value)
            elif isinstance(kwargs[key], float):
                kwargs[key] = constfn(kwargs[key])
            else:
                raise ValueError('Invalid value for {}: {}'.format(
                    key, kwargs[key]))

    if 'continue' in kwargs and kwargs['continue'] is True:
        print("Loading pretrained agent")
        list_of_models = glob.glob(os.path.join(out_dir, '*.zip'))
        last_saved_model = max(list_of_models, key=os.path.getctime)
        model = SAC_residual.load(last_saved_model,
                                  env=env,
                                  tensorboard_log=os.path.join(out_dir, 'tb'),
                                  verbose=1,
                                  **kwargs)
        reset_num_timesteps = False
        if 'num_timesteps' in kwargs:
            model.num_timesteps = kwargs['num_timesteps']
            del kwargs['num_timesteps']
    else:
        if 'continue' in kwargs:
            del kwargs['continue']
        # create model
        model = SAC(policy,
                    env,
                    action_noise=action_noise,
                    seed=seed,
                    verbose=1,
                    tensorboard_log=os.path.join(out_dir, 'tb'),
                    full_tensorboard_log=False,
                    **kwargs)
        reset_num_timesteps = True

    # start training
    train_callback = get_train_callback(eval_env,
                                        seed,
                                        out_dir,
                                        save_f=save_frequency,
                                        eval_f=eval_frequency,
                                        eval_ep=eval_episodes)
    model.learn(total_timesteps=n_timesteps,
                callback=train_callback,
                log_interval=10,
                reset_num_timesteps=reset_num_timesteps)

    return model
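
# Hedged sketch of the schedule helpers referenced above (linear_schedule and
# constfn are not defined in this snippet); this follows the common
# stable-baselines zoo convention of mapping the remaining training progress
# (1.0 at the start, 0.0 at the end) to a value.
def linear_schedule(initial_value):
    """Return a schedule that decays linearly from initial_value to 0."""
    def schedule(progress_remaining):
        # progress_remaining goes from 1.0 (start of training) to 0.0 (end)
        return progress_remaining * initial_value
    return schedule


def constfn(value):
    """Return a schedule that always yields the same constant value."""
    def schedule(_):
        return value
    return schedule
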
def main(args):
    log_dir = args.log_path if (
        args.log_path is not None
    ) else "/tmp/stable_baselines_" + time.strftime('%Y-%m-%d-%H-%M-%S')
    if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
        rank = 0
        configure_logger(log_dir)
    else:
        rank = MPI.COMM_WORLD.Get_rank()
        configure_logger(log_dir, format_strs=[])

    set_global_seeds(args.seed)

    model_class = SAC_parallel

    n_workers = args.num_workers if not args.play else 1
    env_kwargs = get_env_kwargs(args.env,
                                random_ratio=args.random_ratio,
                                sequential=args.sequential,
                                reward_type=args.reward_type,
                                n_object=args.n_object)

    def make_thunk(rank):
        return lambda: make_env(
            env_id=args.env, rank=rank, log_dir=log_dir, kwargs=env_kwargs)

    env = ParallelSubprocVecEnv([make_thunk(i) for i in range(n_workers)],
                                reset_when_done=True)

    if os.path.exists(os.path.join(logger.get_dir(), 'eval.csv')):
        os.remove(os.path.join(logger.get_dir(), 'eval.csv'))
        print('Remove existing eval.csv')
    eval_env_kwargs = env_kwargs.copy()
    eval_env_kwargs['random_ratio'] = 0.0
    eval_env = make_env(env_id=args.env, rank=0, kwargs=eval_env_kwargs)
    eval_env = FlattenDictWrapper(
        eval_env, ['observation', 'achieved_goal', 'desired_goal'])

    if not args.play:
        os.makedirs(log_dir, exist_ok=True)

    # Available strategies (cf paper): future, final, episode, random
    goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

    if not args.play:
        from stable_baselines.ddpg.noise import NormalActionNoise
        noise_type = args.action_noise.split('_')[0]
        if noise_type == 'none':
            parsed_action_noise = None
        elif noise_type == 'normal':
            sigma = float(args.action_noise.split('_')[1])
            parsed_action_noise = NormalActionNoise(
                mean=np.zeros(env.action_space.shape),
                sigma=sigma * np.ones(env.action_space.shape))
        else:
            raise NotImplementedError
        train_kwargs = get_train_kwargs("sac", args, parsed_action_noise,
                                        eval_env)

        def callback(_locals, _globals):
            if _locals['step'] % int(1e3) == 0:
                if 'FetchStack' in args.env:
                    mean_eval_reward = stack_eval_model(
                        eval_env,
                        _locals["self"],
                        init_on_table=(args.env == 'FetchStack-v2'))
                elif 'MasspointPushDoubleObstacle-v2' in args.env:
                    mean_eval_reward = egonav_eval_model(
                        eval_env,
                        _locals["self"],
                        env_kwargs["random_ratio"],
                        fixed_goal=np.array([4., 4., 0.15, 0., 0., 0., 1.]))
                    mean_eval_reward2 = egonav_eval_model(
                        eval_env,
                        _locals["self"],
                        env_kwargs["random_ratio"],
                        goal_idx=0,
                        fixed_goal=np.array([4., 4., 0.15, 1., 0., 0., 0.]))
                    log_eval(_locals['self'].num_timesteps,
                             mean_eval_reward2,
                             file_name="eval_box.csv")
                else:
                    mean_eval_reward = eval_model(eval_env, _locals["self"])
                log_eval(_locals['self'].num_timesteps, mean_eval_reward)
            if _locals['step'] % int(2e4) == 0:
                model_path = os.path.join(
                    log_dir, 'model_' + str(_locals['step'] // int(2e4)))
                model.save(model_path)
                print('model saved to', model_path)
            return True

        class CustomSACPolicy(SACPolicy):
            def __init__(self, *model_args, **model_kwargs):
                super(CustomSACPolicy, self).__init__(
                    *model_args,
                    **model_kwargs,
                    layers=[256, 256] if 'MasspointPushDoubleObstacle'
                    in args.env else [256, 256, 256, 256],
                    feature_extraction="mlp")

        register_policy('CustomSACPolicy', CustomSACPolicy)
        from utils.sac_attention_policy import AttentionPolicy
        register_policy('AttentionPolicy', AttentionPolicy)
        policy_kwargs = get_policy_kwargs("sac", args)

        if rank == 0:
            print('train_kwargs', train_kwargs)
            print('policy_kwargs', policy_kwargs)
        # Wrap the model
        model = HER2(args.policy,
                     env,
                     model_class,
                     n_sampled_goal=4,
                     goal_selection_strategy=goal_selection_strategy,
                     num_workers=args.num_workers,
                     policy_kwargs=policy_kwargs,
                     verbose=1,
                     **train_kwargs)
        print(model.get_parameter_list())

        # Train the model
        model.learn(
            int(args.num_timesteps),
            seed=args.seed,
            callback=callback,
            log_interval=100 if not ('MasspointMaze-v3' in args.env) else 10)

        if rank == 0:
            model.save(os.path.join(log_dir, 'final'))

    # WARNING: you must pass an env
    # or wrap your environment with HERGoalEnvWrapper to use the predict method
    if args.play and rank == 0:
        assert args.load_path is not None
        model = HER2.load(args.load_path, env=env)

        fig, ax = plt.subplots(1, 1, figsize=(8, 8))
        obs = env.reset()
        if 'FetchStack' in args.env:
            env.env_method('set_task_array',
                           [[(env.get_attr('n_object')[0], 0)]])
            obs = env.reset()
            while env.get_attr('current_nobject')[0] != env.get_attr(
                    'n_object')[0] or env.get_attr('task_mode')[0] != 1:
                obs = env.reset()
        elif 'FetchPushWallObstacle' in args.env:
            while not (obs['observation'][0][4] > 0.7
                       and obs['observation'][0][4] < 0.8):
                obs = env.reset()
            env.env_method('set_goal', [np.array([1.18, 0.8, 0.425, 1, 0])])
            obs = env.env_method('get_obs')
            obs = {
                'observation': obs[0]['observation'][None],
                'achieved_goal': obs[0]['achieved_goal'][None],
                'desired_goal': obs[0]['desired_goal'][None]
            }
            # obs[0] = np.concatenate([obs[0][key] for key in ['observation', 'achieved_goal', 'desired_goal']])
        elif 'MasspointPushDoubleObstacle' in args.env or 'FetchPushWallObstacle' in args.env:
            while np.argmax(obs['desired_goal'][0][3:]) != 0:
                obs = env.reset()
        elif 'MasspointMaze-v2' in args.env:
            while obs['observation'][0][0] < 3 or obs['observation'][0][1] < 3:
                obs = env.reset()
            env.env_method('set_goal', [np.array([1., 1., 0.15])])
            obs = env.env_method('get_obs')
            obs = {
                'observation': obs[0]['observation'][None],
                'achieved_goal': obs[0]['achieved_goal'][None],
                'desired_goal': obs[0]['desired_goal'][None]
            }

        print('goal', obs['desired_goal'][0], 'obs', obs['observation'][0])
        episode_reward = 0.0
        images = []
        frame_idx = 0
        num_episode = 0
        for i in range(env_kwargs['max_episode_steps'] * 10):
            img = env.render(mode='rgb_array')
            ax.cla()
            ax.imshow(img)
            tasks = ['pick and place', 'stack']
            ax.set_title('episode ' + str(num_episode) + ', frame ' +
                         str(frame_idx) + ', task: ' +
                         tasks[np.argmax(obs['observation'][0][-2:])])
            images.append(img)
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, done, _ = env.step(action)
            episode_reward += reward
            frame_idx += 1
            if args.export_gif:
                plt.imsave(
                    os.path.join(os.path.dirname(args.load_path),
                                 'tempimg%d.png' % i), img)
            else:
                plt.pause(0.02)
            if done:
                print('episode_reward', episode_reward)
                obs = env.reset()
                if 'FetchStack' in args.env:
                    while env.get_attr('current_nobject')[0] != env.get_attr('n_object')[0] or \
                                    env.get_attr('task_mode')[0] != 1:
                        obs = env.reset()
                elif 'MasspointPushDoubleObstacle' in args.env or 'FetchPushWallObstacle' in args.env:
                    while np.argmax(obs['desired_goal'][0][3:]) != 0:
                        obs = env.reset()
                print('goal', obs['desired_goal'][0])
                episode_reward = 0.0
                frame_idx = 0
                num_episode += 1
                if num_episode >= 1:
                    break
        exit()
        if args.export_gif:
            os.system('ffmpeg -r 5 -start_number 0 -i ' +
                      os.path.dirname(args.load_path) +
                      '/tempimg%d.png -c:v libx264 -pix_fmt yuv420p ' +
                      os.path.join(os.path.dirname(args.load_path), args.env +
                                   '.mp4'))
            for i in range(env_kwargs['max_episode_steps'] * 10):
                # images.append(plt.imread('tempimg' + str(i) + '.png'))
                try:
                    os.remove(
                        os.path.join(os.path.dirname(args.load_path),
                                     'tempimg' + str(i) + '.png'))
                except:
                    pass
Esempio n. 24
def train_HER(env, out_dir, seed=None, **kwargs):
    # Logs will be saved in log_dir/monitor.csv
    global output_dir, log_dir
    output_dir = out_dir
    log_dir = os.path.join(out_dir, 'log')
    os.makedirs(log_dir, exist_ok=True)
    env = Monitor(env, log_dir + '/', allow_early_resets=True)

    policy = kwargs['policy']
    algo_name = kwargs['algo_name']
    n_timesteps = kwargs['n_timesteps']
    noise_type = None
    if 'noise_type' in kwargs:
        noise_type = kwargs['noise_type']
        del kwargs['noise_type']

    # HER Available strategies (cf paper): future, final, episode, random
    goal_selection_strategy = kwargs['goal_selection_strategy']
    n_sampled_goal = kwargs['n_sampled_goal']

    del kwargs['policy']
    del kwargs['algo_name']
    del kwargs['n_timesteps']
    del kwargs['goal_selection_strategy']
    del kwargs['n_sampled_goal']

    # Set agent algorithm
    agent = set_agent(algo_name)
    if not agent:
        print("invalid algorithm for HER")
        return

    # the noise objects
    nb_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = None

    if noise_type:

        for current_noise_type in noise_type.split(','):

            current_noise_type = current_noise_type.strip()

            if 'adaptive-param' in current_noise_type and algo_name == 'ddpg':
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))

            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                                 sigma=float(stddev) *
                                                 np.ones(nb_actions))

            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mean=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))

            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

    # Create learning rate schedule
    for key in ['learning_rate', 'learning_rate_pi', 'cliprange']:
        if key in kwargs:
            if isinstance(kwargs[key], str):
                schedule, initial_value = kwargs[key].split('_')
                initial_value = float(initial_value)
                kwargs[key] = linear_schedule(initial_value)
            elif isinstance(kwargs[key], float):
                kwargs[key] = constfn(kwargs[key])
            else:
                raise ValueError('Invalid value for {}: {}'.format(
                    key, kwargs[key]))

    kwargs['tensorboard_log'] = os.path.join(log_dir, 'tb')
    kwargs['full_tensorboard_log'] = False
    kwargs['seed'] = seed
    kwargs['action_noise'] = action_noise
    if algo_name == 'ddpg':
        kwargs['param_noise'] = param_noise

    if 'continue' in kwargs and kwargs['continue'] is True:
        # Continue training
        print("Loading pretrained agent")
        # Policy should not be changed
        for key in ['policy', 'policy_kwargs']:
            if key in kwargs:
                del kwargs[key]

        model = HER.load(os.path.join(out_dir, 'final_model.pkl'),
                         env=env,
                         verbose=1,
                         **kwargs)
    else:
        if 'continue' in kwargs:
            del kwargs['continue']
        model = HER(policy,
                    env,
                    agent,
                    goal_selection_strategy=goal_selection_strategy,
                    n_sampled_goal=n_sampled_goal,
                    verbose=1,
                    **kwargs)

    model.learn(total_timesteps=n_timesteps, callback=log_callback)

    return model
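
# A minimal usage sketch for train_HER above (hedged: the environment, output
# directory and hyperparameter values are illustrative; only the kwargs keys
# consumed by the function are taken from the code itself):
#
#   her_kwargs = dict(
#       policy='MlpPolicy',
#       algo_name='sac',                  # resolved via set_agent()
#       n_timesteps=int(1e6),
#       noise_type='normal_0.1',          # optional
#       goal_selection_strategy='future',
#       n_sampled_goal=4,
#   )
#   model = train_HER(goal_env, 'out/her_run', seed=0, **her_kwargs)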
Esempio n. 25
# parameters(for training)
tau = 0.1  # update rate for target model
gamma = 0.95  # discount rate for q value.
# batch_size = NUMCONC*5+3    # size of batch
batch_size = 10
alr = 0.003  # actor learning rate
clr = 0.003  # critic learning rate

# noise (for better exploration)
n_actions = env.action_space.shape[-1]
param_noise = AdaptiveParamNoiseSpec()
# action_noise = None
# param_noise = None
action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                 sigma=float(0.5) *
                                 np.ones(n_actions))  # A gaussian action noise

# model (DDPG)
#        Deep Deterministic Policy Gradient.
#        DDPG combines ideas from Nature DQN, Actor-Critic and DPG; it is designed to tackle continuous action space problems.
#        Policy learning:
#        The policy function (actor) takes the state as input and is updated by following the policy gradient.
#        Q-learning:
#        The value function (critic) takes state and action as input and is adjusted to minimize the loss.
#        Q-learning with a function approximator is largely based on minimizing this MSBE loss, with two main tricks: a replay buffer and target networks.
#        The replay buffer stores past experience, which is possible because DDPG is an off-policy algorithm.
#        A target Q-network is used when forming the MSBE target, which stabilizes learning.
#        A target policy network computes an action which approximately maximizes Q_{\phi_{\text{targ}}}.
#        An Ornstein-Uhlenbeck process is applied to add exploration noise during training so that DDPG policies explore better.
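
# A minimal sketch of the two target-network tricks described above (hedged:
# illustrative pseudo-implementation of the update rules, not the
# stable-baselines internals; the argument names are placeholders).
def soft_update(target_params, online_params, tau):
    # Polyak averaging: target <- (1 - tau) * target + tau * online
    return [(1.0 - tau) * t + tau * o for t, o in zip(target_params, online_params)]


def bellman_target(reward, next_state, done, gamma, actor_target, q_target):
    # Backup used in the MSBE loss: y = r + gamma * (1 - done) * Q'(s', pi'(s'))
    return reward + gamma * (1.0 - done) * q_target(next_state, actor_target(next_state))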
Esempio n. 26
def main(args):
    log_dir = args.log_path if (
        args.log_path is not None
    ) else "/tmp/stable_baselines_" + time.strftime('%Y-%m-%d-%H-%M-%S')
    if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
        rank = 0
        configure_logger(log_dir)
    else:
        rank = MPI.COMM_WORLD.Get_rank()
        configure_logger(log_dir, format_strs=[])

    set_global_seeds(args.seed)

    model_class = SAC_SIR  # works also with SAC, DDPG and TD3

    env_kwargs = get_env_kwargs(args.env,
                                random_ratio=args.random_ratio,
                                sequential=args.sequential,
                                reward_type=args.reward_type,
                                n_object=args.n_object)

    def make_thunk(rank):
        return lambda: make_env(
            env_id=args.env, rank=rank, log_dir=log_dir, kwargs=env_kwargs)

    env = ParallelSubprocVecEnv(
        [make_thunk(i) for i in range(args.num_workers)], reset_when_done=True)

    def make_thunk_aug(rank):
        return lambda: FlattenDictWrapper(
            make_env(env_id=aug_env_name, rank=rank, kwargs=aug_env_kwargs),
            ['observation', 'achieved_goal', 'desired_goal'])

    aug_env_kwargs = env_kwargs.copy()
    del aug_env_kwargs['max_episode_steps']
    aug_env_name = args.env.split('-')[0] + 'Unlimit-' + args.env.split('-')[1]
    aug_env = ParallelSubprocVecEnv(
        [make_thunk_aug(i) for i in range(args.num_workers)],
        reset_when_done=False)

    if os.path.exists(os.path.join(logger.get_dir(), 'eval.csv')):
        os.remove(os.path.join(logger.get_dir(), 'eval.csv'))
        print('Remove existing eval.csv')
    eval_env_kwargs = env_kwargs.copy()
    eval_env_kwargs['random_ratio'] = 0.0
    eval_env = make_env(env_id=args.env, rank=0, kwargs=eval_env_kwargs)
    eval_env = FlattenDictWrapper(
        eval_env, ['observation', 'achieved_goal', 'desired_goal'])

    if not args.play:
        os.makedirs(log_dir, exist_ok=True)

    # Available strategies (cf paper): future, final, episode, random
    goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

    if not args.play:
        from stable_baselines.ddpg.noise import NormalActionNoise
        noise_type = args.action_noise.split('_')[0]
        if noise_type == 'none':
            parsed_action_noise = None
        elif noise_type == 'normal':
            sigma = float(args.action_noise.split('_')[1])
            parsed_action_noise = NormalActionNoise(
                mean=np.zeros(env.action_space.shape),
                sigma=sigma * np.ones(env.action_space.shape))
        else:
            raise NotImplementedError

        train_kwargs = get_train_kwargs("sac_sir", args, parsed_action_noise,
                                        eval_env, aug_env)

        def callback(_locals, _globals):
            if _locals['step'] % int(1e3) == 0:
                if 'FetchStack' in args.env:
                    mean_eval_reward = stack_eval_model(
                        eval_env,
                        _locals["self"],
                        init_on_table=(args.env == 'FetchStack-v2'))
                elif 'MasspointPushDoubleObstacle-v2' in args.env:
                    mean_eval_reward = egonav_eval_model(
                        eval_env,
                        _locals["self"],
                        env_kwargs["random_ratio"],
                        fixed_goal=np.array([4., 4., 0.15, 0., 0., 0., 1.]))
                    mean_eval_reward2 = egonav_eval_model(
                        eval_env,
                        _locals["self"],
                        env_kwargs["random_ratio"],
                        goal_idx=0,
                        fixed_goal=np.array([4., 4., 0.15, 1., 0., 0., 0.]))
                    log_eval(_locals['self'].num_timesteps,
                             mean_eval_reward2,
                             file_name="eval_box.csv")
                else:
                    mean_eval_reward = eval_model(eval_env, _locals["self"])
                log_eval(_locals['self'].num_timesteps, mean_eval_reward)
            if _locals['step'] % int(2e4) == 0:
                model_path = os.path.join(
                    log_dir, 'model_' + str(_locals['step'] // int(2e4)))
                model.save(model_path)
                print('model saved to', model_path)
            return True

        class CustomSACPolicy(SACPolicy):
            def __init__(self, *model_args, **model_kwargs):
                super(CustomSACPolicy, self).__init__(
                    *model_args,
                    **model_kwargs,
                    layers=[256, 256] if 'MasspointPushDoubleObstacle'
                    in args.env else [256, 256, 256, 256],
                    feature_extraction="mlp")

        register_policy('CustomSACPolicy', CustomSACPolicy)
        from utils.sac_attention_policy import AttentionPolicy
        register_policy('AttentionPolicy', AttentionPolicy)
        policy_kwargs = get_policy_kwargs("sac_sir", args)

        if rank == 0:
            print('train_kwargs', train_kwargs)
            print('policy_kwargs', policy_kwargs)
        # Wrap the model
        model = HER2(args.policy,
                     env,
                     model_class,
                     n_sampled_goal=4,
                     start_augment_time=args.start_augment,
                     goal_selection_strategy=goal_selection_strategy,
                     num_workers=args.num_workers,
                     policy_kwargs=policy_kwargs,
                     verbose=1,
                     **train_kwargs)
        print(model.get_parameter_list())

        # Train the model
        model.learn(
            int(args.num_timesteps),
            seed=args.seed,
            callback=callback,
            log_interval=100 if not ('MasspointMaze-v3' in args.env) else 10)

        if rank == 0:
            model.save(os.path.join(log_dir, 'final'))