Ejemplo n.º 1
0
def run_test(config):
    """Train a DDPG agent on the environment described by ``config``.

    Mandatory configuration settings:
        - 'continuous' agent
        - camera_settings enabled
        - stable_baselines enabled

    The environment is closed when training finishes or fails. If no
    environment object is available, ``clear_carla`` is called with the
    configured host and port instead.
    """
    env = None
    try:
        # Build the environment and wrap it in a one-element DummyVecEnv.
        env = make_env(config)
        env = DummyVecEnv([lambda: env])

        # Ornstein-Uhlenbeck action noise, one component per action dimension.
        action_dim = env.action_space.shape[-1]
        ou_noise = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(action_dim),
            sigma=float(0.5) * np.ones(action_dim),
        )

        # Set up DDPG (without parameter noise) and start learning.
        agent = DDPG(
            CnnPolicy,
            env,
            verbose=1,
            param_noise=None,
            action_noise=ou_noise,
            random_exploration=0.8,
        )
        agent.learn(total_timesteps=10000)
    finally:
        if not env:
            clear_carla(config.host, config.port)
        else:
            env.close()
        print("-----Carla Environment is closed-----")
Ejemplo n.º 2
0
    def _init_environment(self, datapath, window_size):
        """Build the trading environment, the RL model and the agent wrapper.

        Reads the CSV at ``datapath``, derives one mid-price series per
        security, wraps ``securities_trading_env`` in a NaN-checked
        ``DummyVecEnv``, instantiates the model selected by ``self.policy``
        (optionally loading saved weights when ``self.load`` is set) and
        stores the resulting ``Agent`` in ``self.gym_model``.

        Parameters
        ----------
        datapath : path of the CSV file handed to ``pd.read_csv``.
        window_size : forwarded unchanged to ``Agent``.
        """
        df = pd.read_csv(datapath)
        n_columns = len(df.columns)
        # Presumably every security occupies 20 consecutive columns, with the
        # bid price at offset 1 and the ask price at offset 3 -- TODO confirm.
        bid_price_columns = list(range(1, n_columns, 20))
        print(bid_price_columns)
        ask_price_columns = list(range(3, n_columns, 20))
        bidPrices = df[df.columns[bid_price_columns]]
        # The ask prices were previously selected with ``bid_price_columns``,
        # which made the "mid" price equal to the bid price.
        askPrices = df[df.columns[ask_price_columns]].copy()
        # Give each ask column the label of the bid column at the same
        # position so the concat/groupby below averages bid and ask of the
        # same security. A trailing bid column without an ask counterpart is
        # kept (its ask values are NaN and are skipped by ``mean``).
        askPrices.columns = bidPrices.columns[:askPrices.shape[1]]
        askPrices = askPrices.reindex(columns=bidPrices.columns)
        df_concat = pd.concat([bidPrices, askPrices])
        # Rows sharing an index value are the bid and the ask of one timestep.
        midPrices = df_concat.groupby(df_concat.index).mean().transpose().values[-len(self.securities):]
        print(midPrices[:, 0])

        self.env = DummyVecEnv([lambda: securities_trading_env(np.array(midPrices).T)])
        self.env = VecCheckNan(self.env, raise_exception=True)

        n_actions = self.env.action_space.shape[-1]
        param_noise = None
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                    sigma=float(0.5) * np.ones(n_actions))
        print(n_actions)

        if self.policy == "DDPG":
            self.model = DDPG(ddpgMlpPolicy, self.env, verbose=int(self.verbose),
                              param_noise=param_noise, action_noise=action_noise)
        elif self.policy == "TD3":
            self.model = TD3(td3MlpPolicy, self.env, verbose=int(self.verbose))
        elif self.policy == "GAIL":
            # NOTE(review): the "GAIL" option builds a TD3 model, exactly like
            # the branch above -- confirm whether this is intentional.
            self.model = TD3(td3MlpPolicy, self.env, verbose=int(self.verbose))
        else:
            self.model = PPO2(MlpLnLstmPolicy, self.env, verbose=int(self.verbose))

        if self.load:  # load model
            # NOTE(review): ``modelpath`` is not defined in this method;
            # presumably a module-level name -- confirm.
            self.model = self.model.load("save/" + modelpath + ".h5")

        # init model class
        # NOTE(review): most arguments below (market_event_securities, queue,
        # host, ...) are not defined in this method either -- confirm scope.
        self.gym_model = Agent(market_event_securities, market_event_queue, securities, queue, host, policy, strategy, cash_balance, self.model, self.env, window_size, self.inventory)
def main():
    """Train DDPG on the 'Pickbot-v1' gym environment and save the model.

    Uses the name ``timestamp`` from the enclosing scope to build the
    file name of the saved model.
    """
    # The simulation has to be unpaused so the robot receives data on all topics.
    gazebo_connection.GazeboConnection().unpauseSim()
    # Register the ROS node; only FATAL messages are logged.
    rospy.init_node('pickbot_gym', anonymous=True, log_level=rospy.FATAL)

    env = gym.make('Pickbot-v1')

    # Ornstein-Uhlenbeck action noise for DDPG (no parameter noise).
    action_dim = env.action_space.shape[-1]
    ou_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(action_dim),
        sigma=float(0.5) * np.ones(action_dim))

    ddpg_kwargs = dict(verbose=1, param_noise=None, action_noise=ou_noise)
    agent = DDPG(MlpPolicy, env, **ddpg_kwargs)
    agent.learn(total_timesteps=200000)

    save_name = "pickbot_model_ddpg_continuous_" + timestamp
    print("Saving model to " + save_name + ".pkl")
    agent.save(save_name)
Ejemplo n.º 4
0
def test_identity_ddpg():
    """
    Check that DDPG with adaptive parameter noise can learn the identity
    mapping, i.e. return the observation it receives as its action.
    """
    noise_std = 0.2
    n_trials = 1000

    env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])
    adaptive_noise = AdaptiveParamNoiseSpec(
        initial_stddev=float(noise_std),
        desired_action_stddev=float(noise_std))

    model = DDPG("MlpPolicy", env, gamma=0.0, param_noise=adaptive_noise,
                 memory_limit=int(1e6))
    model.learn(total_timesteps=20000, seed=0)

    # Evaluate the trained model for a fixed number of steps.
    set_global_seeds(0)
    obs = env.reset()
    total_reward = 0
    for _trial in range(n_trials):
        action, _state = model.predict(obs)
        obs, reward, _done, _info = env.step(action)
        total_reward += reward
    assert total_reward > 0.9 * n_trials
    # Free memory
    del model, env
Ejemplo n.º 5
0
def main(output_folder_path:Path):
    """Train (or resume training of) a DDPG agent on the 'roar-pid-v0' env.

    The latest model found by ``find_latest_model(output_folder_path)`` is
    resumed if there is one; otherwise a new ``LnMlpPolicy`` model is created.
    Tensorboard logs and checkpoints go under ``output_folder_path``; the
    final model is saved in the current working directory.

    :param output_folder_path: folder holding the "tensorboard" and
        "checkpoints" sub-folders.
    """
    # Set gym-carla environment
    agent_config = AgentConfig.parse_file(Path("configurations/agent_configuration.json"))
    carla_config = CarlaConfig.parse_file(Path("configurations/carla_configuration.json"))

    params = {
        "agent_config": agent_config,
        "carla_config": carla_config,
        "ego_agent_class": RLPIDAgent,
        "max_collision": 5
    }

    env = gym.make('roar-pid-v0', params=params)
    env.reset()

    model_params: dict = {
        "verbose": 1,
        "render": True,
        "tensorboard_log": (output_folder_path / "tensorboard").as_posix()
    }
    latest_model_path = find_latest_model(output_folder_path)
    if latest_model_path is None:
        model = DDPG(LnMlpPolicy, env=env, **model_params)  # full tensorboard log can take up space quickly
    else:
        model = DDPG.load(latest_model_path, env=env, **model_params)
        model.render = True
        model.tensorboard_log = (output_folder_path / "tensorboard").as_posix()

    logging_callback = LoggingCallback(model=model)
    checkpoint_callback = CheckpointCallback(save_freq=1000, verbose=2, save_path=(output_folder_path / "checkpoints").as_posix())
    event_callback = EveryNTimesteps(n_steps=100, callback=checkpoint_callback)
    # NOTE(review): checkpoint_callback is registered both directly and via
    # event_callback -- confirm that the double registration is intended.
    callbacks = CallbackList([checkpoint_callback, event_callback, logging_callback])
    model = model.learn(total_timesteps=int(1e10), callback=callbacks, reset_num_timesteps=False)
    # str(datetime.now()) contains spaces, colons and a '.', which makes an
    # awkward file name (colons are not allowed in Windows file names), so a
    # filesystem-safe timestamp is used instead.
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    model.save(f"pid_ddpg_{timestamp}")
Ejemplo n.º 6
0
def ddpg(env_id,
         timesteps,
         policy="MlpPolicy",
         log_interval=None,
         tensorboard_log=None,
         seed=None,
         load_weights=None):
    """Train a DDPG model on ``env_id`` and store its weights.

    :param env_id: id passed to ``gym.make``.
    :param timesteps: total number of training timesteps.
    :param policy: policy used when a new model is created.
    :param log_interval: forwarded to ``model.learn``.
    :param tensorboard_log: tensorboard log location for a new model.
    :param seed: forwarded to ``save_model_weights``.
    :param load_weights: if not None, the model is loaded from this location
        instead of being created from scratch.
    """
    from stable_baselines import DDPG

    env = gym.make(env_id)

    # Ornstein-Uhlenbeck action noise, only used by a freshly created model.
    action_dim = env.action_space.shape[-1]
    ou_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(action_dim),
        sigma=float(0.5) * np.ones(action_dim))

    if load_weights is None:
        model = DDPG(policy, env, verbose=1, param_noise=None,
                     action_noise=ou_noise, tensorboard_log=tensorboard_log)
    else:
        model = DDPG.load(load_weights, env=env)

    render_callback = WandbRenderEnvCallback(model_name="ddpg", env_name=env_id)

    model.learn(total_timesteps=timesteps, log_interval=log_interval,
                callback=render_callback)
    save_model_weights(model, "ddpg", env_id, policy, seed=seed, path=".")
def train_policy_ddpg(env,
                      policy,
                      policy_args,
                      total_timesteps,
                      verbose=0,
                      actor_lr=.5,
                      critic_lr=.001):
    """
    Create a DDPG model with Ornstein-Uhlenbeck action noise, train it and
    return it.

    Parameters
    ----------
    env : vectorized set of EncoderWrapper of a TimeLimit wrapper of a restartable env.
    policy : ddpg policy class
    policy_args : dict of keyword arguments for policy class
    total_timesteps : int, how many timesteps to train policy (i.e. 200000)
    verbose : verbosity level passed to DDPG
    actor_lr : actor learning rate passed to DDPG
    critic_lr : critic learning rate passed to DDPG

    Returns
    -------
    The trained DDPG model.
    """
    action_dim = env.action_space.shape[-1]
    ou_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(action_dim),
        sigma=float(0.5) * np.ones(action_dim))

    ddpg_kwargs = dict(
        verbose=verbose,
        param_noise=None,
        action_noise=ou_noise,
        policy_kwargs=policy_args,
        actor_lr=actor_lr,
        critic_lr=critic_lr,
    )
    trained = DDPG(policy, env, **ddpg_kwargs)
    trained.learn(total_timesteps)
    return trained
Ejemplo n.º 8
0
 def explore(app,
             emulator,
             appium,
             timesteps,
             timer,
             save_policy,
             policy_dir,
             cycle,
             nb_train_steps=10,
             random_exploration=0.7):
     """Run one DDPG exploration cycle on ``app``.

     Returns True when training (and the optional policy save) completed.
     On any exception Appium is restarted, the emulator is restarted when
     one was given, and False is returned.
     """
     try:
         wrapped_app = TimeFeatureWrapper(app)
         agent = DDPG(MlpPolicy,
                      wrapped_app,
                      verbose=1,
                      random_exploration=random_exploration,
                      nb_train_steps=nb_train_steps)
         agent.learn(total_timesteps=timesteps,
                     callback=TimerCallback(timer=timer))
         if save_policy:
             agent.save(f'{policy_dir}{os.sep}{cycle}')
     except Exception:
         # Best-effort recovery: bring the test infrastructure back up.
         appium.restart_appium()
         if emulator is not None:
             emulator.restart_emulator()
         return False
     return True
Ejemplo n.º 9
0
def main(env: PSMCartesianDDPGEnv):
    """Train DDPG on ``env`` with periodic checkpoints, then save the model."""
    # NOTE(review): the action dimension is read from
    # ``env.action.action_space`` rather than ``env.action_space`` -- confirm
    # that the environment really exposes an ``action`` attribute.
    action_dim = env.action.action_space.shape[0]
    ou_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(action_dim),
        sigma=float(0.5) * np.ones(action_dim))

    ddpg_kwargs = dict(
        gamma=0.95,
        verbose=1,
        nb_train_steps=300,
        nb_rollout_steps=150,
        param_noise=None,
        batch_size=128,
        action_noise=ou_noise,
        random_exploration=0.05,
        normalize_observations=True,
        tensorboard_log="./ddpg_dvrk_tensorboard/",
        observation_range=(-1.5, 1.5),
        critic_l2_reg=0.01,
    )
    agent = DDPG(MlpPolicy, env, **ddpg_kwargs)

    # Checkpoints are written into the tensorboard folder.
    checkpoints = CheckpointCallback(save_freq=100000,
                                     save_path="./ddpg_dvrk_tensorboard/")
    agent.learn(total_timesteps=4000000, log_interval=100,
                callback=checkpoints)
    agent.save("./ddpg_robot_env")
def main():
    """Train DDPG on the iCub pushing environment and save the final model.

    Relies on ``log_dir`` and ``callback`` being defined outside of this
    function. Monitor logs go to ``<log_dir>/log``, tensorboard logs to
    ``<log_dir>/tb`` and the trained model to ``<log_dir>/final_model.pkl``.
    """
    # create Environment
    env = iCubPushGymEnv(urdfRoot=robot_data.getDataPath(), renders=False, useIK=1,
                        isDiscrete=0, rnd_obj_pose=0, maxSteps=2000, reward_type=0)

    # set seed
    seed = 1
    tf.reset_default_graph()
    # NOTE(review): stable_baselines names this helper ``set_global_seeds``;
    # confirm that ``set_global_seed`` is really defined in this module.
    set_global_seed(seed)
    env.seed(seed)

    # set log
    monitor_dir = os.path.join(log_dir,'log')
    os.makedirs(monitor_dir, exist_ok=True)
    env = Monitor(env, monitor_dir+'/', allow_early_resets=True)

    # create agent model
    nb_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(nb_actions), sigma=float(0.5373) * np.ones(nb_actions))

    model = DDPG('LnMlpPolicy', env, action_noise=action_noise, gamma=0.99, batch_size=16,
                normalize_observations=True,normalize_returns=False, memory_limit=100000,
                verbose=1, tensorboard_log=os.path.join(log_dir,'tb'),full_tensorboard_log=False)

    #start learning
    model.learn(total_timesteps=500000, seed=seed, callback=callback)

    # save model
    print("Saving model.pkl to ",log_dir)
    # The trained object is ``model``; the previous ``act.save(...)`` referred
    # to a name that is never defined in this function.
    model.save(log_dir+"/final_model.pkl")
Ejemplo n.º 11
0
def run_agent(envs, parameters):
    '''Train an agent on ``envs`` and save it to ``parameters['model_path']``.

    ``parameters['alg']`` selects PPO ('PPO'), A2C ('A2C') or DDPG (anything
    else). The environment is closed and the model is saved whether or not
    training finished normally.
    '''
    alg, learning_rate, gamma, model_path = (
        parameters[key]
        for key in ('alg', 'learning_rate', 'gamma', 'model_path')
    )
    set_global_seeds(parameters.get('seed'))
    vec_env = OptVecEnv(envs)

    if alg == 'PPO':
        model = PPO2(MlpPolicy, vec_env, gamma=gamma,
                     learning_rate=learning_rate, verbose=1,
                     nminibatches=vec_env.num_envs)
    elif alg == 'A2C':
        model = A2C(MlpPolicy, vec_env, gamma=gamma,
                    learning_rate=learning_rate, verbose=1)
    else:
        # DDPG: the actor learns ten times slower than the critic.
        model = DDPG(ddpg.MlpPolicy, vec_env, gamma=gamma, verbose=1,
                     actor_lr=learning_rate / 10,
                     critic_lr=learning_rate)

    try:
        model.learn(total_timesteps=parameters.get('total_timesteps', 10**6))
    except tf.errors.InvalidArgumentError:
        LOGGER.error('Possible Nan, %s', str((alg, learning_rate, gamma)))
    finally:
        vec_env.close()
        model.save(str(model_path))
Ejemplo n.º 12
0
    def train_DDPG(self, model_name, model_params=config.DDPG_PARAMS):
        """Train a DDPG model on ``self.env``, save it and return it.

        ``model_params`` must provide the keys 'batch_size', 'buffer_size',
        'verbose' and 'timesteps'. The model is saved as
        ``<config.TRAINED_MODEL_DIR>/<model_name>``.
        """
        from stable_baselines import DDPG
        from stable_baselines.ddpg.policies import DDPGPolicy
        from stable_baselines.common.noise import OrnsteinUhlenbeckActionNoise

        train_env = self.env

        # Ornstein-Uhlenbeck action noise, one component per action dimension.
        action_dim = train_env.action_space.shape[-1]
        ou_noise = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(action_dim),
            sigma=float(0.5) * np.ones(action_dim))

        started = time.time()
        agent = DDPG('MlpPolicy',
                     train_env,
                     batch_size=model_params['batch_size'],
                     buffer_size=model_params['buffer_size'],
                     param_noise=None,
                     action_noise=ou_noise,
                     verbose=model_params['verbose'])
        agent.learn(total_timesteps=model_params['timesteps'])
        finished = time.time()

        agent.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
        print('Training time (DDPG): ', (finished - started) / 60, ' minutes')
        return agent
Ejemplo n.º 13
0
def train_agent_with_ddpg(load):
    """Return a DDPG model for 'F16GCAS-v0'.

    When ``load`` is falsy the model is pretrained on the expert dataset
    "./lqr_export.npz" and saved; otherwise the previously saved model is
    loaded from the same location.
    """
    from stable_baselines.ddpg.policies import FeedForwardPolicy
    from stable_baselines.common.vec_env import DummyVecEnv
    from stable_baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise
    from stable_baselines import DDPG

    # Create and wrap the environment
    env = gym.make('F16GCAS-v0')
    env = DummyVecEnv([lambda: env])

    # Ornstein-Uhlenbeck action noise for DDPG
    action_dim = env.action_space.shape[-1]
    ou_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(action_dim),
        sigma=float(0.01) * np.ones(action_dim))

    class CustomPolicy(FeedForwardPolicy):
        """MLP policy with two hidden layers of 128 units, no layer norm."""

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs,
                             layers=[128, 128],
                             layer_norm=False,
                             feature_extraction="mlp")

    model = DDPG(CustomPolicy, env, verbose=1, action_noise=ou_noise)

    if load:
        model = DDPG.load(ROOT+"/trained_models/TDRL/f16/ddpg/128_128", policy=CustomPolicy, env=env)
    else:
        expert_data = ExpertDataset("./lqr_export.npz")
        model.pretrain(expert_data, n_epochs=100)
        model.save(ROOT+"/trained_models/TDRL/f16/ddpg/128_128")

    return model
Ejemplo n.º 14
0
def main(env):
    """Pretrain a DDPG model on expert data and save it to "./gail_robot_env"."""
    action_dim = env.action_space.shape[0]
    ou_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(action_dim),
        sigma=float(0.5) * np.ones(action_dim))

    # Only one expert trajectory is used;
    # `traj_limitation=-1` would use the whole dataset.
    file_dir = "/home/vignesh/Thesis_Suture_data/trial2/ambf_data/"
    expert_data = ExpertDataset(expert_path=file_dir + 'expert_psm_data.npz',
                                traj_limitation=1,
                                batch_size=32)

    ddpg_kwargs = dict(
        gamma=0.95,
        verbose=1,
        nb_train_steps=300,
        nb_rollout_steps=150,
        param_noise=None,
        batch_size=128,
        action_noise=ou_noise,
        random_exploration=0.05,
        normalize_observations=True,
        tensorboard_log="./ddpg_dvrk_tensorboard/",
        observation_range=(-1.5, 1.5),
    )
    agent = DDPG(MlpPolicy, env, **ddpg_kwargs)

    agent.pretrain(expert_data, n_epochs=1000)
    agent.save("./gail_robot_env")
def test_ddpg_normalization():
    """
    Check that the observation and return normalization parameters are the
    same before saving a model and after loading it again.
    """
    adaptive_noise = AdaptiveParamNoiseSpec(initial_stddev=0.05,
                                            desired_action_stddev=0.05)
    ddpg_kwargs = dict(
        memory_limit=50000,
        normalize_observations=True,
        normalize_returns=True,
        nb_rollout_steps=128,
        nb_train_steps=1,
        batch_size=64,
        param_noise=adaptive_noise,
    )
    model = DDPG('MlpPolicy', 'Pendulum-v0', **ddpg_kwargs)
    model.learn(1000)

    # Normalization parameters of the freshly trained model.
    saved_obs = model.sess.run(model.obs_rms_params)
    saved_ret = model.sess.run(model.ret_rms_params)
    model.save('./test_ddpg.zip')

    # The same parameters after a save/load round trip.
    loaded_model = DDPG.load('./test_ddpg.zip')
    loaded_obs = loaded_model.sess.run(loaded_model.obs_rms_params)
    loaded_ret = loaded_model.sess.run(loaded_model.ret_rms_params)

    before = saved_obs + saved_ret
    after = loaded_obs + loaded_ret
    for expected, actual in zip(before, after):
        assert np.allclose(expected, actual)

    del model, loaded_model

    if os.path.exists("./test_ddpg.zip"):
        os.remove("./test_ddpg.zip")
Ejemplo n.º 16
0
def train_DDPG(env_train, model_name, timesteps=50000):
    """Train a default DDPG model on ``env_train``, save it and return it.

    The model is saved as ``<config.TRAINED_MODEL_DIR>/<model_name>`` and the
    elapsed training time is printed in minutes.
    """
    started = time.time()
    agent = DDPG('MlpPolicy', env_train)
    agent.learn(total_timesteps=timesteps)
    finished = time.time()

    save_path = f"{config.TRAINED_MODEL_DIR}/{model_name}"
    agent.save(save_path)
    print('Training time (DDPG): ', (finished - started) / 60, ' minutes')
    return agent
def launchAgent(model_name: str):
    """
    Train the selected model for 1000 cycles, saving after every cycle and
    reloading the previous checkpoint at the start of the next one.

    :param model_name: kind of model to run. Must be "HER", "DDPG", "PPO2";
                        any other value selects DQN.
                        For now this is intended to be set to "PPO2".
    :return: the model after the 1000 cycles have been run
    """
    import Reinforcement_AI.env.e_enhanced_image_env as image_env
    from stable_baselines import DQN, HER, DDPG, PPO2
    from stable_baselines.common import make_vec_env

    n_cycles = 1000

    print("Current Env is " + model_name)

    # The branches are mutually exclusive. With independent ``if`` statements
    # the trailing ``else`` also ran for "HER" and "DDPG" and replaced their
    # model with DQN.
    if model_name == "HER":
        model_class = HER
        env = image_env.DetailedMiniMapEnv()
        model = HER("CnnPolicy", env=env, model_class=DQN)
    elif model_name == "DDPG":
        model_class = DDPG
        env = image_env.DDPGImageEnv()
        model = DDPG(policy="CnnPolicy", env=env, normalize_observations=True)
    elif model_name == "PPO2":
        model_class = PPO2
        env = make_vec_env(image_env.DetailedMiniMapEnv, n_envs=1)
        model = PPO2(policy="CnnPolicy", env=env, verbose=1)
    else:
        model_class = DQN
        env = image_env.DetailedMiniMapEnv()
        model = DQN(
            "CnnPolicy",  # policy
            env=env,  # environment
            double_q=True,  # Double Q enable
            prioritized_replay=True,  # Replay buffer enabled
            verbose=0  # log print
        )

    # Checkpoints are loaded from the same name they were saved under, so the
    # DQN fallback also works when model_name is not literally "DQN".
    checkpoint_prefix = "detailedmap_" + model_name + "_"

    for i in range(n_cycles):
        if i != 0:
            model = model_class.load(checkpoint_prefix + str(i), env)

        # Comment translated from the original: minimum value for when the
        # FPS goes above 130.
        model.learn(total_timesteps=12500)
        print("this model is : detailedmap_" + model_name + "_" + str(i + 1))

        model.save(checkpoint_prefix + str(i + 1))
        # Drop the model before the next reload, but keep the last one so it
        # can be returned (deleting it made ``return model`` fail).
        if i + 1 < n_cycles:
            del model

    return model
Ejemplo n.º 18
0
def ddpg(env, seed):
    """Build (without training) a DDPG model for ``env`` seeded with ``seed``.

    The model uses Ornstein-Uhlenbeck action noise with sigma 0.1 and logs
    to "./data/runs".
    """
    action_dim = env.action_space.shape[-1]
    ou_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(action_dim),
        sigma=float(0.1) * np.ones(action_dim))

    model = DDPG('MlpPolicy', env, action_noise=ou_noise, verbose=1,
                 tensorboard_log="./data/runs", seed=seed)
    return model
Ejemplo n.º 19
0
def DDPGAgent(multi_stock_env, num_episodes):
    """Train DDPG on ``multi_stock_env``, then run it and plot the results.

    The trained model is saved, reloaded and stepped ``num_episodes`` times.
    After every step the portfolio value reported in ``info[0]['cur_val']``
    is recorded. The recorded values are saved, summarised and plotted.

    :param multi_stock_env: environment to train and evaluate on.
    :param num_episodes: number of evaluation steps to run.
    """
    models_folder = 'saved_models'
    rewards_folder = 'saved_rewards'
    model_path = f'{models_folder}/rl/ddpg.h5'
    rewards_path = f'{rewards_folder}/rl/ddpg.npy'

    env = DummyVecEnv([lambda: multi_stock_env])

    # the noise objects for DDPG
    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))

    # Hyper parameters
    GAMMA = 0.99
    TAU = 0.001
    BATCH_SIZE = 16
    ACTOR_LEARNING_RATE = 0.0001
    CRITIC_LEARNING_RATE = 0.001
    BUFFER_SIZE = 500

    print("\nRunning DDPG Agent...\n")
    model = DDPG(MlpPolicy, env,
                 gamma=GAMMA, tau=TAU, batch_size=BATCH_SIZE,
                 actor_lr=ACTOR_LEARNING_RATE, critic_lr=CRITIC_LEARNING_RATE,
                 buffer_size=BUFFER_SIZE, verbose=1,
                 param_noise=param_noise, action_noise=action_noise)
    model.learn(total_timesteps=50000)
    model.save(model_path)

    del model

    model = DDPG.load(model_path)
    obs = env.reset()
    portfolio_value = []

    for e in range(num_episodes):
        action, _states = model.predict(obs)
        # Feed the new observation back into the next prediction. It used to
        # be stored in an unused variable, so the policy was always queried
        # with the initial observation.
        obs, reward, done, info = env.step(action)
        print(f"episode: {e + 1}/{num_episodes}, episode end value: {info[0]['cur_val']:.2f}")
        portfolio_value.append(round(info[0]['cur_val'], 3))

    # save portfolio value for each episode
    np.save(rewards_path, portfolio_value)

    print("\nDDPG Agent run complete and saved!")

    a = np.load(rewards_path)

    print(f"\nCumulative Portfolio Value Average reward: {a.mean():.2f}, Min: {a.min():.2f}, Max: {a.max():.2f}")
    plt.plot(a)
    plt.title("Portfolio Value Per Episode (DDPG)")
    plt.ylabel("Portfolio Value")
    plt.xlabel("Episodes")
    plt.show()
Ejemplo n.º 20
0
def train_ddpg():
    """Train DDPG on the gimbal environment and save the model.

    A second gimbal environment is passed to DDPG as ``eval_env``. The model
    is saved to "./models/baseline_ddpg_t2".
    """
    env = gimbal(5, 500)
    env = DummyVecEnv([lambda: env])
    eval_env = gimbal(5, 500)
    eval_env = DummyVecEnv([lambda: eval_env])

    # Read as in the original code; the value itself is not used because
    # neither parameter noise nor action noise is configured.
    _n_actions = env.action_space.shape[-1]

    ddpg_kwargs = dict(
        policy=MlpPolicy,
        env=env,
        gamma=0.99,
        memory_policy=None,
        eval_env=eval_env,
        nb_train_steps=500,
        nb_rollout_steps=500,
        nb_eval_steps=500,
        param_noise=None,
        action_noise=None,
        normalize_observations=False,
        tau=0.001,
        batch_size=128,
        param_noise_adaption_interval=50,
        normalize_returns=False,
        enable_popart=False,
        observation_range=(-5000.0, 5000.0),
        critic_l2_reg=0.0,
        return_range=(-inf, inf),
        actor_lr=0.0001,
        critic_lr=0.001,
        clip_norm=None,
        reward_scale=1.0,
        render=False,
        render_eval=False,
        memory_limit=50000,
        verbose=1,
        tensorboard_log="./logs",
        _init_setup_model=True,
        policy_kwargs=None,
        full_tensorboard_log=False,
    )
    model = DDPG(**ddpg_kwargs)

    learn_kwargs = dict(
        total_timesteps=1000000,
        callback=None,
        seed=None,
        log_interval=100,
        tb_log_name='DDPG',
        reset_num_timesteps=True,
    )
    model.learn(**learn_kwargs)
    model.save("./models/baseline_ddpg_t2")
Ejemplo n.º 21
0
    def run(self):
        """Wrap the environment for the chosen objective and train an RL agent.

        ``self.objective`` selects the wrapper ('infogain' or 'prederr') and
        ``self.rl_algo`` the algorithm ('ddpg' or 'sac'); any other value
        raises AttributeError. Training stops early when
        MaxEpisodesReachedException is raised.
        """
        self._init()

        env = self.env
        model = self.model
        objective = self.objective

        # Pick the wrapper class that implements the exploration objective.
        if objective == "infogain":
            wrapper_cls = InfogainEnv
        elif objective == "prederr":
            wrapper_cls = PrederrEnv
        else:
            raise AttributeError(
                "Objective '{}' is unknown. Needs to be 'infogain' or 'prederr'"
                .format(objective))
        wenv = wrapper_cls(env, model)

        wenv.max_episode_len = self.horizon
        wenv.end_episode_callback = self._end_episode
        vec_env = DummyVecEnv([lambda: wenv])

        algo = self.rl_algo
        if algo == "ddpg":
            self.logger.info("Setting up DDPG as model-free RL algorithm.")
            param_noise = AdaptiveParamNoiseSpec()
            action_noise = NormalActionNoise(np.array([0]), np.array([1]))
            rl_model = DDPG(DDPGMlpPolicy,
                            vec_env,
                            verbose=1,
                            render=False,
                            action_noise=action_noise,
                            param_noise=param_noise,
                            nb_rollout_steps=self.horizon,
                            nb_train_steps=self.horizon)
        elif algo == "sac":
            self.logger.info("Setting up SAC as model-free RL algorithm.")
            rl_model = SAC(SACMlpPolicy,
                           vec_env,
                           verbose=1,
                           learning_starts=self.horizon)
        else:
            raise AttributeError(
                "Model-free RL algorithm '{}' is unknown.".format(
                    self.rl_algo))

        # Train the agent
        step_budget = self.horizon * self.n_episodes * 100
        try:
            self.logger.info("Start the agent")
            rl_model.learn(total_timesteps=step_budget, seed=self.seed)
        except MaxEpisodesReachedException:
            print("Exploration finished.")
def test_ddpg_eval_env():
    """
    Smoke test: DDPG can be created and trained when a separate evaluation
    environment is passed.
    """
    evaluation_env = gym.make("Pendulum-v0")
    ddpg_kwargs = dict(
        nb_rollout_steps=5,
        nb_train_steps=2,
        nb_eval_steps=10,
        eval_env=evaluation_env,
        verbose=0,
    )
    agent = DDPG("MlpPolicy", "Pendulum-v0", **ddpg_kwargs)
    agent.learn(1000)
Ejemplo n.º 23
0
def main(output_folder_path: Path):
    """Train (or resume) a DDPG agent on the 'roar-local-planner-v0' env.

    The latest model found by ``find_latest_model`` is resumed if there is
    one; otherwise a new ``CnnPolicy`` model is created. Tensorboard logs and
    checkpoints go under ``output_folder_path``; the final model is saved in
    the current working directory.

    :param output_folder_path: folder holding the "tensorboard" and
        "checkpoints" sub-folders (created if missing).
    """
    # Set gym-carla environment
    agent_config = AgentConfig.parse_file(
        Path("configurations/agent_configuration.json"))
    carla_config = CarlaConfig.parse_file(
        Path("configurations/carla_configuration.json"))

    params = {
        "agent_config": agent_config,
        "carla_config": carla_config,
        "ego_agent_class": RLLocalPlannerAgent,
        "max_collision": 5,
    }

    env = gym.make('roar-local-planner-v0', params=params)
    env.reset()

    model_params: dict = {
        "verbose": 1,
        "render": True,
        "env": env,
        "n_cpu_tf_sess": None,
        "buffer_size": 1000,
        "nb_train_steps": 50,
        "nb_rollout_steps": 100,
        # "nb_eval_steps": 50,
        "batch_size": 32,
    }
    latest_model_path = find_latest_model(Path(output_folder_path))
    if latest_model_path is None:
        model = DDPG(CnnPolicy, **model_params)
    else:
        model = DDPG.load(latest_model_path, **model_params)
    tensorboard_dir = (output_folder_path / "tensorboard")
    ckpt_dir = (output_folder_path / "checkpoints")
    tensorboard_dir.mkdir(parents=True, exist_ok=True)
    ckpt_dir.mkdir(parents=True, exist_ok=True)
    model.tensorboard_log = tensorboard_dir.as_posix()
    model.render = True
    logging_callback = LoggingCallback(model=model)
    checkpoint_callback = CheckpointCallback(save_freq=1000,
                                             verbose=2,
                                             save_path=ckpt_dir.as_posix())
    event_callback = EveryNTimesteps(n_steps=100, callback=checkpoint_callback)
    # NOTE(review): checkpoint_callback is registered both directly and via
    # event_callback -- confirm that the double registration is intended.
    callbacks = CallbackList(
        [checkpoint_callback, event_callback, logging_callback])
    model = model.learn(total_timesteps=int(1e10),
                        callback=callbacks,
                        reset_num_timesteps=False)
    # str(datetime.now()) contains spaces, colons and a '.', which makes an
    # awkward file name (colons are not allowed in Windows file names), so a
    # filesystem-safe timestamp is used instead.
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    model.save(f"local_planner_ddpg_{timestamp}")
def train_DDPG(env_train, model_name, timesteps=10000):
    """Train DDPG with Ornstein-Uhlenbeck action noise, save and return it.

    The model is saved as ``<config.TRAINED_MODEL_DIR>/<model_name>`` and the
    elapsed training time is printed in minutes.
    """
    # Action noise with one component per action dimension; no parameter noise.
    action_dim = env_train.action_space.shape[-1]
    ou_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(action_dim),
        sigma=float(0.5) * np.ones(action_dim))

    started = time.time()
    agent = DDPG('MlpPolicy', env_train, param_noise=None,
                 action_noise=ou_noise)
    agent.learn(total_timesteps=timesteps)
    finished = time.time()

    save_path = f"{config.TRAINED_MODEL_DIR}/{model_name}"
    agent.save(save_path)
    print('Training time (DDPG): ', (finished - started) / 60, ' minutes')
    return agent
Ejemplo n.º 25
0
    def __call__(self, trial):
        """Train a model with trial-suggested settings and roll it out on test data.

        The algorithm is suggested by ``trial`` (currently only 'TD3' is
        offered), trained on an environment built from ``self.train_data``
        and then run on an environment built from ``self.test_data``,
        collecting the summed reward of every finished episode.

        NOTE(review): the visible code collects ``rewards`` but never returns
        a value -- confirm what the objective is supposed to return.
        """
        # Calculate an objective value by using the extra arguments.
        env_id = 'gym_custom:fooCont-v0'
        env = gym.make(env_id, data=self.train_data)
        env = DummyVecEnv([lambda: env])
        algo = trial.suggest_categorical('algo', ['TD3'])
        model = 0
        if algo == 'PPO2':

            policy_choice = trial.suggest_categorical('policy', [False, True])
            policy = commonMlp if policy_choice else commonMlpLstm
            model_params = optimize_ppo2(trial)

            model = PPO2(policy, env, verbose=0, nminibatches=1, **model_params)
            model.learn(276*7000)

        elif algo == 'DDPG':
            # NOTE(review): policy_choice is suggested but the policy is
            # always ddpgLnMlp -- confirm this is intended.
            policy_choice = trial.suggest_categorical('policy', [False, True])
            policy = ddpgLnMlp
            model_params = sample_ddpg_params(trial)

            model = DDPG(policy, env, verbose=0, **model_params)
            model.learn(276*7000)

        elif algo == 'TD3':
            policy_choice = trial.suggest_categorical('policy', [False, True])
            policy = td3MLP if policy_choice else td3LnMlp
            model_params = sample_td3_params(trial)

            model = TD3(policy, env, verbose=0, **model_params)
            model.learn(276*7000*3)

        rewards = []
        reward_sum = 0.0
        env = gym.make(env_id, data=self.test_data)
        env = DummyVecEnv([lambda: env])

        obs = env.reset()
        for ep in range(1000):
            for step in range(276):
                action, _ = model.predict(obs)
                obs, reward, done, _ = env.step(action)
                reward_sum += reward

                if done:
                    # Episode finished: record its return and start over.
                    # (These lines were inconsistently indented, which is an
                    # IndentationError.)
                    rewards.append(reward_sum)
                    reward_sum = 0.0
                    obs = env.reset()
Ejemplo n.º 26
0
def main():
    """Train DDPG on the continuous KukaDiverseObjectEnv and save the model."""
    env1 = KukaDiverseObjectEnv(renders=True, isDiscrete=False)
    model = DDPG(MlpPolicy, env1, verbose=1)

    model.learn(total_timesteps=500000)

    # Print the name the model is actually saved under (the message used to
    # mention "kukadiverse_model.pkl").
    save_path = "kukadiversecont_model.pkl"
    print("Saving model to " + save_path)
    model.save(save_path)
    # NOTE: the body used to end with an unconditional call to main(), which
    # made the function recurse (and retrain) without end. Invoke main() from
    # the script entry point instead, e.g. under `if __name__ == "__main__":`.
Ejemplo n.º 27
0
def main(output_folder_path: Path):
    """Train, or resume training of, a DDPG agent on 'roar-local-planner-v1'.

    Loads the agent and Carla configurations from the ``configurations``
    folder, creates the gym environment, then either builds a fresh DDPG model
    or reloads the latest one reported by ``find_latest_model``. Training runs
    with checkpoint, every-N-timesteps and logging callbacks, and the final
    model is saved under a timestamped name in the current directory.

    Args:
        output_folder_path: Folder handed to ``prep_dir`` (which returns the
            tensorboard and checkpoint directories) and searched by
            ``find_latest_model`` for a model to resume from.
    """
    # Set gym-carla environment
    agent_config = AgentConfig.parse_file(
        Path("configurations/agent_configuration.json"))
    carla_config = CarlaConfig.parse_file(
        Path("configurations/carla_configuration.json"))

    params = {
        "agent_config": agent_config,
        "carla_config": carla_config,
        "ego_agent_class": RLLocalPlannerAgent,
        "max_collision": 5,
    }

    env = gym.make('roar-local-planner-v1', params=params)
    env.reset()

    tensorboard_dir, ckpt_dir = prep_dir(output_folder_path)
    model_params: dict = {
        "verbose": 1,
        "render": True,
        "env": env,
        "n_cpu_tf_sess": 2,
        "buffer_size": 10,
        "random_exploration": 0.1,
        # full tensorboard log can take up space quickly
        "tensorboard_log": tensorboard_dir.as_posix(),
    }

    # Resume from the most recent saved model when one exists.
    latest_model_path = find_latest_model(Path(output_folder_path))
    if latest_model_path is None:
        model = DDPG(LnMlpPolicy, **model_params)
    else:
        model = DDPG.load(latest_model_path, **model_params)

    logging_callback = LoggingCallback(model=model)
    checkpoint_callback = CheckpointCallback(save_freq=1000,
                                             verbose=2,
                                             save_path=ckpt_dir.as_posix())
    event_callback = EveryNTimesteps(n_steps=100, callback=checkpoint_callback)
    # NOTE(review): checkpoint_callback is registered both directly and as the
    # child of event_callback -- confirm that the double registration is
    # intended.
    callbacks = CallbackList(
        [checkpoint_callback, event_callback, logging_callback])

    # reset_num_timesteps=False keeps the timestep counter of a reloaded model.
    model = model.learn(total_timesteps=int(1e10),
                        callback=callbacks,
                        reset_num_timesteps=False)

    # str(datetime.now()) contains a space and ':' characters, which are not
    # valid in Windows file names; use a filesystem-safe timestamp instead.
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    model.save(f"local_planner_v1_ddpg_{timestamp}")
def test_ddpg_popart():
    """
    Smoke-test a short DDPG training run with pop-art normalization enabled.

    The model is built from the 'MlpPolicy' / 'Pendulum-v0' string identifiers
    with Gaussian action noise on a single action dimension, then trained for
    1000 timesteps.
    """
    action_dim = 1
    gaussian_noise = NormalActionNoise(
        mean=np.zeros(action_dim),
        sigma=0.1 * np.ones(action_dim),
    )

    # Keyword arguments are collected first so the constructor call stays short.
    ddpg_kwargs = dict(
        memory_limit=50000,
        normalize_observations=True,
        normalize_returns=True,
        nb_rollout_steps=128,
        nb_train_steps=1,
        batch_size=64,
        action_noise=gaussian_noise,
        enable_popart=True,
    )
    agent = DDPG('MlpPolicy', 'Pendulum-v0', **ddpg_kwargs)
    agent.learn(1000)
Ejemplo n.º 29
0
def ppo1_nmileg_pool(sensory_value):
    """Train and save one Monte Carlo run of the hand-manipulation experiment.

    Creates the 'HandManipulate-v1<sensory_value>' environment wrapped in a
    gym Monitor, builds the model selected by the hard-coded method name
    ("PPO1"), seeds the random generators with the run index, trains for
    500000 timesteps and saves the model inside the run's log directory.

    Args:
        sensory_value: Value appended to the environment id and used in the
            "sensory_<value>" component of the log path.

    Returns:
        None.

    Raises:
        ValueError: If the method name is not "PPO1", "PPO2" or "DDPG".
    """
    algo_name = "PPO1"
    # total_MC_runs = 50
    experiment_id = "handtest_rot_pool_with_MC_C_task0/"
    save_suffix = algo_name
    n_timesteps = 500000
    sensory_tag = "sensory_{}".format(sensory_value)
    first_run = 22  # run indices start from 0

    for run_idx in range(first_run, first_run + 1):
        run_dir = "./logs/{}/MC_{}/{}/{}/".format(
            experiment_id, run_idx, algo_name, sensory_tag)

        # Environment, with a Monitor that records episode statistics only
        # (video recording is disabled).
        env = gym.make('HandManipulate-v1{}'.format(sensory_value))
        env = gym.wrappers.Monitor(
            env,
            run_dir + "Monitor/",
            video_callable=False,
            force=True,
            uid="Monitor_info",
        )

        # Initial model for the selected method.
        if algo_name == "PPO1":
            model = PPO1(common_MlpPolicy, env, verbose=1,
                         tensorboard_log=run_dir)
        elif algo_name == "PPO2":
            env = DummyVecEnv([lambda: env])
            model = PPO2(common_MlpPolicy, env, verbose=1,
                         tensorboard_log=run_dir)
        elif algo_name == "DDPG":
            env = DummyVecEnv([lambda: env])
            action_dim = env.action_space.shape[-1]
            ou_noise = OrnsteinUhlenbeckActionNoise(
                mean=np.zeros(action_dim),
                sigma=float(0.5) * 5 * np.ones(action_dim),
            )
            model = DDPG(DDPG_MlpPolicy, env, verbose=1, param_noise=None,
                         action_noise=ou_noise, tensorboard_log=run_dir)
        else:
            raise ValueError("Invalid RL mode")

        # Seed the random sources with the run index.
        seed = run_idx
        random.seed(seed)
        env.seed(seed)
        env.action_space.seed(seed)
        np.random.seed(seed)
        tf.random.set_random_seed(seed)

        # Train, then store the model next to the run's logs.
        model.learn(total_timesteps=n_timesteps)
        model.save(run_dir + "/model")
Ejemplo n.º 30
0
def setup(model_params, output_folder_path):
    """Create or restore a DDPG model and build its training callbacks.

    Args:
        model_params: Keyword arguments forwarded to ``DDPG(CnnPolicy, ...)``
            when creating a model, or to ``DDPG.load`` when resuming.
        output_folder_path: Output folder (``str`` or ``Path``). It is passed
            to ``find_latest_model`` to look for a model to resume from, and
            its "tensorboard" and "checkpoints" sub-folders are created if
            they are missing.

    Returns:
        A ``(model, callbacks)`` tuple, where ``callbacks`` is a
        ``CallbackList`` holding the checkpoint and logging callbacks.
    """
    # Normalise once: the "/" joins below need a Path, while previously only
    # the find_latest_model call was protected against a plain string.
    output_folder_path = Path(output_folder_path)

    latest_model_path = find_latest_model(output_folder_path)
    if latest_model_path is None:
        print("Creating model...")
        model = DDPG(CnnPolicy, **model_params)
    else:
        print("Loading model...")
        model = DDPG.load(latest_model_path, **model_params)

    tensorboard_dir = output_folder_path / "tensorboard"
    ckpt_dir = output_folder_path / "checkpoints"
    tensorboard_dir.mkdir(parents=True, exist_ok=True)
    ckpt_dir.mkdir(parents=True, exist_ok=True)

    checkpoint_callback = CheckpointCallback(save_freq=200,
                                             verbose=2,
                                             save_path=ckpt_dir.as_posix())
    logging_callback = LoggingCallback(model=model, verbose=1)
    callbacks = CallbackList([checkpoint_callback, logging_callback])
    return model, callbacks