def _init_environment(self, datapath, window_size):

        df = pd.read_csv(datapath)
        bid_price_columns = [i for i in range(1,len(df.columns),20)]
        print(bid_price_columns)
        ask_price_columns = [i for i in range(3,len(df.columns),20)]
        bidPrices = df[df.columns[bid_price_columns]]
        askPrices = df[df.columns[ask_price_columns]]
        df_concat = pd.concat([bidPrices, askPrices])
        midPrices = df_concat.groupby(df_concat.index).mean().transpose().values[-len(self.securities):]
        print(midPrices[:,0])

        self.env = DummyVecEnv([lambda: securities_trading_env(np.array(midPrices).T)])
        self.env = VecCheckNan(self.env, raise_exception=True)

        n_actions = self.env.action_space.shape[-1]
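        # TD3 and DDPG in stable-baselines explore with additive action noise; Ornstein-Uhlenbeck
        # noise produces temporally correlated perturbations, which suits continuous control.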
        param_noise = None
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))
        print(n_actions)

        if(self.policy == "DDPG"):
           self.model = DDPG(ddpgMlpPolicy, self.env, verbose=int(self.verbose), param_noise=param_noise, action_noise= action_noise)
        elif(self.policy=="TD3"):
            self.model = TD3(td3MlpPolicy, self.env, verbose=int(self.verbose))
        elif(self.policy=="GAIL"):
            self.model = TD3(td3MlpPolicy, self.env, verbose=int(self.verbose))
        else:
            self.model = PPO2(MlpLnLstmPolicy, self.env, verbose=int(self.verbose))

        if self.load:  # load model
            self.model = self.model.load("save/" + modelpath + ".h5")

        #init model class
        self.gym_model = Agent(market_event_securities, market_event_queue, securities, queue, host, policy,strategy, cash_balance,self.model,self.env,window_size,self.inventory)
Example n. 2
def td3(env_id,
        timesteps,
        policy="MlpPolicy",
        log_interval=None,
        tensorboard_log=None,
        seed=None,
        load_weights=None):
    from stable_baselines.ddpg.noise import NormalActionNoise
    env = gym.make(env_id)

    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))

    if load_weights is not None:
        model = TD3.load(load_weights, env, verbose=0)
    else:
        model = TD3(policy,
                    env,
                    action_noise=action_noise,
                    verbose=1,
                    tensorboard_log=tensorboard_log)

    callback = WandbRenderEnvCallback(model_name="td3", env_name=env_id)

    model.learn(total_timesteps=timesteps,
                log_interval=log_interval,
                callback=callback)
Example n. 3
def get_TD3_model(model_settings, model_path, ckpt_path, ckpt_step, tb_path):
    policy_kwargs = dict(layers=model_settings['NET_LAYERS'])
    env = get_single_process_env(model_settings, model_path, ckpt_step)
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))
    if ckpt_path is not None:
        print("Loading model from checkpoint '{}'".format(ckpt_path))
        model = TD3.load(ckpt_path,
                         env=env,
                         _init_setup_model=True,
                         policy_kwargs=policy_kwargs,
                         **model_settings['train_configs'],
                         action_noise=action_noise,
                         verbose=1,
                         tensorboard_log=tb_path)
        model.num_timesteps = ckpt_step
    else:
        model = TD3(TD3MlpPolicy,
                    env,
                    _init_setup_model=True,
                    policy_kwargs=policy_kwargs,
                    action_noise=action_noise,
                    **model_settings['train_configs'],
                    verbose=1,
                    tensorboard_log=tb_path)

    return model, env
Example n. 4
def train_TD3(env, out_dir, seed=None, **kwargs):

    # Logs will be saved in log_dir/monitor.csv
    global output_dir,log_dir
    output_dir = out_dir
    log_dir = os.path.join(out_dir, 'log')
    os.makedirs(log_dir, exist_ok=True)
    env = Monitor(env, log_dir+'/', allow_early_resets=True)

    policy = kwargs['policy']
    n_timesteps = kwargs['n_timesteps']
    noise_type = kwargs['noise_type']
    del kwargs['policy']
    del kwargs['n_timesteps']
    del kwargs['noise_type']

    ''' Parameter space noise:
    injects randomness directly into the parameters of the agent, altering the types of decisions it makes
    such that they always fully depend on what the agent currently senses. '''
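    # A hedged aside (not in the original): stable-baselines exposes parameter-space noise via
    # AdaptiveParamNoiseSpec, but only DDPG accepts a param_noise argument; the TD3 code below
    # therefore uses action noise instead.
    #   from stable_baselines.common.noise import AdaptiveParamNoiseSpec
    #   param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.1, desired_action_stddev=0.1)
    #   model = DDPG('MlpPolicy', env, param_noise=param_noise)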

    # the noise objects for TD3
    nb_actions = env.action_space.shape[-1]
    action_noise = None
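    # noise_type is a comma-separated spec of the form '<name>_<stddev>', e.g. 'normal_0.1' or 'ou_0.2'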
    if noise_type is not None:

        for current_noise_type in noise_type.split(','):

            current_noise_type = current_noise_type.strip()

            if 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mean=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))

            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(nb_actions),
                                                            sigma=float(stddev) * np.ones(nb_actions))

            else:
                raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    if 'continue' in kwargs and kwargs['continue'] is True:
        # Continue training; the policy of the loaded model is not changed
        print("Loading pretrained agent")
        # 'policy' was already removed from kwargs above; drop the 'continue' flag as well
        del kwargs['continue']
        model = TD3.load(os.path.join(out_dir, 'final_model.pkl'), env=env,
                         tensorboard_log=os.path.join(log_dir, 'tb'), verbose=1, **kwargs)
    else:
        if 'continue' in kwargs:
            del kwargs['continue']
        model = TD3(policy, env, action_noise=action_noise, seed=seed,
                    verbose=1, tensorboard_log=os.path.join(log_dir, 'tb'), full_tensorboard_log=False, **kwargs)

    model.learn(total_timesteps=n_timesteps, callback=log_callback)

    return model
Example n. 5
def model_training_learning(env_train, model_name, timesteps=100000):

    # train model
    os.chdir("./model_saved/" + model_name)
    start = time.time()
    print("Train ", model_name, " Model with MlpPolicy: ")

    if model_name == "A2C_Model":
        model = A2C('MlpPolicy', env_train, verbose=0)
    elif model_name == "PPO_Model":
        model = PPO2('MlpPolicy', env_train, verbose=0)
    elif model_name == "TD3_Model":
        model = TD3('MlpPolicy', env_train, verbose=0)
    elif model_name == "SAC_Model":
        model = SAC('MlpPolicy', env_train, verbose=0)
    else:
        raise ValueError(f"Unknown model_name: {model_name}")

    print("Learning ", model_name, " time steps: ", timesteps)

    model.learn(total_timesteps=timesteps)
    print("TD3 Model learning completed: ")
    end = time.time()
    timestamp = time.strftime('%b-%d-%Y_%H%M')
    model_file_name = (model_name + timestamp)
    model.save(model_file_name)
    print("- ", model_name, " save finish     :")
    print("Training time  ", model_name, " : ", (end - start) / 60, " minutes")

    os.chdir("./..")
    os.chdir("./..")
    return model
Example n. 6
    def __call__(self):

        policy_kwargs = dict(layers=[400, 300, 200, 100])
        n_actions = self.env.action_space.shape[-1]
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=0.1 * np.ones(n_actions))

        # check_env(self.env)
        model = TD3(MlpPolicy,
                    self.env,
                    policy_kwargs=policy_kwargs,
                    action_noise=action_noise,
                    buffer_size=50000,  # replay buffer capacity; TD3 has no memory_limit argument
                    tensorboard_log=
                    "/home/dfki.uni-bremen.de/mpatil/Documents/baselines_log",
                    verbose=1)

        time_steps = 3e4
        model.learn(total_timesteps=int(time_steps),
                    log_interval=50,
                    tb_log_name="td3_Docker_" + self.expt_name)
        model.save(
            "/home/dfki.uni-bremen.de/mpatil/Documents/td3_stable_baselines_" +
            self.expt_name)

        print("Closing environment")
        self.env.close()
Example n. 7
def load_model(path: str, env, desc: str):
    """ Loads a model from a stable baseline checkpoint file into a memory representation 

    Args:
        path        (str)           :       Path to the Stable Baselines checkpoint file
        env         (SB Env)        :       The Stable Baselines environment to attach to the loaded model
        desc        (str)           :       Text description of which model type this is

    Returns:
        The loaded model (a usage sketch follows this function)
    """

    if desc == "ddpg":
        return DDPG.load(path, env)
    elif desc == "ppo":
        env = DummyVecEnv([lambda: env])
        return PPO2.load(path, env)
    elif desc == "trpo":
        env = DummyVecEnv([lambda: env])
        return TRPO.load(path, env)
    elif desc == "td3":
        return TD3.load(path, env)
    elif desc == "sac":
        return SAC.load(path, env)
    else:
        raise RuntimeError(f"Model Name {desc} not supported")
Example n. 8
def run_stable(num_steps, save_dir):
    env = make_vec_env(BBall3Env,
                       n_envs=1,
                       monitor_dir=save_dir,
                       env_kwargs=env_config)
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.5 * np.ones(n_actions))

    model = TD3(
        MlpPolicy,
        env,
        action_noise=action_noise,
        verbose=1,
        gamma=0.99,
        buffer_size=1000000,
        learning_starts=10000,
        batch_size=100,
        learning_rate=1e-3,
        train_freq=1000,
        gradient_steps=1000,
        policy_kwargs={"layers": [64, 64]},
        n_cpu_tf_sess=1,
    )

    num_epochs = 1
    total_steps = 5e5

    for epoch in range(num_epochs):
        model.learn(total_timesteps=int(total_steps / num_epochs))
        model.save(save_dir + "/model.zip")
Example n. 9
    def deploy_trained_model(self):
        # Load neural network policy
        from stable_baselines3 import TD3
        src_file = os.path.split(
            os.path.split(
                os.path.join(os.path.dirname(
                    os.path.realpath(__file__))))[0])[0]

        try:
            model = TD3.load(
                os.path.join(src_file,
                             "algos/SB/agents/QUAD_TD3_OPTUNA_policy"))
        except Exception:
            model = None
            print("Failed to load nn. ")

        obs = self.reset()
        while True:
            velocity_target = self.get_input_target()

            if self.config["controller_source"] == "nn":
                if model is None:
                    act = np.random.rand(self.act_dim) * 2 - 1
                else:
                    act, _states = model.predict(obs, deterministic=True)
            else:
                act = self.calculate_stabilization_action(
                    obs[3:7], obs[10:13], velocity_target)
            obs, r, done, _ = self.step(act)
            if done: obs = self.reset()
Example n. 10
def train_TD3(env_train,
              model_name,
              model=None,
              timesteps=30000,
              save_path=None):
    """TD3 model"""
    # add the noise objects for TD3
    n_actions = env_train.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))

    start = time.time()

    if model is None:
        model = TD3('MlpPolicy', env_train, action_noise=action_noise)
    else:
        model.set_env(env_train)
        model.verbose = config.VERBOSE

    model.learn(total_timesteps=timesteps)
    end = time.time()

    if save_path is None:
        save_path = f"{config.TRAINED_MODEL_DIR}/{model_name}"
    model.save(save_path)
    print('Training time (TD3): ', (end - start) / 60, ' minutes')
    return model
Example n. 11
    def train_TD3(self, model_name, model_params=config.TD3_PARAMS):
        """TD3 model"""
        from stable_baselines import TD3
        from stable_baselines.common.noise import NormalActionNoise

        env_train = self.env

        n_actions = env_train.action_space.shape[-1]
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=0.1 * np.ones(n_actions))

        start = time.time()
        model = TD3('MlpPolicy',
                    env_train,
                    batch_size=model_params['batch_size'],
                    buffer_size=model_params['buffer_size'],
                    learning_rate=model_params['learning_rate'],
                    action_noise=action_noise,
                    verbose=model_params['verbose'])
        model.learn(total_timesteps=model_params['timesteps'])
        end = time.time()

        model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
        print('Training time (TD3): ', (end - start) / 60, ' minutes')
        return model
Example n. 12
def test_TD3(env, out_dir, seed=None, **kwargs):

  model = TD3.load(os.path.join(out_dir,'final_model.pkl'), env=env)

  #model.learn(total_timesteps=10000)
  # Evaluate the trained agent
  mean_reward = evaluate(env, model, num_steps=5000)

  return
Example n. 13
def test_td3():
    # action_noise is not defined in the original snippet; a plausible stand-in for
    # Pendulum-v0's single-dimensional action space:
    action_noise = NormalActionNoise(mean=np.zeros(1), sigma=0.1 * np.ones(1))
    model = TD3('MlpPolicy',
                'Pendulum-v0',
                policy_kwargs=dict(net_arch=[64, 64]),
                seed=0,
                learning_starts=100,
                verbose=1,
                create_eval_env=True,
                action_noise=action_noise)
    model.learn(total_timesteps=10000, eval_freq=5000)
Example n. 14
def train_TD3(env_train, model_name, timesteps=50000):
    """TD3 model"""

    start = time.time()
    model = TD3('MlpPolicy', env_train)
    model.learn(total_timesteps=timesteps, log_interval=10)
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (TD3): ', (end - start) / 60, ' minutes')
    return model
Example n. 15
def read_model(model_type):

    if model_type == "A2C":
        model = A2C.load(
            "./model_saved/Selected/A2C_ModelMar-05-2021_0815/A2C_ModelMar-05-2021_0815"
        )
    if model_type == "TD3":
        model = TD3.load(
            "./model_saved/Selected/TD3_ModelMar-05-2021_1442/TD3_ModelMar-05-2021_1442"
        )

    return model
Example n. 16
def load_model(config):
    model = None
    if config["algo_name"] == "TD3":
        model = TD3.load("agents/{}".format(args["test_agent_path"]))
    if config["algo_name"] == "A2C":
        model = A2C.load("agents/{}".format(args["test_agent_path"]))
    if config["algo_name"] == "SAC":
        model = SAC.load("agents/{}".format(args["test_agent_path"]))
    if config["algo_name"] == "PPO2":
        model = PPO2.load("agents/{}".format(args["test_agent_path"]))
    assert model is not None, "Alg name not found, cannot load model, exiting. "
    return model
Example n. 17
    def f_checkpoints_range_2_mean_performance(
            self, checkpoints: range) -> Tuple[np.ndarray, np.ndarray]:
        logging.debug(
            f"[f_checkpoints_range_2_mean_performance]: checkpoints={checkpoints}"
        )
        rewards = np.zeros(len(checkpoints))
        s_rates = np.zeros(len(checkpoints))
        # Intent
        # - Iterate over this range, to load the associated Stable Baseline Model Checkpoint
        # - Pass that model to `mean_eval` evaluation function which will evaluate the model on
        #   - a certain number of episodes
        #   - a certain env
        #    - continuous or not continuous space
        # - an evaluation returns reward and average success rate
        #
        # Evaluating N checkpoints on M queries each, then averaging over M, so we end up with N rewards and N success rates

        j = 0
        """ NOTE: i may take arbitrary values from the checkpoints range while j indexes the numpy arrays """
        for i in checkpoints:
            path = f"{self.args.training_base_path}/models/quadcopter-{i}{self.args.suffix}"
            logging.debug(f"Evaluating model at {path}")
            if self.args.model['name'] == "ddpg":
                model = DDPG.load(path)
            elif self.args.model['name'] == "ppo":
                model = PPO2.load(path)
            elif self.args.model['name'] == "trpo":
                model = TRPO.load(path)
            elif self.args.model['name'] == "td3":
                model = TD3.load(path)
            elif self.args.model['name'] == "sac":
                model = SAC.load(path)
            logging.debug(
                f"Evaluating Model {self.args.model['name']} for {self.args.n_episodes} episodes in {self.args.env} environment with continuous={str(self.args.continuous)}"
            )
            rewards_list, success_rates_list = mean_eval(
                num_episodes=self.args.n_episodes,
                checkpoint_id=i,
                model=model,
                env=self.env,
                v=True,
                continuous=self.args.continuous,
                plots_dir=self.args.plots_dir)
            rewards_mean = np.mean(rewards_list)
            success_rates_mean = np.mean(success_rates_list)
            logging.debug(
                f"Evaluation Checkpoint={i} --> Average Reward = {rewards_mean}, Average Success Rate = {success_rates_mean}"
            )
            rewards[j] = rewards_mean
            s_rates[j] = success_rates_mean
            j += 1
        return rewards, s_rates
Example n. 18
def td3(env, seed):
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.1) *
                                                np.ones(n_actions))

    return TD3('MlpPolicy',
               env,
               learning_rate=0.001,
               action_noise=action_noise,
               verbose=1,
               tensorboard_log="./data/runs",
               seed=seed)
Example n. 19
def explore(app, emulator, appium, timesteps, timer, save_policy, policy_dir,
            cycle, train_freq=10, random_exploration=0.8):
    try:
        env = TimeFeatureWrapper(app)
        model = TD3(MlpPolicy, env, verbose=1, train_freq=train_freq, random_exploration=random_exploration)
        callback = TimerCallback(timer=timer)
        model.learn(total_timesteps=timesteps, callback=callback)
        if save_policy:
            model.save(f'{policy_dir}{os.sep}{cycle}')
        return True
    except Exception:
        appium.restart_appium()
        if emulator is not None:
            emulator.restart_emulator()
        return False
Example n. 20
    def __call__(self, trial):
        # Calculate an objective value by using the extra arguments.
        env_id = 'gym_custom:fooCont-v0'
        env = gym.make(env_id, data=self.train_data)
        env = DummyVecEnv([lambda: env])
        algo = trial.suggest_categorical('algo', ['TD3'])
        model = None
        if algo == 'PPO2':

            policy_choice = trial.suggest_categorical('policy', [False, True])
            policy = commonMlp if policy_choice else commonMlpLstm
            model_params = optimize_ppo2(trial)

            model = PPO2(policy, env, verbose=0, nminibatches=1, **model_params)
            model.learn(276*7000)

        elif algo == 'DDPG':
            policy_choice = trial.suggest_categorical('policy', [False, True])
            policy = ddpgLnMlp
            model_params = sample_ddpg_params(trial)

            model= DDPG(policy, env, verbose=0, **model_params)
            model.learn(276*7000)

        elif algo == 'TD3':
            policy_choice = trial.suggest_categorical('policy', [False, True])
            policy = td3MLP if policy_choice else td3LnMlp
            model_params = sample_td3_params(trial)

            model = TD3(policy, env, verbose=0, **model_params)
            model.learn(276*7000*3)

        rewards = []
        reward_sum = 0.0
        env = gym.make(env_id, data=self.test_data)
        env = DummyVecEnv([lambda: env])

        obs = env.reset()
        for ep in range(1000):
            for step in range(276):
                action, _ = model.predict(obs)
                obs, reward, done, _ = env.step(action)
                reward_sum += reward

                if done:
                    rewards.append(reward_sum)
                    reward_sum = 0.0
                    obs = env.reset()
Example n. 21
def do_rollout_stable(init_point=None):
    env = gym.make(env_name, **config)
    td3_model = TD3.load(
        script_path +
        "../rl-baselines-zoo/baseline_log2/td3/su_acrobot_cdc-v0_2/su_acrobot_cdc-v0.zip"
    )

    if init_point is not None:
        obs = env.reset(init_point)
    else:
        obs = env.reset()

    obs = torch.as_tensor(obs, dtype=torch.float32)

    acts_list = []
    obs1_list = []
    rews_list = []

    dtype = torch.float32
    act_size = env.action_space.shape[0]
    obs_size = env.observation_space.shape[0]

    done = False
    cur_step = 0

    while not done:
        acts = td3_model.predict(obs.reshape(-1, obs_size))[0]
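        # the chosen action is repeated for 20 simulator steps below (coarse action repeat)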

        for _ in range(20):
            obs, rew, done, out = env.step(acts)

        # env.render()
        obs1_list.append(obs)
        obs = torch.as_tensor(obs, dtype=dtype)

        acts_list.append(torch.as_tensor(acts))
        rews_list.append(torch.as_tensor(rew, dtype=dtype))
        cur_step += 1

    ep_obs1 = torch.tensor(obs1_list).reshape(-1, 4)
    ep_acts = torch.stack(acts_list).reshape(-1, act_size)
    ep_rews = torch.stack(rews_list).reshape(-1, 1)

    return ep_obs1, ep_acts, ep_rews, None, ep_obs1
Example n. 22
def test_deterministic_td3():
    results = [[], []]
    rewards = [[], []]
    kwargs = {'n_cpu_tf_sess': 1}
    env_id = 'Pendulum-v0'
    kwargs.update({'action_noise': NormalActionNoise(0.0, 0.1)})

    for i in range(2):
        model = TD3('MlpPolicy', env_id, seed=SEED, **kwargs)
        model.learn(N_STEPS_TRAINING)
        env = model.get_env()
        obs = env.reset()
        for _ in range(20):
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, _, _ = env.step(action)
            results[i].append(action)
            rewards[i].append(reward)
    # without the extended tolerance, test fails for unknown reasons on Github...
    assert np.allclose(results[0], results[1], rtol=1e-2), results
    assert np.allclose(rewards[0], rewards[1], rtol=1e-2), rewards
Example n. 23
def optimize_agent(trial):
    """ Train the model and optimise
        Optuna maximises the negative log likelihood, so we
        need to negate the reward here
    """
    model_params = optimize_TD3(trial)
    env = SubprocVecEnv([
        lambda: NormalizeActionWrapper(LearningRocket(visualize=False))
        for i in range(n_cpu)
    ])

    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))

    model = TD3(MlpPolicy,
                env,
                action_noise=action_noise,
                policy_kwargs=dict(layers=[400, 300]))
    model.learn(50000)

    rewards = []
    n_episodes, reward_sum = 0, 0.0

    obs = env.reset()
    step = 0
    while n_episodes < 4:
        step += 1
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        reward_sum += reward
        if done:
            rewards.append(reward_sum)
            reward_sum = 0.0
            n_episodes += 1
            obs = env.reset()

    last_reward = np.mean(rewards)
    trial.report(-1 * last_reward, step)

    return -1 * last_reward
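A minimal sketch (assumed wiring, not from the original) of how the objective above is typically handed to an Optuna study; because it returns the negated mean reward, the default minimisation direction applies:

import optuna

study = optuna.create_study()  # direction='minimize' by default
study.optimize(optimize_agent, n_trials=20)
print(study.best_params)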
Example n. 24
def train_TD3(env_train, model_name, timesteps=100000):

    # train TD3 model
    os.chdir("./model_saved/")
    start = time.time()
    print("Train TD3 Model with MlpPolicy: ")

    model = TD3('MlpPolicy', env_train, verbose=0)
    print("TD3 Learning time steps: ", timesteps)

    model.learn(total_timesteps=timesteps)
    print("TD3 Model learning completed: ")
    end = time.time()
    timestamp = time.strftime('%b-%d-%Y_%H%M')
    model_file_name = (model_name + timestamp)
    model.save(model_file_name)
    print("TD3 Model save finish     :")
    print('Training time TD3: ', (end - start) / 60, ' minutes')

    os.chdir("./..")

    return model
Example n. 25
def td3(env_id,
        timesteps,
        policy="MlpPolicy",
        log_interval=None,
        tensorboard_log=None,
        seed=None):
    from stable_baselines.ddpg.noise import NormalActionNoise
    env = gym.make(env_id)

    # The noise objects for TD3
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))

    model = TD3(policy,
                env,
                action_noise=action_noise,
                verbose=1,
                tensorboard_log=tensorboard_log)
    model.learn(total_timesteps=timesteps, log_interval=log_interval)

    save_model_weights(model, "td3", env_id, policy, seed)
Example n. 26
arg.LR_STOP = 0.1
arg.lr_gamma = 0.95
arg.PI_STD=1
arg.goal_radius_range=[0.1,0.3]
arg.TERMINAL_VEL = 0.025
arg.goal_radius_range=[0.15,0.3]
arg.std_range = [0.02,0.3,0.02,0.3]
arg.TERMINAL_VEL = 0.025  # terminal velocity: the norm(action) that is treated as a signal to stop (0.1)
arg.DELTA_T=0.2
arg.EPISODE_LEN=35

number_updates=100

# convert the trained agent to a torch model
import policy_torch
baselines_mlp_model = TD3.load('trained_agent/accac_final_1000000_9_11_20_25.zip')
agent = policy_torch.copy_mlp_weights(baselines_mlp_model, layers=[512, 512], n_inputs=32)

# loading environment, same as training
env = firefly_accac.FireflyAccAc(arg)
# --- setting the env for inverse ----
# TODO, move it to a function of env
env.agent_knows_phi=False


for i in range(10):
    filename=(str(time.localtime().tm_mday)+'_'+str(time.localtime().tm_hour)+'_'+str(time.localtime().tm_min))
    single_theta_inverse(arg, env, agent, filename, 
                    number_updates=number_updates,
                    true_theta=None, 
                    phi=None,
Example n. 27
        stepIdx, currIt = 0, 0

        try:

            # model = PPO2.load(f'rsu_agents/{scenario_name}_agents/'
            #                   f'PPO2_ns3_online_{scenario_name}_cars={num_of_vehicles}')

            # model = PPO2.load(
            #     (f'rsu_agents/single_lane_highway_agents/optimized_interval/PPO2_ns3_single_lane_highway_cars=25_optimized'))

            # model = SAC.load(
            #     (f'rsu_agents/single_lane_highway_agents/optimized_interval/SAC_ns3_single_lane_highway_cars=25_optimized'))

            model = TD3.load(
                f'rsu_agents/single_lane_highway_agents/optimized_interval/TD3_ns3_single_lane_highway_cars=25_optimized'
            )

            # model = PPO2.load(
            #     (f'rsu_agents/square_agents/optimized_interval/PPO2_ns3_square_cars=25_optimized'))

            # model = SAC.load(
            #     (f'rsu_agents/square_agents/optimized_interval/SAC_ns3_square_cars=25_optimized'))

            # model = TD3.load(
            #     f'rsu_agents/square_agents/optimized_interval/TD3_ns3_square_cars=25_optimized')

            while True:
                print("Start iteration: ", currIt)
                obs = env.reset()
                reward = 0
Example n. 28
def train_initial_policy(
        model_name,
        algo=ALGO,
        env_name=ENV_NAME,
        time_steps=TIME_STEPS):
    """Uses the specified algorithm on the target environment"""
    print("Using algorithm : ", algo.__name__)
    print("Model saved as : ", "data/models/" +algo.__name__+"_initial_policy_"+env_name+"_.pkl")

    # define the environment here
    env = gym.make(env_name)
    env.seed(SEED)
    if NOISE_VALUE > 0: env = NoisyRealEnv(env, noise_value=NOISE_VALUE)

    if MUJOCO_NORMALIZE:
        env = MujocoNormalized(env)

    print('~~ ENV Obs RANGE : ', env.observation_space.low, env.observation_space.high)
    print('~~~ ENV Action RANGE : ', env.action_space.low, env.action_space.high)

    if algo.__name__  == "ACKTR":
        print('Using SubprocVecEnv')
        env = SubprocVecEnv([lambda: env for i in range(8)])
    elif algo.__name__ == "SAC":
        print('Using standard gym environment')
        env = env
    else:
        print('Using Dummy Vec Env')
        env = DummyVecEnv([lambda : env])

    if NORMALIZE :
        env = VecNormalize(env,
                           training=True,
                           norm_obs=True,
                           norm_reward=False,
                           clip_reward=1e6,
                           )


    with open('data/target_policy_params.yaml') as file:
        args = yaml.load(file, Loader=yaml.FullLoader)
    args = args[algo.__name__][PARAMS_ENV]
    print('~~ Loaded args file ~~')

    if algo.__name__ == "SAC":
        print('Initializing SAC with RLBaselinesZoo hyperparameters .. ')
        print('using 256 node architecture as in the paper')

        class CustomPolicy(ffp_sac):
            def __init__(self, *args, **kwargs):
                super(CustomPolicy, self).__init__(*args, **kwargs,
                                                   feature_extraction="mlp", layers=[256, 256])

        model = SAC(CustomPolicy, env,
                    verbose=1,
                    tensorboard_log='data/TBlogs/initial_policy_training',
                    batch_size=args['batch_size'],
                    buffer_size=args['buffer_size'],
                    ent_coef=args['ent_coef'],
                    learning_starts=args['learning_starts'],
                    learning_rate=args['learning_rate'],
                    train_freq=args['train_freq'],
                    seed=SEED,
                    )
    elif algo.__name__ == "TD3":
        print('Initializing TD3 with RLBaselinesZoo hyperparameters .. ')
        # hyperparameters suggestions from :
        # https://github.com/araffin/rl-baselines-zoo/blob/master/trained_agents/td3/HopperBulletEnv-v0/config.yml
        n_actions = env.action_space.shape[-1]
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=float(args['noise_std']) * np.ones(n_actions))
        class CustomPolicy2(ffp_td3):
            def __init__(self, *args, **kwargs):
                super(CustomPolicy2, self).__init__(*args, **kwargs,
                                                   feature_extraction="mlp", layers=[400, 300])
        model = TD3(CustomPolicy2, env,
                    verbose=1,
                    tensorboard_log='data/TBlogs/initial_policy_training',
                    batch_size=args['batch_size'],
                    buffer_size=args['buffer_size'],
                    gamma=args['gamma'],
                    gradient_steps=args['gradient_steps'],
                    learning_rate=args['learning_rate'],
                    learning_starts=args['learning_starts'],
                    action_noise=action_noise,
                    train_freq=args['train_freq'],
                    seed=SEED,
                    )

    elif algo.__name__ == "TRPO":
        print('Initializing TRPO with RLBaselinesZoo hyperparameters .. ')
        # hyperparameters suggestions from :
        # https://github.com/araffin/rl-baselines-zoo/blob/master/trained_agents/sac/HopperBulletEnv-v0/config.yml
        model = TRPO(mlp_standard, env,
                    verbose=1,
                    tensorboard_log='data/TBlogs/initial_policy_training',
                    timesteps_per_batch=args['timesteps_per_batch'],
                    lam=args['lam'],
                    max_kl=args['max_kl'],
                    gamma=args['gamma'],
                    vf_iters=args['vf_iters'],
                    vf_stepsize=args['vf_stepsize'],
                    entcoeff=args['entcoeff'],
                    cg_damping=args['cg_damping'],
                    cg_iters=args['cg_iters'],
                     seed=SEED,
                    )

    elif algo.__name__ == "ACKTR":
        print('Initializing ACKTR')
        model = ACKTR(mlp_standard,
                      env,
                      verbose=1,
                      n_steps=128,
                      ent_coef=0.01,
                      lr_schedule='constant',
                      learning_rate=0.0217,
                      max_grad_norm=0.5,
                      gamma=0.99,
                      vf_coef=0.946,
                      seed=SEED)

    elif algo.__name__ == "PPO2":
        print('Initializing PPO2')
        print('Num envs : ', env.num_envs)
        model = PPO2(mlp_standard,
                     env,
                     n_steps=int(args['n_steps']/env.num_envs),
                     nminibatches=args['nminibatches'],
                     lam=args['lam'],
                     gamma=args['gamma'],
                     ent_coef=args['ent_coef'],
                     noptepochs=args['noptepochs'],
                     learning_rate=args['learning_rate'],
                     cliprange=args['cliprange'],
                     verbose=1,
                     tensorboard_log='data/TBlogs/initial_policy_training',
                     seed=SEED,
                     )

    else:
        print('No algorithm matched. Using SAC .. ')
        # NOTE: CustomPolicy is only defined inside the SAC branch above, so this fallback
        # assumes that class is already in scope
        model = SAC(CustomPolicy, env,
                    verbose=1,
                    batch_size=args['batch_size'],
                    buffer_size=args['buffer_size'],
                    ent_coef=args['ent_coef'],
                    learning_starts=args['learning_starts'],
                    learning_rate=args['learning_rate'],
                    train_freq=args['train_freq'],
                    seed=SEED,
                    )

    # change model name if using normalization
    if NORMALIZE:
        model_name = model_name.replace('.pkl', 'normalized_.pkl')

    elif MUJOCO_NORMALIZE:
        model_name = model_name.replace('.pkl', 'mujoco_norm_.pkl')

    if SAVE_BEST_FOR_20:
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name,
                    log_interval=10,
                    callback=eval_callback)
        save_the_model()
        model_name = model_name.replace('best_', '')
        model.save(model_name)
    elif SAVE_INTERMEDIATE:
        check_callback = CheckpointCallback(save_freq=SAVE_FREQ,
                                            save_path=model_name[:-4],
                                            name_prefix=ENV_NAME + '_' + str(SEED),
                                            verbose=1,
                                            )
        eval_env = DummyVecEnv([lambda: gym.make(ENV_NAME)])
        eval_env.seed(SEED)
        eval_callback = EvalCallback(eval_env,
                                     n_eval_episodes=10,
                                     eval_freq=SAVE_FREQ,
                                     log_path=model_name[:-4],
                                     deterministic=False,
                                     render=False,
                                     verbose=1)

        callbacks = CallbackList([check_callback, eval_callback])
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name.split('/')[-1],
                    log_interval=10,
                    callback=callbacks)
        model.save(model_name)
        npzfile = np.load(model_name[:-4] + '/evaluations.npz')
        average_rewards = np.mean(npzfile['results'], axis=1)[:, 0]
        with open(model_name[:-4] + "/eval_results.txt", "a") as f:
            for i in range(np.shape(average_rewards)[0]):
                f.write("{}, {}\n".format(npzfile['timesteps'][i], average_rewards[i]))
        evaluate_policy_on_env(env, model, render=False, iters=50)
    else:
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name.split('/')[-1],
                    log_interval=10,)
        model.save(model_name)
        evaluate_policy_on_env(env, model, render=False, iters=50)

    # save the environment params
    if NORMALIZE:
        # env.save(model_name.replace('.pkl', 'stats_.pkl'))
        env.save('data/models/env_stats/'+env_name+'.pkl')

    print('done :: ', model_name)
    exit()
Example n. 29
arg.NUM_SAMPLES = 2
arg.NUM_EP = 200
arg.NUM_IT = 2  # number of iteration for gradient descent
arg.NUM_thetas = 1
arg.ADAM_LR = 0.1
arg.LR_STEP = 2
arg.LR_STOP = 0.003
arg.lr_gamma = 0.95
arg.PI_STD = 1
arg.goal_radius_range = [0.1, 0.3]
arg.TERMINAL_VEL = 0.025
number_updates = 100

# agent convert to torch model
import policy_torch
baselines_mlp_model = TD3.load(
    'trained_agent//acc_retrain_1000000_2_18_21_4.zip')
agent = policy_torch.copy_mlp_weights(baselines_mlp_model,
                                      layers=[128, 128],
                                      n_inputs=30)

# loading environment, same as training
env = firefly_acc.FireflyAcc(arg)
# --- setting the env for inverse ----
# TODO, move it to a function of env
env.agent_knows_phi = False

for i in range(10):
    filename = ("test_acc_EP" + str(arg.NUM_EP) + "updates" +
                str(number_updates) + "lr" + str(arg.ADAM_LR) + 'step' +
                str(arg.LR_STEP) + str(time.localtime().tm_mday) + '_' +
                str(time.localtime().tm_hour) + '_' +
Example n. 30

class CustomTD3Policy(FeedForwardPolicy):
    def __init__(self, *args, **kwargs):
        super(CustomTD3Policy, self).__init__(*args,
                                              **kwargs,
                                              layers=[400, 400],
                                              layer_norm=True,
                                              feature_extraction="mlp")


model = TD3(CustomTD3Policy,
            env,
            verbose=1,
            action_noise=action_noise,
            learning_rate=0.001,
            gamma=0.99,
            buffer_size=1000000,
            batch_size=100,
            train_freq=1000,
            tensorboard_log="./gait2d_td3_tensorboard/")

if args.train:
    model.learn(total_timesteps=args.steps, callback=eval_callback)
    model.save(args.model)
else:
    model = TD3.load(args.model, env=env)
    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        if args.visualize: