Example #1
    def train_TD3(self, model_name, model_params=config.TD3_PARAMS):
        """TD3 model"""
        from stable_baselines import TD3
        from stable_baselines.common.noise import NormalActionNoise

        env_train = self.env

        n_actions = env_train.action_space.shape[-1]
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=0.1 * np.ones(n_actions))

        start = time.time()
        model = TD3('MlpPolicy',
                    env_train,
                    batch_size=model_params['batch_size'],
                    buffer_size=model_params['buffer_size'],
                    learning_rate=model_params['learning_rate'],
                    action_noise=action_noise,
                    verbose=model_params['verbose'])
        model.learn(total_timesteps=model_params['timesteps'])
        end = time.time()

        model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
        print('Training time (TD3): ', (end - start) / 60, ' minutes')
        return model
    def __call__(self):

        policy_kwargs = dict(layers=[400, 300, 200, 100])
        n_actions = self.env.action_space.shape[-1]
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=0.1 * np.ones(n_actions))

        # check_env(self.env)
        model = TD3(MlpPolicy,
                    self.env,
                    policy_kwargs=policy_kwargs,
                    action_noise=action_noise,
                    buffer_size=50000,
                    tensorboard_log=
                    "/home/dfki.uni-bremen.de/mpatil/Documents/baselines_log",
                    verbose=1)

        time_steps = 3e4
        model.learn(total_timesteps=int(time_steps),
                    log_interval=50,
                    tb_log_name="td3_Docker_" + self.expt_name)
        model.save(
            "/home/dfki.uni-bremen.de/mpatil/Documents/td3_stable_baselines_" +
            self.expt_name)

        print("Closing environment")
        self.env.close()
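
The two snippets above follow the pattern used throughout these examples: build a NormalActionNoise sized to the action space, hand it to TD3, train, and save. A minimal self-contained sketch of that pattern, assuming stable-baselines 2.x and gym's Pendulum-v0 (the saved file name is hypothetical):

import gym
import numpy as np

from stable_baselines import TD3
from stable_baselines.common.noise import NormalActionNoise

env = gym.make('Pendulum-v0')

# Zero-mean Gaussian noise on every action dimension, sigma = 0.1
n_actions = env.action_space.shape[-1]
action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                 sigma=0.1 * np.ones(n_actions))

model = TD3('MlpPolicy', env, action_noise=action_noise, verbose=1)
model.learn(total_timesteps=10000)
model.save('td3_pendulum_sketch')  # hypothetical file name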
Example #3
def train_TD3(env_train,
              model_name,
              model=None,
              timesteps=30000,
              save_path=None):
    """TD3 model"""
    # add the noise objects for TD3
    n_actions = env_train.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))

    start = time.time()

    if model is None:
        model = TD3('MlpPolicy', env_train, action_noise=action_noise)
    else:
        model.set_env(env_train)
        model.verbose = config.VERBOSE

    model.learn(total_timesteps=timesteps)
    end = time.time()

    if save_path is None:
        save_path = f"{config.TRAINED_MODEL_DIR}/{model_name}"
    model.save(save_path)
    print('Training time (TD3): ', (end - start) / 60, ' minutes')
    return model
def sample_td3_params(trial):
    """
    Sampler for TD3 hyperparams.

    :param trial: (optuna.trial)
    :return: (dict)
    """
    gamma = trial.suggest_categorical('gamma', [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    learning_rate = trial.suggest_loguniform('lr', 1e-5, 1)
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64, 100, 128, 256, 512])
    buffer_size = trial.suggest_categorical('buffer_size', [int(1e4), int(1e5), int(1e6)])
    train_freq = trial.suggest_categorical('train_freq', [1, 10, 100, 1000, 2000])
    gradient_steps = train_freq
    noise_type = trial.suggest_categorical('noise_type', ['ornstein-uhlenbeck', 'normal'])
    noise_std = trial.suggest_uniform('noise_std', 0, 1)

    hyperparams = {
        'gamma': gamma,
        'learning_rate': learning_rate,
        'batch_size': batch_size,
        'buffer_size': buffer_size,
        'train_freq': train_freq,
        'gradient_steps': gradient_steps,
    }

    if noise_type == 'normal':
        hyperparams['action_noise'] = NormalActionNoise(mean=np.zeros(trial.n_actions),
                                                        sigma=noise_std * np.ones(trial.n_actions))
    elif noise_type == 'ornstein-uhlenbeck':
        hyperparams['action_noise'] = OrnsteinUhlenbeckActionNoise(mean=np.zeros(trial.n_actions),
                                                                   sigma=noise_std * np.ones(trial.n_actions))

    return hyperparams
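
Note that sample_td3_params reads trial.n_actions, which is not a standard Optuna attribute; the calling objective is expected to attach it before sampling. A hedged sketch of how such a sampler could be wired into an Optuna study (the Pendulum-v0 environment, short budgets, and the evaluation loop are all assumptions):

import gym
import numpy as np
import optuna

from stable_baselines import TD3

def objective(trial):
    env = gym.make('Pendulum-v0')
    trial.n_actions = env.action_space.shape[-1]  # consumed by sample_td3_params
    hyperparams = sample_td3_params(trial)
    model = TD3('MlpPolicy', env, verbose=0, **hyperparams)
    model.learn(total_timesteps=5000)
    # Crude evaluation: mean return over a few deterministic episodes
    returns = []
    for _ in range(3):
        obs, done, total = env.reset(), False, 0.0
        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, done, _ = env.step(action)
            total += reward
        returns.append(total)
    return float(np.mean(returns))

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)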
Example #5
	def _on_step(self) -> bool:
		"""
		This method will be called by the model after each call to `env.step()`.

		For child callback (of an `EventCallback`), this will be called
		when the event is triggered.

		:return: (bool) If the callback returns False, training is aborted early.
		"""
		global SIGMA, LEARNING_RATE
		if (self.num_timesteps % 1000) ==0:
			# import pdb; pdb.set_trace()
			t = time.time()
			time_elapsed = t-self.startTime #seconds
			self.model.save("td3_model_int_test")
			SIGMA = SIGMA*.9
			# LEARNING_RATE = LEARNING_RATE*.9
			print("---------" + str(self.num_timesteps) +" steps complete | SIGMA = " + str(SIGMA) + " | Learning Rate: " + str(LEARNING_RATE) + "|----------")
			print("---------------Time Elapsed: " + str(time_elapsed) + " seconds")
			f = open(os.path.join(dirName, "learn.txt"), "a")
			f.write("---------" + str(self.num_timesteps) +" steps complete | SIGMA = " + str(SIGMA) + " | Learning Rate: " + str(LEARNING_RATE) + "|----------\n")
			f.write("--------- Time Elapsed: " + str(time_elapsed) + " seconds -----------\n")
			f.close()
			
			self.model.action_noise = NormalActionNoise(0,SIGMA) #annealed noise
			# self.model.learning_rate =  LEARNING_RATE
			# td3_noise = OrnsteinUhlenbeckActionNoise(np.zeros(a_dim), sigma*np.ones(a_dim)) 
			

		print("\t--Step Done      --\t|")
		if yPos_global > 200:
			input("Please reset the robot to start and press enter key to continue..")

		return True
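
In stable-baselines 2.10+ a hook like the _on_step above normally lives in a BaseCallback subclass that is passed to model.learn. A sketch of that wiring, annealing the action noise the same way (the class name, checkpoint path, and sigma schedule are assumptions):

import time

from stable_baselines import TD3
from stable_baselines.common.callbacks import BaseCallback
from stable_baselines.common.noise import NormalActionNoise

class NoiseAnnealCallback(BaseCallback):
    """Every 1000 steps: checkpoint the model and shrink the exploration noise."""

    def __init__(self, sigma=0.3, verbose=0):
        super(NoiseAnnealCallback, self).__init__(verbose)
        self.sigma = sigma
        self.start_time = time.time()

    def _on_step(self):
        if self.num_timesteps % 1000 == 0:
            self.model.save('td3_model_checkpoint')   # hypothetical path
            self.sigma *= 0.9                         # anneal the noise
            self.model.action_noise = NormalActionNoise(0, self.sigma)
            if self.verbose:
                print(self.num_timesteps, 'steps, sigma =', self.sigma,
                      ', elapsed =', time.time() - self.start_time, 's')
        return True

model = TD3('MlpPolicy', 'Pendulum-v0',
            action_noise=NormalActionNoise(0, 0.3), verbose=1)
model.learn(total_timesteps=20000, callback=NoiseAnnealCallback(sigma=0.3))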
Example #6
def sample_ddpg_params(trial):
    """
    Sampler for DDPG hyperparams.
    :param trial: (optuna.trial)
    :return: (dict)
    """
    gamma = trial.suggest_categorical('gamma', [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    # actor_lr = trial.suggest_loguniform('actor_lr', 1e-5, 1)
    # critic_lr = trial.suggest_loguniform('critic_lr', 1e-5, 1)
    learning_rate = trial.suggest_loguniform('lr', 1e-5, 1)
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64, 100, 128, 256, 512])
    buffer_size = trial.suggest_categorical('memory_limit', [int(1e4), int(1e5), int(1e6)])
    noise_type = trial.suggest_categorical('noise_type', ['ornstein-uhlenbeck', 'normal'])
    noise_std = trial.suggest_uniform('noise_std', 0, 1)
    normalize_observations = trial.suggest_categorical('normalize_observations', [True, False])
    normalize_returns = trial.suggest_categorical('normalize_returns', [True, False])

    hyperparams = {
        'gamma': gamma,
        'actor_lr': learning_rate,
        'critic_lr': learning_rate,
        'batch_size': batch_size,
        'memory_limit': buffer_size,
        'normalize_observations': normalize_observations,
        'normalize_returns': normalize_returns
    }


    if noise_type == 'normal':
        hyperparams['action_noise'] = NormalActionNoise(mean=np.zeros(1),
                                                        sigma=noise_std * np.ones(1))
    elif noise_type == 'ornstein-uhlenbeck':
        hyperparams['action_noise'] = OrnsteinUhlenbeckActionNoise(mean=np.zeros(1),
                                                                   sigma=noise_std * np.ones(1))
    return hyperparams
def sample_ddpg_params(trial):
    """
    Sampler for DDPG hyperparams.

    :param trial: (optuna.trial)
    :return: (dict)
    """
    gamma = trial.suggest_categorical(
        'gamma', [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    # actor_lr = trial.suggest_loguniform('actor_lr', 1e-5, 1)
    # critic_lr = trial.suggest_loguniform('critic_lr', 1e-5, 1)
    learning_rate = trial.suggest_loguniform('lr', 1e-5, 1)
    batch_size = trial.suggest_categorical('batch_size',
                                           [16, 32, 64, 128, 256])
    buffer_size = trial.suggest_categorical(
        'memory_limit', [int(1e4), int(1e5), int(1e6)])
    noise_type = trial.suggest_categorical(
        'noise_type', ['ornstein-uhlenbeck', 'normal', 'adaptive-param'])
    noise_std = trial.suggest_uniform('noise_std', 0, 1)
    normalize_observations = trial.suggest_categorical(
        'normalize_observations', [True, False])
    normalize_returns = trial.suggest_categorical('normalize_returns',
                                                  [True, False])

    hyperparams = {
        'gamma': gamma,
        'actor_lr': learning_rate,
        'critic_lr': learning_rate,
        'batch_size': batch_size,
        'memory_limit': buffer_size,
        'normalize_observations': normalize_observations,
        'normalize_returns': normalize_returns
    }

    if noise_type == 'adaptive-param':
        hyperparams['param_noise'] = AdaptiveParamNoiseSpec(
            initial_stddev=noise_std, desired_action_stddev=noise_std)
        # Apply layer normalization when using parameter perturbation
        hyperparams['policy_kwargs'] = dict(layer_norm=True)
    elif noise_type == 'normal':
        hyperparams['action_noise'] = NormalActionNoise(
            mean=np.zeros(trial.n_actions),
            sigma=noise_std * np.ones(trial.n_actions))
    elif noise_type == 'ornstein-uhlenbeck':
        hyperparams['action_noise'] = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(trial.n_actions),
            sigma=noise_std * np.ones(trial.n_actions))
    return hyperparams
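
The two samplers above switch between three exploration-noise flavours. For reference, a minimal sketch of constructing each one (n_actions = 2 is a made-up dimension and the std values are placeholders):

import numpy as np

from stable_baselines.common.noise import (AdaptiveParamNoiseSpec,
                                           NormalActionNoise,
                                           OrnsteinUhlenbeckActionNoise)

n_actions = 2  # hypothetical action dimension

# Uncorrelated Gaussian noise added to each action.
normal_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                 sigma=0.2 * np.ones(n_actions))

# Temporally correlated noise, the classic choice for DDPG.
ou_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                        sigma=0.2 * np.ones(n_actions))

# Parameter-space noise: perturbs the policy weights instead of the actions
# (DDPG only; usually combined with policy_kwargs=dict(layer_norm=True)).
param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.2,
                                     desired_action_stddev=0.2)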
Example #8
def start_unity_baselines():
    # Set to FALSE for CIP-Pool execution
    # env = make_unity_env('./envs/worm_dynamic_one_agent/linux/worm_dynamic', 1, False)
    # InitialTrainingExample.start_training(env)
    # env.close()

    unity_env = UnityEnvironment(
        './envs/worm_dynamic_one_agent/linux/worm_dynamic', no_graphics=True)
    env = UnityToGymWrapper(unity_env, uint8_visual=False)
    env = Monitor(env, 'results/')
    # The noise objects for TD3
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))

    model = TD3_Baselines(MlpPolicy, env, action_noise=action_noise, verbose=1)
    model.learn(total_timesteps=int(2e6), log_interval=10)
    model.save("td3_worm")
def test_deterministic_td3():
    results = [[], []]
    rewards = [[], []]
    kwargs = {'n_cpu_tf_sess': 1}
    env_id = 'Pendulum-v0'
    kwargs.update({'action_noise': NormalActionNoise(0.0, 0.1)})

    for i in range(2):
        model = TD3('MlpPolicy', env_id, seed=SEED, **kwargs)
        model.learn(N_STEPS_TRAINING)
        env = model.get_env()
        obs = env.reset()
        for _ in range(20):
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, _, _ = env.step(action)
            results[i].append(action)
            rewards[i].append(reward)
    # without the extended tolerance, test fails for unknown reasons on Github...
    assert np.allclose(results[0], results[1], rtol=1e-2), results
    assert np.allclose(rewards[0], rewards[1], rtol=1e-2), rewards
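
Beyond the reproducibility check above, a trained model is usually scored with evaluate_policy. A short sketch (the hyperparameters and training budget are placeholders):

from stable_baselines import TD3
from stable_baselines.common.evaluation import evaluate_policy
from stable_baselines.common.noise import NormalActionNoise

model = TD3('MlpPolicy', 'Pendulum-v0', seed=0,
            action_noise=NormalActionNoise(0.0, 0.1), verbose=0)
model.learn(total_timesteps=5000)

mean_reward, std_reward = evaluate_policy(model, model.get_env(),
                                          n_eval_episodes=10,
                                          deterministic=True)
print('mean reward: {:.2f} +/- {:.2f}'.format(mean_reward, std_reward))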
Example #10
def optimize_agent(trial):
    """ Train the model and optimise
        Optuna maximises the negative log likelihood, so we
        need to negate the reward here
    """
    model_params = optimize_TD3(trial)
    env = SubprocVecEnv([
        lambda: NormalizeActionWrapper(LearningRocket(visualize=False))
        for i in range(n_cpu)
    ])

    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))

    model = TD3(MlpPolicy,
                env,
                action_noise=action_noise,
                policy_kwargs=dict(layers=[400, 300]))
    model.learn(50000)

    rewards = []
    n_episodes, reward_sum = 0, 0.0

    obs = env.reset()
    step = 0
    while n_episodes < 4:
        step += 1
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        reward_sum += reward
        if done:
            rewards.append(reward_sum)
            reward_sum = 0.0
            n_episodes += 1
            obs = env.reset()

    last_reward = np.mean(rewards)
    trial.report(-1 * last_reward, step)

    return -1 * last_reward
def DDPGgive_results(files, balance, shares=None):
    env = create_stock_env(files, train=False, balance=balance, shares=shares)
    max_steps = env.max_steps - env.num_prev
    env = DummyVecEnv([lambda: env])
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(0, 2)
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=1,
                                         desired_action_stddev=0.1,
                                         adoption_coefficient=1.01)
    model = DDPG(CustomDDPGPolicy,
                 env,
                 verbose=0,
                 param_noise=param_noise,
                 action_noise=action_noise)

    # model = DDPG.load("/home/harshit/Documents/itsp-trade agent/Reinforcement-Learning-Stock-Trader/WebPortal/StockApp/Stock_stable.zip",env=env)
    model.learn(total_timesteps=100)
    profit = 0
    profitst = np.zeros((max_steps - 1, 2))
    actionst = np.zeros((n_actions // 2, max_steps - 1, 2))
    shares = np.zeros((len(files), max_steps - 1, 2))
    obs = env.reset()
    for i in range(max_steps):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        actionst[:, i, 1] = -info[0]['action'][0][0:n_actions // 2] + info[0][
            'action'][0][n_actions // 2:]
        actionst[:, i, 0] = i
        shares[:, i, 1] = info[0]['shares_held']
        shares[:, i, 0] = i
        #         print('a',action)
        profit += rewards
        profitst[i] = [i, profit]
        if dones:
            break
    print(info[0]['action'][0])
    print(actionst)
    return profitst.tolist(), shares.tolist(), actionst.tolist()
def test_deterministic_training_common(algo):
    results = [[], []]
    rewards = [[], []]
    kwargs = {'n_cpu_tf_sess': 1}
    if algo in [DDPG, TD3, SAC]:
        env_id = 'Pendulum-v0'
        kwargs.update({'action_noise': NormalActionNoise(0.0, 0.1)})
    else:
        env_id = 'CartPole-v1'
        if algo == DQN:
            kwargs.update({'learning_starts': 100})

    for i in range(2):
        model = algo('MlpPolicy', env_id, seed=SEED, **kwargs)
        model.learn(N_STEPS_TRAINING)
        env = model.get_env()
        obs = env.reset()
        for _ in range(100):
            action, _ = model.predict(obs, deterministic=False)
            obs, reward, _, _ = env.step(action)
            results[i].append(action)
            rewards[i].append(reward)
    assert sum(results[0]) == sum(results[1]), results
    assert sum(rewards[0]) == sum(rewards[1]), rewards
Example #13
from stable_baselines.td3.policies import MlpPolicy
from stable_baselines import TD3
from TD3_test import TD3_ff
from FireflyEnv import firefly_acc
from Config import Config
arg = Config()
import numpy as np
from numpy import pi
import time
from stable_baselines.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
from reward_functions import reward_singleff

action_noise = NormalActionNoise(mean=np.zeros(2),
                                 sigma=float(0.1) * np.ones(2))

arg.goal_radius_range = [0.15, 0.3]
arg.std_range = [0.02, 0.2, 0.02, 0.2]
arg.TERMINAL_VEL = 0.025  # terminal velocity? # norm(action) that you believe as a signal to stop 0.1.
arg.DELTA_T = 0.2
arg.EPISODE_LEN = 35
env = firefly_acc.FireflyAcc(arg)

modelname = None
# modelname='trained_agent/'+'TD_acc_control_retrain_1000000_1_5_6_57'

if modelname is None:  # new train
    model = TD3_ff(
        MlpPolicy,
        env,
        verbose=1,
        tensorboard_log="./Tensorboard/",
Example #14
    if args.save_freq > 0:
        callbacks.append(CheckpointCallback(save_freq=args.save_freq, save_path=save_path,
                                            name_prefix='rl_model'))

    algo = {
        'sac': SAC,
        'td3': TD3
    }[args.algo]

    n_actions = env.action_space.shape[0]

    # Tuned hyperparameters from https://github.com/araffin/rl-baselines-zoo
    hyperparams = {
        'sac': dict(batch_size=256, gamma=0.98, policy_kwargs=dict(layers=[256, 256]),
                    learning_starts=10000, buffer_size=int(2e5), tau=0.01),

        'td3': dict(batch_size=100, policy_kwargs=dict(layers=[400, 300]),
                    learning_rate=1e-3, learning_starts=10000, buffer_size=int(1e6),
                    train_freq=1000, gradient_steps=1000,
                    action_noise=NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions)))
    }[args.algo]

    model = algo('MlpPolicy', env, verbose=1, **hyperparams)
    try:
        model.learn(n_timesteps, callback=callbacks)
    except KeyboardInterrupt:
        pass

    print("Saving to {}.zip".format(save_path))
    model.save(save_path)
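
Once the run above has finished (or been interrupted), the checkpoint can be reloaded with the matching algorithm class. A sketch that reuses args.algo, env, and save_path from the snippet above:

from stable_baselines import SAC, TD3

loaded = {'sac': SAC, 'td3': TD3}[args.algo].load(save_path, env=env)

obs = env.reset()
for _ in range(1000):
    action, _ = loaded.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    if done:
        obs = env.reset()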
Example #15
def run_process(study_name, alg_param, env_param, log_path='.'):
    study_path = os.path.join(log_path, study_name)
    make_sure_path_exists(study_path)
    trial_path, trial_id = generate_trial_path(study_path)
    make_sure_path_exists(trial_path)

    with open(trial_path + '/alg_param.pkl', "wb+") as outfile:
        pickle.dump(alg_param, outfile)

    with open(trial_path + '/env_param.pkl', "wb+") as outfile:
        pickle.dump(env_param, outfile)

    num_nodes = alg_param['num_nodes']
    num_layers = alg_param['num_layers']
    learning_rate = alg_param['learning_rate']
    alg = alg_param['alg']
    nenv = alg_param['nenv']
    env = build_env(trial_path, env_param, nenv=nenv)

    if alg == 'dqn':
        from stable_baselines.deepq.policies import MlpPolicy
        from stable_baselines import DQN
        call_iter = 1000
        policy_kwargs = dict(layers=[num_nodes for _ in range(num_layers)])
        model = DQN(MlpPolicy,
                    env,
                    verbose=1,
                    policy_kwargs=policy_kwargs,
                    tensorboard_log=trial_path)
    #DDPG calls back every step of every rollout
    elif alg == 'ddpg':
        from stable_baselines.ddpg.policies import MlpPolicy
        from stable_baselines.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise, AdaptiveParamNoiseSpec
        from stable_baselines import DDPG
        call_iter = 1000
        n_actions = env.action_space.shape[-1]
        param_noise = None
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                    sigma=float(0.5) *
                                                    np.ones(n_actions))
        policy_kwargs = dict(layers=[num_nodes for _ in range(num_layers)])
        model = DDPG(MlpPolicy,
                     env,
                     verbose=1,
                     param_noise=param_noise,
                     action_noise=action_noise,
                     policy_kwargs=policy_kwargs,
                     tensorboard_log=trial_path)

    elif alg == 'td3':
        from stable_baselines import TD3
        from stable_baselines.td3.policies import MlpPolicy
        from stable_baselines.common.vec_env import DummyVecEnv
        from stable_baselines.ddpg.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
        call_iter = 1000
        n_actions = env.action_space.shape[-1]
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=0.1 * np.ones(n_actions))
        policy_kwargs = dict(layers=[num_nodes for _ in range(num_layers)])
        model = TD3(MlpPolicy,
                    env,
                    verbose=1,
                    action_noise=action_noise,
                    learning_rate=learning_rate,
                    policy_kwargs=policy_kwargs,
                    tensorboard_log=trial_path)

    #PPO1 calls back only after every rollout
    elif alg == 'ppo2':
        from stable_baselines.common.policies import MlpPolicy
        from stable_baselines import PPO2
        call_iter = 100
        policy_kwargs = dict(net_arch=[num_nodes for _ in range(num_layers)])
        model = PPO2(MlpPolicy,
                     env,
                     policy_kwargs=policy_kwargs,
                     verbose=1,
                     learning_rate=learning_rate,
                     tensorboard_log=trial_path,
                     n_steps=alg_param['n_steps'],
                     noptepochs=alg_param['noptepochs'],
                     nminibatches=alg_param['nminibatches'],
                     gamma=alg_param['gamma'],
                     ent_coef=alg_param['ent_coef'],
                     cliprange=alg_param['cliprange'],
                     lam=alg_param['lam'])

    best_mean_reward, n_steps = -np.inf, 0

    #callback frequency differs among algorithms
    def callback(_locals, _globals):
        from stable_baselines.results_plotter import load_results, ts2xy
        nonlocal n_steps, best_mean_reward, call_iter
        # Print stats every call_iter calls
        if (n_steps + 1) % call_iter == 0:
            # Evaluate policy training performance
            x, y = ts2xy(load_results(trial_path), 'timesteps')
            if len(x) > 0:
                mean_reward = np.mean(y[-200:])
                print(x[-1], 'timesteps')
                print(
                    "Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}"
                    .format(best_mean_reward, mean_reward))
                # New best model, you could save the agent here
                if mean_reward > best_mean_reward:
                    best_mean_reward = mean_reward
                    # Example for saving best model
                    print("Saving new best model")
                    _locals['self'].save(trial_path + '/best_model.pkl')
        n_steps += 1
        return True

    # model= DDPG.load('log/A00/best_model.pkl')
    # model.set_env(env)
    print(f"Starting to train {trial_id}")
    model.learn(total_timesteps=int(1e6),
                tb_log_name='tb_log',
                callback=callback)

    model.save(trial_path + '/fully_trained_model')
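
A hypothetical invocation of run_process for the TD3 branch; the keys build_env expects in env_param are not shown above, so the dictionaries here are placeholders:

alg_param = dict(alg='td3',
                 num_nodes=64,
                 num_layers=2,
                 learning_rate=1e-3,
                 nenv=1)
env_param = dict()  # whatever build_env expects

run_process('td3_study', alg_param, env_param, log_path='./log')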
Example #16
	# run subscriber nodes
	RL_subscribers()
	print("Starting...")

	time.sleep(3) #give ros time to set up

	# init environment
	env = soft_learner()
	
	print('done')

	

	a_dim = env.action_space.shape[0]
	# td3_noise = OrnsteinUhlenbeckActionNoise(np.zeros(a_dim), .9*np.ones(a_dim)) 
	td3_noise = NormalActionNoise(0,SIGMA)
	td3_env = DummyVecEnv([lambda: env])
	# td3_env = env
	
	checkpoint_on_event = CheckpointCallback(save_freq=1000, save_path= "./logs/model_checkpoints",
                                         name_prefix='rl_model')
	event_callback = EveryNTimesteps(n_steps=500, callback=checkpoint_on_event)

	eval_callback = EvalCallback(td3_env, best_model_save_path='./logs/',
                             log_path='./logs/', eval_freq=100,
                             deterministic=True, render=False)

	

	# td3_model.learning_starts = 100
	
Example #17
SIM_NUMBER = 999 
##Training policies
#CustomPolicy_3 
#CustomPolicy_2  Standard mlp stable baselines policy with modified layer-size
#CustomPolicy_4  Modified initialization of layers and layer-size
policy = CustomPolicy_4




env.episode_duration = NSTEPS
## the noise objects for DDPG
n_actions = env.action_space.shape[-1]
param_noise = None
action_noise =  NormalActionNoise(0,range_esp)
#
# model = DDPG(policy, env, verbose=1,nb_train_steps=NSTEPS, nb_rollout_steps=NSTEPS,nb_eval_steps=NSTEPS,gamma=DECAY_RATE, param_noise=None, action_noise=action_noise,batch_size=BATCH_SIZE,actor_lr=POLICY_LEARNING_RATE,
#                critic_lr =  QVALUE_LEARNING_RATE,buffer_size=REPLAY_SIZE,tau= UPDATE_RATE)
model = DDPG(policy, env, verbose=1,nb_train_steps=1, nb_rollout_steps=1,nb_eval_steps=0,gamma=DECAY_RATE, param_noise=None, action_noise=action_noise,batch_size=BATCH_SIZE,actor_lr=POLICY_LEARNING_RATE,
               critic_lr =  QVALUE_LEARNING_RATE,buffer_size=REPLAY_SIZE,tau= UPDATE_RATE)

# mean_reward, std_reward = evaluate_policy(model, env_eval, n_eval_episodes=10)
# print(f'Mean reward: {mean_reward} +/- {std_reward:.2f}')

start_time = time.time()
model.learn(total_timesteps=NSTEPS*NEPISODES)
end_time=time.time()
elapsed_time = end_time-start_time
model.save("ddpg_pendulum_stb_baselines_"+str(SIM_NUMBER))
print('elapsed '+str(elapsed_time)+'s')
Example #18
    def __init__(self, algorithm="SAC", load=True, agent_name="Agent001"):
        self.agent_name = agent_name

        #self.env = LearningRocket(visualize=False)
        #self.env = NormalizeActionWrapper(self.env)

        #self.eval_env = LearningRocket(visualize=True)
        #self.eval_env = NormalizeActionWrapper(self.eval_env)

        #self.env = SubprocVecEnv([lambda: LearningRocket(visualize=False) for i in range(4)])
        self.env = make_vec_env(
            LearningRocket, n_envs=16
        )  #[lambda: LearningRocket(visualize=False) for i in range(16)]))
        #self.eval_env = VecNormalize(DummyVecEnv([lambda: LearningRocket(visualize=True) for i in range(1)]))
        self.eval_env = make_vec_env(lambda: LearningRocket(visualize=True),
                                     n_envs=1)
        #self.eval_env = VecNormalize(self.eval_env)
        self.eval_callback = EvalCallback(self.eval_env,
                                          best_model_save_path='Agent007',
                                          log_path='./logs/',
                                          eval_freq=10000,
                                          deterministic=True,
                                          render=False,
                                          n_eval_episodes=1)
        kai_policy = dict(act_fun=tf.nn.tanh, net_arch=[400, 300])
        #check_env(self.env, warn=True)
        """
        if algorithm == "SAC":
            if load is True:
                self.model = SAC.load(agent_name, env=self.env, tensorboard_log="./rocket_tensorboard/")
                #self.model.ent_coef=0.2
            else:
                self.model = SAC('MlpPolicy', self.env, verbose=1, tensorboard_log="./rocket_tensorboard/",ent_coef=5)
            print("Trainer Set for SAC")
        """
        if algorithm == "TD3":
            n_actions = self.env.action_space.shape[-1]
            action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                             sigma=0.1 * np.ones(n_actions))
            if load is True:
                self.model = TD3.load(agent_name,
                                      env=self.env,
                                      tensorboard_log="./rocket_tensorboard/")
                #file = open('replay_buffer', 'rb')
                #self.model.replay_buffer = pickle.load(file)
                #file.close()
            else:
                self.model = TD3(MlpPolicy,
                                 self.env,
                                 action_noise=action_noise,
                                 batch_size=768,
                                 gamma=0.95,
                                 learning_rate=1e-4,
                                 learning_starts=20000,
                                 verbose=1,
                                 tensorboard_log="./rocket_tensorboard/",
                                 policy_kwargs=dict(layers=[400, 300]))
            print("Trainer Set for TD3")
        elif algorithm == "PPO2":
            if load is True:
                self.model = PPO2.load(agent_name,
                                       env=self.env,
                                       tensorboard_log="./rocket_tensorboard/")
                self.eval_env = VecNormalize.load(self.agent_name + "vEnv",
                                                  self.eval_env)
                #self.eval_env.clip_obs = 500
                #self.env = VecNormalize(self.env)
                self.env = VecNormalize.load(self.agent_name + "vEnv",
                                             self.env)
                #self.env.clip_obs = 500
                #self.env.norm_obs = False
                #self.eval_env.norm_obs = False
            else:
                self.model = PPO2(PPOMlpPolicy,
                                  self.env,
                                  n_steps=1024,
                                  nminibatches=32,
                                  lam=0.98,
                                  gamma=0.999,
                                  noptepochs=4,
                                  ent_coef=0.01,
                                  verbose=1,
                                  tensorboard_log="./rocket_tensorboard/",
                                  policy_kwargs=dict(layers=[400, 300]))
                self.eval_env = VecNormalize(self.eval_env)
                self.env = VecNormalize(self.env)
                #self.eval_env.clip_obs = 500
                #self.env.clip_obs = 500
                #self.env.norm_obs=False
                #self.eval_env.norm_obs=False

                print("Trainer set for PPO2. I am speed.")
Example #19
        n_actions = env.action_space.shape[0]
        if 'adaptive-param' in noise_type:
            assert algo_ == 'ddpg', 'Parameter is not supported by SAC'
            hyperparams['param_noise'] = AdaptiveParamNoiseSpec(
                initial_stddev=noise_std, desired_action_stddev=noise_std)
        elif 'normal' in noise_type:
            if 'lin' in noise_type:
                hyperparams['action_noise'] = LinearNormalActionNoise(
                    mean=np.zeros(n_actions),
                    sigma=noise_std * np.ones(n_actions),
                    final_sigma=hyperparams.get('noise_std_final', 0.0) *
                    np.ones(n_actions),
                    max_steps=n_timesteps)
            else:
                hyperparams['action_noise'] = NormalActionNoise(
                    mean=np.zeros(n_actions),
                    sigma=noise_std * np.ones(n_actions))
        elif 'ornstein-uhlenbeck' in noise_type:
            hyperparams['action_noise'] = OrnsteinUhlenbeckActionNoise(
                mean=np.zeros(n_actions), sigma=noise_std * np.ones(n_actions))
        else:
            raise RuntimeError('Unknown noise type "{}"'.format(noise_type))
        print("Applying {} noise with std {}".format(noise_type, noise_std))
        del hyperparams['noise_type']
        del hyperparams['noise_std']
        if 'noise_std_final' in hyperparams:
            del hyperparams['noise_std_final']

    if ALGOS[args.algo] is None:
        raise ValueError('{} requires MPI to be installed'.format(args.algo))
Example #20
import pytest
import numpy as np

from stable_baselines import TD3, PPO
from stable_baselines.common.noise import NormalActionNoise

action_noise = NormalActionNoise(np.zeros(1), 0.1 * np.ones(1))


def test_td3():
    model = TD3('MlpPolicy',
                'Pendulum-v0',
                policy_kwargs=dict(net_arch=[64, 64]),
                seed=0,
                learning_starts=100,
                verbose=1,
                create_eval_env=True,
                action_noise=action_noise)
    model.learn(total_timesteps=10000, eval_freq=5000)
    # model.save("test_save")
    # model.load("test_save")
    # os.remove("test_save.zip")


@pytest.mark.parametrize("model_class", [PPO])
@pytest.mark.parametrize("env_id", ['CartPole-v1', 'Pendulum-v0'])
def test_onpolicy(model_class, env_id):
    model = model_class('MlpPolicy',
                        env_id,
                        policy_kwargs=dict(net_arch=[16]),
                        verbose=1,
Example #21
    def _preprocess_hyperparams(self, _hyperparams):
        # Convert to python object if needed
        if "policy_kwargs" in _hyperparams.keys() and isinstance(_hyperparams["policy_kwargs"], str):
            _hyperparams["policy_kwargs"] = eval(_hyperparams["policy_kwargs"])

        n_timesteps = _hyperparams.pop("n_timesteps", None)
        n_envs = _hyperparams.pop("n_envs", None)
        log_every = _hyperparams.pop("log_every", None)
        if not self.continue_learning:
            if not log_every:
                self.logger.debug("log_every not defined in yml file: using command line log_every {}".format(self.log_every))
                log_every = self.log_every
            else:
                self.logger.debug("using log_every as defined in yml file: {}".format(log_every))
        else:
            self.logger.debug("priority to command line log_every {}".format(self.log_every))
            log_every = self.log_every

        # Parse noise string
        if self.algo_name in ["ddpg", "sac", "td3"] and _hyperparams.get("noise_type") is not None:
            noise_type = _hyperparams["noise_type"].strip()
            noise_std = _hyperparams["noise_std"]
            n_actions = get_n_actions(env_name=self.env_name, env_variables=self.env_kwargs)
            self.logger.debug("n_actions: {}".format(n_actions))
            if "adaptive-param" in noise_type:
                assert self.algo_name == "ddpg", "Parameter is not supported by SAC"
                _hyperparams["param_noise"] = AdaptiveParamNoiseSpec(initial_stddev=noise_std, desired_action_stddev=noise_std)
            elif "normal" in noise_type:
                if "lin" in noise_type:
                    _hyperparams["action_noise"] = LinearNormalActionNoise(
                        mean=np.zeros(n_actions),
                        sigma=noise_std * np.ones(n_actions),
                        final_sigma=_hyperparams.get("noise_std_final", 0.0) * np.ones(n_actions),
                        max_steps=n_timesteps,
                    )
                else:
                    _hyperparams["action_noise"] = NormalActionNoise(
                        mean=np.zeros(n_actions), sigma=noise_std * np.ones(n_actions)
                    )
            elif "ornstein-uhlenbeck" in noise_type:
                _hyperparams["action_noise"] = OrnsteinUhlenbeckActionNoise(
                    mean=np.zeros(n_actions), sigma=noise_std * np.ones(n_actions)
                )
            else:
                raise RuntimeError('Unknown noise type "{}"'.format(noise_type))
            self.logger.debug("Applying {} noise with std {}".format(noise_type, noise_std))
            del _hyperparams["noise_type"]
            del _hyperparams["noise_std"]
            if "noise_std_final" in _hyperparams:
                del _hyperparams["noise_std_final"]

        normalize_kwargs = _parse_normalize(dictionary=_hyperparams)

        if n_envs is None:
            self.logger.debug("n_envs not defined in yml file: using command line n_envs {}".format(self.num_envs))
            n_envs = self.num_envs
        else:
            self.logger.debug("using n_envs as num of envs defined in yml file:".format(n_envs))

        if not self.continue_learning:
            # priority to yml defined n_timesteps
            if n_timesteps is None:
                self.logger.debug(
                    "n_timesteps not defined in yml file: using command line n_timesteps {}".format(self.train_total_timesteps)
                )
                n_timesteps = self.train_total_timesteps
            else:
                self.logger.debug("using n_timesteps as total timesteps defined in yml file: {}".format(n_timesteps))
                n_timesteps = int(n_timesteps)
        else:
            if self.train_total_timesteps and self.train_total_timesteps != -1:
                assert self.train_total_timesteps <= int(n_timesteps), "train_total_timesteps <= n_timesteps: {}, {}".format(
                    self.train_total_timesteps, n_timesteps
                )
                # priority to command line n_timesteps
                self.logger.debug("priority to command line n_timesteps {}".format(self.train_total_timesteps))
                n_timesteps = self.train_total_timesteps
            elif self.train_total_timesteps == -1:
                assert n_timesteps, "n_timesteps should have a value: {}".format(n_timesteps)
                n_timesteps = int(n_timesteps)
                self.logger.info("training in continual learning = training from scratch. n_timesteps {}".format(n_timesteps))
            else:
                assert n_timesteps, "n_timesteps should have a value: {}".format(n_timesteps)
                n_timesteps = int(n_timesteps // 2)
                self.logger.debug(
                    "train_total_timesteps not specified in continue_learning: "
                    "taking half of original n_timesteps defined in yml file {}".format(n_timesteps)
                )

        assert n_timesteps % log_every == 0, "it should be possible to divide n_timesteps for log_every: {}, {}".format(
            n_timesteps, log_every
        )
        return normalize_kwargs, n_envs, n_timesteps, log_every, _hyperparams
        act = tf.nn.tanh

    if args.algo == 'TD4_IQN':
        model = TD4('MlpPolicy',
                    env,
                    gamma=0.99,
                    buffer_size=int(1e5),
                    learning_starts=10000,
                    tau=args.tau,
                    policy_delay=args.policy_delay,
                    batch_size=128,
                    learning_rate=1e-3,
                    train_freq=args.train_freq,
                    gradient_steps=args.train_freq,
                    verbose=args.verbose,
                    action_noise=NormalActionNoise(0, sigma=0.1),
                    n_support=args.n_support,
                    risk_factor=args.riskfactor,
                    policy_kwargs=dict(layers=[128, 128], act_fun=act),
                    model_type="IQN",
                    tensorboard_log=args.logdir + env_name,
                    seed=args.seed)
    if args.algo == 'TD4_FQF':
        model = TD4('MlpPolicy',
                    env,
                    gamma=0.99,
                    buffer_size=int(1e5),
                    learning_starts=10000,
                    tau=args.tau,
                    policy_delay=args.policy_delay,
                    batch_size=128,
Example #23
import gym
import numpy as np
import gym_routing

from stable_baselines.ddpg.policies import MlpPolicy
from stable_baselines.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise, AdaptiveParamNoiseSpec
from stable_baselines import DDPG

env = gym.make('zzz-v1')

# the noise objects for DDPG
n_actions = env.action_space.shape[-1]
param_noise = None
#action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))
action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                 sigma=float(0.5) * np.ones(n_actions))

try:
    model = DDPG.load(
        "ddpg_0308",
        env=env,
        tensorboard_log=
        "/home/carla/openai_baselines_update/stable_baseline/log/0308/")
    print("load saved model")
except:
    model = DDPG(
        MlpPolicy,
        env,
        verbose=1,
        param_noise=param_noise,
        action_noise=action_noise,