if not os.path.isdir(tensorboard_folder):
    os.makedirs(tensorboard_folder)
if not os.path.isdir(model_folder):
    os.makedirs(model_folder)

policy = ''
model_tag = ''
if len(sys.argv) > 1:
    policy = sys.argv[1]
    model_tag = '_' + sys.argv[1]

env = DummyVecEnv([lambda: ActionMaskEnv()])
env = VecFrameStack(env, 3)

model = PPO2(get_policy(policy),
             env,
             verbose=0,
             nminibatches=1,
             tensorboard_log=tensorboard_folder)
model.learn(total_timesteps=2500000, tb_log_name='PPO2' + model_tag)

model.save(model_folder + "PPO2" + model_tag)
del model
model = PPO2.load(model_folder + "PPO2" + model_tag)

done = False
states = None
action_masks = []
obs = env.reset()

while not done:
    action, states = model.predict(obs, states, action_mask=action_masks)
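    # (assumed continuation) step the vectorized env with the chosen action;
    # the next action mask would normally be read back from the env here
    obs, reward, done, info = env.step(action)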
Example #2
    def __init__(self,
                 sim_env_name='Hopper-v2',
                 real_env_name='HopperModified-v2',
                 frames=NUM_FRAMES_INPUT,
                 num_cores=NUM_CORES,
                 num_rl_threads=NUM_RL_THREADS,
                 load_policy=None,
                 algo=None):
        self.env_name = sim_env_name
        self.real_env_name = real_env_name
        self.frames = frames
        self.num_cores = num_cores
        self.fwd_norms_x = (0., 1.)
        self.fwd_norms_y = (0., 1.)
        self.inv_norms_x = (0., 1.)
        self.inv_norms_y = (0., 1.)
        self.num_rl_threads = num_rl_threads
        self.real_env = SubprocVecEnv(
            [lambda: gym.make(self.real_env_name) for i in range(self.num_cores)])
        print('MODIFIED ENV BODY_MASS : ',
              gym.make(self.real_env_name).model.body_mass)
        self.sim_env = SubprocVecEnv(
            [lambda: gym.make(self.env_name) for i in range(self.num_cores)])
        print('SIMULATED ENV BODY_MASS : ',
              gym.make(self.env_name).model.body_mass)

        # lists to reuse experience from previous grounding steps
        self.fwd_model_x_list = []
        self.fwd_model_y_list = []
        self.inv_model_x_list = []
        self.inv_model_y_list = []

        # initialize target policy
        if load_policy is None:
            print('LOADING -RANDOM- INITIAL POLICY')
            self.target_policy = PPO2(
                MlpPolicy,
                env=self.sim_env,
                verbose=1,
                tensorboard_log='data/TBlogs/' + self.env_name)
        else:
            print('LOADING -PRETRAINED- INITIAL POLICY')
            # self.target_policy = SAC.load(
            #     load_policy,
            #     env=SubprocVecEnv([lambda: gym.make(self.env_name)]),
            #     tensorboard_log='data/TBlogs/'+self.env_name,
            #     verbose=1,
            #     batch_size=256,
            #     buffer_size=1000000,
            # )
            # TODO: write easy way to switch algorithms
            # self.target_policy = PPO2.load(
            #         load_policy,
            #         env=SubprocVecEnv([lambda: gym.make(self.env_name)]),
            #         tensorboard_log='TBlogs/'+self.env_name,
            #         verbose=1,
            #         n_steps=256,
            #         # buffer_size=1000000,
            #     )

            n_actions = self.sim_env.action_space.shape[-1]
            action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
            self.target_policy = TD3.load(
                load_policy,
                env=DummyVecEnv([lambda: gym.make(self.env_name)]),
                tensorboard_log='data/TBlogs/'+self.env_name,
                verbose=1,
                batch_size=128,
                gamma=0.99,
                learning_rate=0.001,
                action_noise=action_noise,
                buffer_size=1000000,
            )

        # define the Grounded Action Transformer models here
        self._init_gat_models()
        self.grounded_sim_env = None
Example #3
print(test_df.describe())

test_env = DummyVecEnv([
    lambda: BitcoinTradingEnv(test_df,
                              reward_func=reward_strategy,
                              forecast_len=int(params['forecast_len']),
                              confidence_interval=params['confidence_interval']
                              )
])

model_params = {
    'n_steps': int(params['n_steps']),
    'gamma': params['gamma'],
    'learning_rate': params['learning_rate'],
    'ent_coef': params['ent_coef'],
    'cliprange': params['cliprange'],
    'noptepochs': int(params['noptepochs']),
    'lam': params['lam'],
}

model = PPO2.load('./agents/ppo2_' + reward_strategy + '_' + str(curr_idx) +
                  '.pkl',
                  env=test_env)

obs, done = test_env.reset(), False
while not done:
    action, _states = model.predict(obs)
    obs, reward, done, info = test_env.step(action)

    test_env.render(mode="system")
Example #4
def generate_checkpoint_from_model(model, checkpoint_name):
    with model.graph.as_default():
        # if os.path.exists(checkpoint_name):
        #     shutil.rmtree(checkpoint_name)

        tf.saved_model.simple_save(
            model.sess,
            checkpoint_name,
            inputs={"obs": model.act_model.obs_ph},
            outputs={"action": model.act_model._deterministic_action})


if __name__ == '__main__':
    if os.path.isdir(file):
        shutil.rmtree(file)
    model = PPO2.load(file)

    generate_checkpoint_from_model(model, file)
    converter = tf.lite.TFLiteConverter.from_saved_model(file)
    tflite_model = converter.convert()
    open(file + "/converted_model.tflite", "wb").write(tflite_model)

    # multiprocess environment
    n_cpu = 1
    # 'Balboa-balance-ctrl-render-v1'
    env = gym.make('AntPyBulletEnv-v0')
    env.render(mode="human")
    obs = env.reset()
    # When using VecEnv, done is a vector

    while True:
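        # (assumed continuation) run the loaded policy in the rendered env
        action, _states = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        if done:
            obs = env.reset()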
Example #5
import roboschool
import gym

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2

env = gym.make('RoboschoolHopper-v1')
env = DummyVecEnv([lambda: env
                   ])  # The algorithms require a vectorized environment to run

model = PPO2(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=10000)

obs = env.reset()
for i in range(1000000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
env.close()
Example #6
#parser.add_argument('--no-train', dest='train', action='store_false')
#parser.set_defaults(train=True)
args = parser.parse_args()

import logging
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2
from aegis_core.aegis_env import AegisEnv

log = logging.getLogger('werkzeug')
log.setLevel(logging.ERROR)

# Create environment
env = AegisEnv(args.input_shape, args.output_shape, args.urls, port=args.port,
  discrete=args.discrete, niceness=args.niceness, n_steps=args.steps,
  reward_propagation=args.reward_prop)
env = DummyVecEnv([lambda: env])

#load model
model = PPO2.load(args.path, env, verbose=args.verbose, tensorboard_log=args.logdir)

#train
ep_counter = 0
while True:
  env.reset() #TODO: is this necessary?
  model.learn(total_timesteps=args.steps, reset_num_timesteps=False, tb_log_name=args.name)
  ep_counter += 1
  #TODO: actual step counter might be off because .learn might have different intervals
  print("Steps: {}".format(ep_counter * args.steps))
  model.save(args.path)
Example #7
sps = my_signal_rate / my_signal_repetitions

env = CustomEnv(signal_rate=my_signal_rate,
                signal_repetitions=my_signal_repetitions,
                step_limit=my_step_limit,
                number_of_gears=my_number_of_gears,
                gear_interval=my_gear_interval)
# Optional: PPO2 requires a vectorized environment to run
# the env is now wrapped automatically when passing it to the constructor
# env = DummyVecEnv([lambda: env])

my_learning_rate = 0.01  # 0.01 is probably a good value for training <1h
timesteps = 5000
model = PPO2(MlpPolicy,
             env,
             learning_rate=my_learning_rate,
             verbose=1,
             tensorboard_log="/home/fritz/Documents/TensorBoardLogs"
             )  # defaults: learning_rate=2.5e-4,
model.learn(total_timesteps=timesteps)

name = "ppo2_franka_SHIFTING_GEARS_learning_rate_" + str(
    my_learning_rate) + "_sps_" + str(sps) + "_timesteps_" + str(timesteps)
model.save(name)  # + str(my_learning_rate))

f = open("envparameters_" + name, "x")
f.write(
    str([
        my_signal_rate, my_signal_repetitions, my_step_limit,
        my_number_of_gears, my_gear_interval
    ]))
f.close()
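# (assumed usage) the saved parameter list can be read back with ast.literal_eval:
# import ast
# with open("envparameters_" + name) as f:
#     my_signal_rate, my_signal_repetitions, my_step_limit, \
#         my_number_of_gears, my_gear_interval = ast.literal_eval(f.read())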
Example #8
    return envs


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Train script for reinforcement learning")
    parser.add_argument('--algo', type=str, default='ppo2')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--log-dir', type=str, default='/logs/')
    parser.add_argument('--env', type=str, default='KukaButtonGymEnv-v0')
    parser.add_argument('--num-timesteps', type=int, default=int(1e5))
    parser.add_argument('--obs-type', type=str, default='ground_truth')
    parser.add_argument('--num-cpu', type=int, default=1)
    args, unknown = parser.parse_known_args()

    env_class = InmoovGymEnv
    # env default kwargs
    default_env_kwargs = {
        k: v.default
        for k, v in inspect.signature(env_class.__init__).parameters.items()
        if v is not None
    }
    env = createEnvs(args)
    tt()
    model = PPO2(policy=MlpPolicy,
                 env=env,
                 learning_rate=lambda f: f * 2.5e-4,
                 verbose=1)
    model.learn(total_timesteps=args.num_timesteps, seed=args.seed)
Example #9
    # callback for evaluation
    eval_callback = EvalCallback(env,
                                 best_model_save_path=specified_path,
                                 log_path=specified_path,
                                 eval_freq=100000,
                                 n_eval_episodes=5,
                                 verbose=1,
                                 deterministic=False,
                                 render=False)

    # train model
    try:
        try:
            model_path = join(specified_path, 'best_model.zip')
            model = PPO2.load(model_path,
                              env=env_8,
                              tensorboard_log=specified_path)
            # model = PPO2('MlpPolicy', env=env_8, tensorboard_log=specified_path, **model_config).load(args.modelpath, env=env_8)
            print("Existing model loaded from directory")

        except Exception:
            model = PPO2(policy,
                         env=env_8,
                         tensorboard_log=specified_path,
                         **model_config)
            print('New model created.')

        # Pretrain the model
        print('Starting pre-training of the model...')
        model.pretrain(dataset, n_epochs=100)
        model.save(join(specified_path, 'pretrained-model.zip'))
Example #10
import warnings
from stable_baselines import TRPO, PPO2, SAC, ACKTR, DDPG, TD3, ACER, DQN
from stable_baselines.common.vec_env import DummyVecEnv, SubprocVecEnv
from navigation_env import NavigationEnvDefault
from default_config import config
from stable_baselines.gail import generate_expert_traj
from stable_baselines.gail import ExpertDataset

if __name__ == "__main__":
    env = SubprocVecEnv(
        [lambda: NavigationEnvDefault(**config) for _ in range(32)])
    model = PPO2(env=env,
                 policy="MlpLstmPolicy",
                 n_steps=32,
                 nminibatches=4,
                 tensorboard_log='./',
                 verbose=1)
    model.learn(1000000000)
    model.save("recurrent_nodelay")
Example #11
print("act dim : ", env.action_space)

loadFileIndex = 2
loadFileString = "ppo2_pybulletAnt_6_end_{}".format(loadFileIndex)
saveFileString = "ppo2_pybulletAnt_6_end_{}".format(loadFileIndex + 1)

print("loadFile : ", loadFileString)
print("saveFile : ", saveFileString)

isTrain = True
isContinue = True
if isTrain:
  print("start training =========================================")

  if not isContinue:
    model = PPO2(MlpPolicy, env, verbose=1)
  else:
    print("load model =========================================")
    model = PPO2.load(loadFileString, env)

  model.learn(total_timesteps=1000000)
  print("end training =========================================")
  
  model.save(saveFileString)
  print("saved model =========================================")
else:
  print("load model =========================================")
  model = PPO2.load(loadFileString, env)

  print("start test =========================================")
Example #12
plt.tick_params(bottom=False,
                labelbottom=False,
                left=False,
                labelleft=False,
                right=False,
                labelright=False,
                top=False,
                labeltop=False)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['bottom'].set_visible(False)
plt.gca().spines['left'].set_visible(False)

for mf in model_files:
    iteration = get_model_iteration(mf)
    model = PPO2.load(mf)

    ims = []
    observation = env.reset()

    done_count = 0
    while True:
        img = env.render()
        im = plt.imshow(img)
        ims.append([im])

        action, _ = model.predict(observation)
        observation, reward, done, info = env.step(action)

        if done:
            done_count += 1
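            # (assumed continuation) start the next episode; the episode cap below is hypothetical
            observation = env.reset()
            if done_count >= 3:
                break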
Example #13
import pybullet_envs.bullet.minitaur_gym_env as e
from stable_baselines import PPO2

total_timesteps = 1000000
model = PPO2.load("./model/model{}".format(total_timesteps))
env = e.MinitaurBulletEnv(render=True)
obs = env.reset()
for i in range(50000):
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    env.render(mode="human")
env.close()
Example #14
import os
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2, DQN
from envs import Trainer

# Environment
env = DummyVecEnv([lambda: Trainer()])

# Train
model = PPO2(MlpPolicy, env, tensorboard_log='log')
model.learn(total_timesteps=100000)

# Save model
model.save('model')

# Play
episodes = 1
for e in range(episodes):
    obs = env.reset()
    done = False
    i = 0
    while not done:
        # Perform action
        action, _states = model.predict(obs)
        # Step the environment
        obs, reward, done, _ = env.step(action)
        print('Iteration: {i} - Action: {a} - Reward: {r}'.format(i=i,
                                                                  a=action,
                                                                  r=reward))
        # Render environment
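        env.render()
        # assumed continuation: advance the iteration counter used in the log line above
        i += 1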
Example #15
def main():

    parser = argparse.ArgumentParser(
        description='Plotting mechanisms for GARAT and related modifications')
    parser.add_argument('--sim_env',
                        default="InvertedPendulum-v2",
                        type=str,
                        help="Name of the simulator/source environment")
    parser.add_argument('--real_env',
                        default="InvertedPendulumModified-v2",
                        type=str,
                        help="Name of the real/target environment")
    parser.add_argument(
        '--load_policy_path',
        default=
        "data/models/TRPO_initial_policy_steps_InvertedPendulum-v2_2000000_.pkl",
        help="relative path of policy to be used for generating plots")
    parser.add_argument(
        '--load_atp_path',
        default=
        "data/models/garat/Single_GAIL_sim2real_TRPO_2000000_1000_50_0/",
        type=str,
        help="relative path for stored Action transformation policies")
    parser.add_argument('--seed', default=0, type=int, help="Random seed")
    args = parser.parse_args()

    #Set seed
    np.random.seed(args.seed)

    sim_env = gym.make(args.sim_env)
    real_env = gym.make(args.real_env)

    policy = TRPO.load(args.load_policy_path)

    action_tf_policy_list_single = []
    action_tf_policy_list_double = []
    action_tf_policy_list_shared_double = []
    action_tf_policy_list_airl = []
    num_grounding = 50

    atp_path_single = args.load_atp_path
    atp_path_double = args.load_atp_path.replace('_0', '_2')
    atp_path_shared_double = args.load_atp_path.replace('_0', '_1')
    atp_path_airl = args.load_atp_path.replace(
        'Single_GAIL_sim2real_TRPO_2000000_1000_50_0',
        'Single_AIRL_sim2real_TRPO_2000000_1000_50_1')

    print('################## Begin File loading ##################')
    for index in range(num_grounding):
        file_path_single = os.path.join(
            atp_path_single,
            "action_transformer_policy1_" + str(index) + ".pkl")
        print(file_path_single)
        action_tf_policy_list_single.append(PPO2.load(file_path_single))
        file_path_double = os.path.join(
            atp_path_double,
            "action_transformer_policy1_" + str(index) + ".pkl")
        print(file_path_double)
        action_tf_policy_list_double.append(PPO2.load(file_path_double))
        file_path_shared_double = os.path.join(
            atp_path_shared_double,
            "action_transformer_policy1_" + str(index) + ".pkl")
        print(file_path_shared_double)
        action_tf_policy_list_shared_double.append(
            PPO2.load(file_path_shared_double))
        #file_path_airl = os.path.join(atp_path_airl,"action_transformer_policy1_"+str(index)+".pkl")
        #print(file_path_airl)
        #action_tf_policy_list_airl.append(PPO2.load(file_path_airl))
    results_dict = {}
    print('################## File loading Completed ##################')

    results_single = calculate_transition_errors(sim_env, real_env, policy,
                                                 action_tf_policy_list_single)

    print('############## Begin Double Discriminator Calculations')

    results_shared_double = calculate_transition_errors(
        sim_env, real_env, policy, action_tf_policy_list_shared_double)

    results_double = calculate_transition_errors(sim_env, real_env, policy,
                                                 action_tf_policy_list_double)

    print('############## Begin AIRL Calculations')

    #results_airl = calculate_transition_errors(sim_env, real_env, policy, action_tf_policy_list_airl)

    results_dict['GARAT'] = results_single
    results_dict['GARAT Double Discriminator'] = results_double
    results_dict[
        'GARAT Double Discriminator (Generator LR modifications)'] = results_shared_double
    #results_dict['GARAT AIRL'] = results_airl

    plot_results(results_dict)
Example #16
    # Create the callback list
    callback = CallbackList([checkpoint_callback, eval_callback])

    lr_sch = LinearSchedule(int(10e6), 1.0e-5, 2.5e-4)

    model = PPO2(
        policy=MlpPolicy,
        env=env,
        verbose=1,
        tensorboard_log="./ppo2_docking_tensorboard/",
        policy_kwargs=dict(net_arch=[128, dict(pi=[128], vf=[128])],
                           act_fun=tf.nn.relu),
        lam=0.95,
        gamma=0.99,  # lower 0.9 ~ 0.99
        # n_steps=math.floor(cfg['env']['max_time'] / cfg['env']['ctl_dt']),
        n_steps=600,
        ent_coef=0.00,
        learning_rate=3e-4,
        # learning_rate=lr_sch.value,
        # learning_rate=linear_schedule(3e-4),
        vf_coef=0.5,
        max_grad_norm=0.5,
        nminibatches=10,
        noptepochs=10,
        cliprange=0.2)

    # load trained model
    # model = PPO2.load("ppo2_docking_621_shaping_10M", env=env, tensorboard_log="./ppo2_docking_tensorboard/")

    model.learn(total_timesteps=int(10e6), callback=callback)
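    # (assumed follow-up) persist the trained policy for later evaluation, e.g.:
    # model.save("ppo2_docking_latest")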
Example #17
from stable_baselines import PPO2


def env_create():
    env = ClientDapr("ActorOpenAI")
    env.create("CartPole-v1")

    print(f"[Client] Created Actor {env.actor_id}", flush=True)

    return env


print("===============================================")
print("INFERING")
print("===============================================")
model = PPO2.load("baselines_ppo_cartpole")
env_local = env_create()

# Start monitoring
print("[Client] Starting to monitor", flush=True)
env_local.monitor_start(1)

# Run Experiment
obs = env_local.reset()
is_done = False

while not (is_done):
    action, _states = model.predict(obs)
    obs, rewards, is_done, info = env_local.step(action)

# Stop Monitoring
Example #18
def train(env_id, num_timesteps, seed, policy, n_envs=8, nminibatches=4,
          n_steps=128, peer=0., scheduler=None, individual=False, repeat=1):
    """
    Train PPO2 model for atari environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    :param n_envs: (int) Number of parallel environments
    :param nminibatches: (int) Number of training minibatches per update.
        For recurrent policies, the number of environments run in parallel
        should be a multiple of nminibatches.
    :param n_steps: (int) The number of steps to run for each environment
        per update (i.e. batch size is n_steps * n_env where n_env is
        number of environment copies running in parallel)
    :param peer: (float) coefficient applied to the cross-view (peer) updates
    :param scheduler: (callable) schedule used to scale the peer coefficient per update
    :param individual: (bool) if True, train the two views independently (no peer updates)
    :param repeat: (int) number of peer-update passes per training iteration
    """

    policy = {
        'cnn': CnnPolicy,
        'lstm': CnnLstmPolicy,
        'lnlstm': CnnLnLstmPolicy,
        'mlp': MlpPolicy
    }[policy]

    is_atari = 'NoFrameskip' in env_id
    make_env = lambda: VecFrameStack(make_atari_env(env_id, n_envs, seed), 4) if is_atari \
        else make_vec_env(env_id, n_envs, seed)
    print(make_env)

    models = {
        "A": PPO2(
            policy=policy, policy_kwargs={'view': 'even'}, n_steps=n_steps,
            env=make_env(), nminibatches=nminibatches, lam=0.95, gamma=0.99, 
            noptepochs=4, ent_coef=.01, learning_rate=2.5e-4,
            cliprange=lambda f: f * 0.1, verbose=1),
        "B": PPO2(
            policy=policy, policy_kwargs={'view': 'odd'}, n_steps=n_steps,
            env=make_env(), nminibatches=nminibatches, lam=0.95, gamma=0.99, 
            noptepochs=4, ent_coef=.01, learning_rate=2.5e-4,
            cliprange=lambda f: f * 0.1, verbose=1)}

    views = {view: View(models[view], peer=peer) for view in ("A", "B")}

    n_batch = n_envs * n_steps
    n_updates = num_timesteps // n_batch

    for t in range(n_updates):
        logger.info("current episode:", t)
        for view in "A", "B":
            models[view].learn(n_batch)
        if not individual:
            for view, other_view in zip(("A", "B"), ("B", "A")):
                obses, _, _, actions, _, _, _, _, _ = models[other_view].rollout
                views[view].peer = peer * scheduler(t)
                logger.info("current alpha:", views[view].peer)
                for _ in range(repeat):
                    views[view].learn(
                        obses, actions, views[view].learning_rate / repeat)

    for view in "A", "B":
        models[view].env.close()
        del models[view]  # free memory
Example #19
        'D': [450, 450, 5000]
    },
    'Linear_To_Angular_Scaler': [1, 1, 0],
    'Yaw_Rate_Scaler': 0.18,
    'Angular_PID': {
        'P': [24000, 24000, 1500],
        'I': [0, 0, 1.2],
        'D': [12000, 12000, 0]
    },
    'Angular_PID2': {
        'P': [4000, 4000, 1500],
        'I': [0, 0, 1.2],
        'D': [1500, 1500, 0]
    },
}

env = Quad_Env()
env = make_vec_env(lambda: env, n_envs=1)
# If the environment doesn't follow the interface, an error will be thrown
obs = env.reset()

model = PPO2(MlpLnLstmPolicy,
             env,
             nminibatches=1,
             tensorboard_log="./stationary_env_ppo/")

model.learn(total_timesteps=100000, log_interval=4000)

model.save("ppo_30rotor_fault_blending")

print("Training complete - agent saved")
Example #20
# ============ Number of days trained =============
REPEAT_NO = 10
tstep_list = [200000]
# tstep_list = [50000, 100000]
# tstep_list = [100000, 500000]

for tstep in tstep_list:
    final_result = []
    summary_fileName = summary_fileName_model[:-5] + str(tstep) + ".out"
    for modelNo in range(REPEAT_NO):
        profit_list = []
        act_profit_list = []
        detail_list = []
        model = PPO2(MlpPolicy,
                     trainEnv,
                     verbose=1,
                     tensorboard_log="./" + SAVE_DIR[-3:] + '_' + str(tstep) +
                     "_tensorboard/")
        model.learn(total_timesteps=tstep, log_interval=128)
        # model.learn(total_timesteps=tstep)
        model_name = common_fileName_prefix + str(tstep) + '-' + str(
            modelNo) + "-model.model"
        model.save(path.join(SAVE_DIR, model_name), cloudpickle=True)

        obs = testEnv.reset()

        # Test for 5 consecutive years (365 * 5 days)
        for testNo in range(365 * 5):
            action, _states = model.predict(obs)
            if np.isnan(action).any():
                print(testNo)
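                break  # assumed: abort the test rollout on NaN actions
            # (assumed continuation) step the test env with the predicted action
            obs, rewards, done, info = testEnv.step(action)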
Example #21
stiffness_value = "stiffness_test16"
save_name_extension = RL_method

log_dir = "./logs/{}/{}/{}/".format(experiment_ID, RL_method, stiffness_value)



# defining the environments
env = gym.make('NmiLeg-v1')
#env = DummyVecEnv([lambda: env])

# loading the trained model
if RL_method == "PPO1":
	model = PPO1.load(log_dir+"/model.pkl")
elif RL_method == "PPO2":
	model = PPO2.load(log_dir+"/model.pkl")
	env = DummyVecEnv([lambda: env])
elif RL_method == "DDPG":
	model = DDPG.load(log_dir+"/model.pkl")
	env = DummyVecEnv([lambda: env])
else:
	raise ValueError("Invalid RL mode")
# setting the environment

model.set_env(env)

env_run = gym.make('NmiLeg-v1')
#env_run = Monitor(env_run,'./video/'+log_dir,force=True)
#model = DDPG.load("PPO2-HalfCheetah_nssu-v3_test2")
obs = env_run.reset()
#while True:
Example #22
    def train(self):
        if self.train_df is None:
            self.logger.info("Running built-in data preparation")
            self.prepare_data()
        else:
            self.logger.info("Using provided data (Length: %d)" %
                             len(self.train_df))

        study_name = 'ppo2_' + self.reward_strategy

        study = optuna.load_study(study_name=study_name,
                                  storage=self.params_db_file)
        params = study.best_trial.params

        train_env = DummyVecEnv([
            lambda: BitcoinTradingEnv(self.train_df,
                                      reward_func=self.reward_strategy,
                                      forecast_len=int(params['forecast_len']),
                                      confidence_interval=params[
                                          'confidence_interval'])
        ])

        test_env = DummyVecEnv([
            lambda: BitcoinTradingEnv(self.test_df,
                                      reward_func=self.reward_strategy,
                                      forecast_len=int(params['forecast_len']),
                                      confidence_interval=params[
                                          'confidence_interval'])
        ])

        model_params = self.model_params(params)

        model = PPO2(MlpLnLstmPolicy,
                     train_env,
                     verbose=0,
                     nminibatches=1,
                     tensorboard_log=os.path.join('.', 'tensorboard'),
                     **model_params)

        models_to_train = 1
        self.logger.info("Training {} model instances".format(models_to_train))

        for idx in range(models_to_train):  # Not sure why we are doing this, tbh
            self.logger.info('[%d] Training for: %d time steps' %
                             (idx, len(self.train_df)))

            model.learn(total_timesteps=len(self.train_df))

            obs = test_env.reset()
            done, reward_sum = False, 0

            while not done:
                action, _states = model.predict(obs)
                obs, reward, done, info = test_env.step(action)
                reward_sum += reward

            self.logger.info('[%d] Total reward: %s (%s)' %
                             (idx, reward_sum, self.reward_strategy))
            model.save(
                os.path.join(
                    '.', 'agents',
                    'ppo2_' + self.reward_strategy + '_' + str(idx) + '.pkl'))

        self.logger.info("Trained {} model instances".format(models_to_train))
Example #23
    print('Saving the trained model!')
    model.save(save_path)
    # dump the flow params
    with open(os.path.join(path, args.result_name) + '.json', 'w') as outfile:
        json.dump(flow_params,
                  outfile,
                  cls=FlowParamsEncoder,
                  sort_keys=True,
                  indent=4)
    del model
    del flow_params

    # Replay the result by loading the model
    print('Loading the trained model and testing it out!')
    model = PPO2.load(save_path)
    flow_params = get_flow_params(
        os.path.join(path, args.result_name) + '.json')
    flow_params['sim'].render = True
    env_constructor = env_constructor(params=flow_params, version=0)()
    env = DummyVecEnv([
        lambda: env_constructor
    ])  # The algorithms require a vectorized environment to run
    obs = env.reset()
    reward = 0
    for i in range(flow_params['env'].horizon):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        reward += rewards
    print('the final reward is {}'.format(reward))
Example #24
# storage='sqlite:///params.db')
# calmar_env = DummyVecEnv([lambda: BitcoinTradingEnv(
#    test_df, reward_func="profit", forecast_len=int(calmar_study.best_trial.params['forecast_len']), confidence_interval=calmar_study.best_trial.params['confidence_interval'])])

omega_study = optuna.load_study(study_name='ppo2_omega',
                                storage='sqlite:///params.db')
omega_env = DummyVecEnv([
    lambda: BitcoinTradingEnv(
        test_df,
        reward_func="profit",
        forecast_len=int(omega_study.best_trial.params['forecast_len']),
        confidence_interval=omega_study.best_trial.params['confidence_interval'])
])

profit_model = PPO2.load('./agents/ppo2_profit_4.pkl', env=profit_env)
sortino_model = PPO2.load('./agents/ppo2_sortino_4.pkl', env=sortino_env)
# calmar_model = PPO2.load('./agents/ppo2_calmar_4.pkl', env=calmar_env)
omega_model = PPO2.load('./agents/ppo2_omega_4.pkl', env=omega_env)

profit_obs = profit_env.reset()
sortino_obs = sortino_env.reset()
# calmar_obs = calmar_env.reset()
omega_obs = omega_env.reset()

profit_net_worths = [10000]
sortino_net_worths = [10000]
# calmar_net_worths = [10000]
omega_net_worths = [10000]

done = False
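# (assumed continuation) minimal comparison sketch: step each strategy's env with its own model;
# the net-worth lists above would be updated from the envs' info dicts
while not done:
    profit_obs, _, done, _ = profit_env.step(profit_model.predict(profit_obs)[0])
    sortino_obs, _, _, _ = sortino_env.step(sortino_model.predict(sortino_obs)[0])
    omega_obs, _, _, _ = omega_env.step(omega_model.predict(omega_obs)[0])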
Example #25
    'NovelGridworld-v3_200000_8beams0filled40range3items_in_360degrees_lfd_best_model',
    'NovelGridworld-v4_200000_8beams0filled40range3items_in_360degrees_lfd_best_model',
    'NovelGridworld-v3_200000_8beams0filled40range3items_in_360degrees_lfd_best_model'
]

assert len(env_key_list) == len(
    env_models), "Provide both: env_id and their models"

render = True

render_title = ''
env_dict = {env_id: {} for env_id in env_key_list}
# Load the trained agents
for i in range(len(env_key_list)):
    print("env_key_list[i]: ", env_key_list[i])
    env_dict[env_key_list[i]]['model'] = PPO2.load(env_models[i])
    render_title += env_key_list[i] + '_'
render_title = render_title[:-1]
render_title = 'NovelGridworld-v5'

# make 1st env
env_dict[env_key_list[0]]['env'] = gym.make(env_id_list[0])

for i_episode in range(10):
    # make 2nd env, 3rd env, ... nth env that can restore previous env
    for i in range(1, len(env_key_list)):
        env_dict[env_key_list[i]]['env'] = gym.make(
            env_id_list[i], env=env_dict[env_key_list[i - 1]]['env'])

    # Play trained env.
    for env_idx in range(len(env_key_list)):
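        # (assumed continuation) roll out each trained agent in its corresponding env
        env = env_dict[env_key_list[env_idx]]['env']
        model = env_dict[env_key_list[env_idx]]['model']
        obs = env.reset()
        done = False
        while not done:
            action, _states = model.predict(obs)
            obs, reward, done, info = env.step(action)
            if render:
                env.render()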
Example #26
      print(x[-1], 'timesteps')
      print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(best_mean_reward, mean_reward))

      # New best model, you could save the agent here
      if mean_reward > best_mean_reward:
          best_mean_reward = mean_reward
          # Example for saving best model
          print("Saving new best model")
          _locals['self'].save(log_dir + 'best_model.pkl')
  return True



if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', type=str, default=config_file)
    args = parser.parse_args()
    
    # Create log dir
    log_dir = "/tmp/gym/"
    os.makedirs(log_dir, exist_ok=True)

    env = HamstirGibsonEnv(config=args.config)
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])
    
    model = PPO2(CnnPolicy, env, verbose=1, gamma=0.95, n_steps=2000)
    
    # print(env.config)
    model.learn(total_timesteps=100000, callback=callback)
Example #27
def env_create():
    env = ClientDapr("ActorOpenAI")
    env.create("LunarLander-v2")
    print(f"[Client] Created Sim {env.actor_id}", flush=True)

    return env


if __name__ == '__main__':
    print("===============================================", flush=True)
    print("TRAINING", flush=True)
    print("===============================================", flush=True)
    cpu = 50
    env = SubprocVecEnv([lambda: env_create() for _ in range(cpu)])

    model = PPO2(MlpPolicy,
                 env,
                 verbose=1,
                 tensorboard_log="./output/tensorboard")
    model.learn(total_timesteps=100000)
    print("[Client][Train] Saving Model", flush=True)
    model.save("baselines_ppo_cartpole")
    print("[Client][Train] DONE", flush=True)

    # Evaluate
    # mean_reward, std_reward = evaluate_policy(model, model.get_env()[0], n_eval_episodes=10)
    # print(f"Mean Reward: {mean_reward}; Std Reward: {std_reward}")
    # evaluate_actor_id = model.get_env()[0].actor_id
    # print(f"Env ID: {evaluate_actor_id}")
Example #28
my_step_size = float(f_list[1])
my_maxspeed = float(f_list[2])
my_acceleration = 2.5 / 4
my_randomBall = True
my_binaryReward = True

# Initialize environment with signal parameters:
env = CustomEnv(step_limit=my_step_limit,
                step_size=my_step_size,
                maxspeed=my_maxspeed,
                acceleration=my_acceleration,
                randomBall=my_randomBall,
                binaryReward=my_binaryReward)  # 0.01745*5

# Load trained model and execute it forever:
model = PPO2.load("../Models/" + filename)

while True:
    #obs = env.reset()
    obs = env.reset()
    #obs = obs.reshape((1,4))
    #print(env.observation_space.shape)
    #obs, rewards, dones, info = env.step([0,0])
    for i in range(my_step_limit):  #my_step_limit
        action, _states = model.predict(obs)
        print(action)
        obs, rewards, dones, info = env.step(action)
        #obs = np.array(obs).reshape((1,4))
        env.renderSlow(50)
        if (dones):
            env.renderSlow(1)
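            break  # assumed: end the rollout once the episode reports done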
Example #29
def set_seed(rand_seed):
    set_global_seeds(rand_seed)
    env.env_method('seed', rand_seed)
    np.random.seed(rand_seed)
    os.environ['PYTHONHASHSEED'] = str(rand_seed)
    model.set_random_seed(rand_seed)


x = 0.5

env = gym.make('offload-autoscale-v0', p_coeff=x)
# Optional: PPO2 requires a vectorized environment to run
# the env is now wrapped automatically when passing it to the constructor
env = DummyVecEnv([lambda: env])
rand_seed = 1234
model = PPO2(MlpPolicy, env, verbose=1, seed=rand_seed)
model.learn(total_timesteps=1000)

rewards_list_ppo = []
avg_rewards_ppo = []
rewards_time_list_ppo = []
avg_rewards_time_list_ppo = []
rewards_bak_list_ppo = []
avg_rewards_bak_list_ppo = []
rewards_bat_list_ppo = []
avg_rewards_bat_list_ppo = []
avg_rewards_energy_list_ppo = []
ppo_data = []

rewards_list_random = []
avg_rewards_random = []
Example #30
import os
import sys
import gym
import gym_real
import numpy as np
import matplotlib.pyplot as plt
import datetime
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import SubprocVecEnv, DummyVecEnv
from stable_baselines import PPO2
from stable_baselines.bench import Monitor
from stable_baselines.results_plotter import load_results, ts2xy

if __name__ == "__main__":
    env_name = str(sys.argv[1])
    file_name = str(sys.argv[2])

    if file_name[:3] == "mod":
        model_name = file_name
    else:
        dirpath = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                               "models")
        log_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                               "tmp")
        model_name = os.path.join(dirpath, file_name)

    env = gym.make(env_name)
    model = PPO2.load(model_name)

    obs = env.reset()
    for i in range(10000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()