def main():
    """
    # Example with a simple Dummy vec env
    env = gym.envs.make('panda-ip-reach-v0', renders= True)
    env = DummyVecEnv([lambda: env])
    """
    print("Env created !")

    env = PandaReachGymEnv(renders=True)

    env.render(mode='rgb_array')

    model = DDPG.load("ddpg_panda_reach")
    print("model loaded !")

    while True:
        obs, done = env.reset(), False
        print("===================================")
        print("obs")
        print(obs)
        episode_rew = 0
        while not done:
            env.render(mode='rgb_array')
            action, _states = model.predict(obs)
            obs, rew, done, info = env.step(action)
            episode_rew += rew
        print("Episode reward", episode_rew)
Example 2
def train_stable_baselines(submodule, flags):
    """Train policies using the PPO algorithm in stable-baselines."""
    from stable_baselines3.common.vec_env import DummyVecEnv

    flow_params = submodule.flow_params
    # Path to the saved files
    exp_tag = flow_params['exp_tag']
    result_name = '{}/{}'.format(exp_tag, strftime("%Y-%m-%d-%H:%M:%S"))

    # Perform training.
    start_time = timeit.default_timer()
    # print experiment.json information
    print("=========================================")
    print('Beginning training.')
    print('Algorithm :', flags.algorithm)
    model = run_model_stablebaseline(flow_params, flags.num_cpus,
                                     flags.rollout_size, flags.num_steps,
                                     flags.algorithm, flags.exp_config)

    stop_time = timeit.default_timer()
    run_time = stop_time - start_time
    print("Training is Finished")
    print("total runtime: ", run_time)
    # Save the model to a desired folder and then delete it to demonstrate
    # loading.
    print('Saving the trained model!')
    path = os.path.realpath(os.path.expanduser('~/baseline_results'))
    ensure_dir(path)
    save_path = os.path.join(path, result_name)
    model.save(save_path)

    # dump the flow params
    with open(os.path.join(path, result_name) + '.json', 'w') as outfile:
        json.dump(flow_params,
                  outfile,
                  cls=FlowParamsEncoder,
                  sort_keys=True,
                  indent=4)

    # Replay the result by loading the model
    print('Loading the trained model and testing it out!')
    if flags.exp_config.lower() == "ppo":
        from stable_baselines3 import PPO
        model = PPO.load(save_path)
    elif flags.exp_config.lower() == "ddpg":
        from stable_baselines3 import DDPG
        model = DDPG.load(save_path)
    flow_params = get_flow_params(os.path.join(path, result_name) + '.json')
    flow_params['sim'].render = True
    env = env_constructor(params=flow_params, version=0)()
    # The algorithms require a vectorized environment to run
    eval_env = DummyVecEnv([lambda: env])
    obs = eval_env.reset()
    reward = 0
    for _ in range(flow_params['env'].horizon):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = eval_env.step(action)
        reward += rewards
    print('the final reward is {}'.format(reward))
Example 3
def load_model(env, algorithm, filename):
    if algorithm == "ddpg":
        return DDPG.load(filename, env=env)
    elif algorithm == "td3":
        return TD3.load(filename, env=env)
    elif algorithm == "sac":
        return SAC.load(filename, env=env)
    else:
        raise Exception("--> Alican's LOG: Unknown agent type!")
Example 4
def test(MODEL_TEST):
    log_dir = "model_save/" + MODEL_PATH + "/" + MODEL_PATH + MODEL_TEST

    env = ENV(util='test', par=PARAM, dt=DT)
    env.render = True
    env = Monitor(env, log_dir)

    if PARAM['algo']=='td3':
        model = TD3.load(log_dir)
    elif PARAM['algo']=='ddpg':
        model = DDPG.load(log_dir)
    elif PARAM['algo']=='ppo':
        model = PPO.load(log_dir)

    # plot_results(f"model_save/")
    trade_dt = pd.DataFrame([])     # trade_dt: trade data for all stocks
    result_dt = pd.DataFrame([])    # result_dt: one-year test results for all stocks
    for i in range(TEST_STOCK_NUM):
        state = env.reset()
        stock_bh_id = 'stock_bh_'+str(i)            # buy-and-hold record for each stock's trades
        stock_port_id = 'stock_port_'+str(i)        # portfolio record for each stock's trades
        stock_action_id = 'stock_action_' + str(i)  # action record for each stock's trades
        flow_L_id = 'stock_flow_' + str(i)          # cash-flow record for each stock
        stock_bh_dt, stock_port_dt, action_policy_dt, flow_L_dt = [], [], [], []
        day = 0
        while True:
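            # model.predict(state) returns an (action, state) tuple, so action[0] below is the action array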
            action = model.predict(state)
            next_state, reward, done, info = env.step(action[0])
            state = next_state
            # print("trying:",day,"reward:", reward,"now profit:",env.profit)   # 测试每一步的交易policy
            stock_bh_dt.append(env.buy_hold)
            stock_port_dt.append(env.Portfolio_unit)
            action_policy_dt.append(action[0][0])  # 用于记录policy
            flow_L_dt.append(env.flow)
            day+=1
            if done:
                print('stock: {}, total profit: {:.2f}%, buy hold: {:.2f}%, sp: {:.4f}, mdd: {:.2f}%, romad: {:.4f}'
                      .format(i, env.profit*100, env.buy_hold*100, env.sp, env.mdd*100, env.romad))
                # after trading, record: stock ID, profit (%), buy_hold (%), Sharpe ratio, max drawdown (%), romad
                result=pd.DataFrame([[i,env.profit*100,env.buy_hold*100,env.sp,env.mdd*100,env.romad]])
                break

        trade_dt_stock = pd.DataFrame({stock_port_id: stock_port_dt,
                                       stock_bh_id: stock_bh_dt,
                                       stock_action_id: action_policy_dt,
                                       flow_L_id: flow_L_dt})  # trade data for this stock

        trade_dt = pd.concat([trade_dt, trade_dt_stock], axis=1)    # merge trade data across stocks (as new columns)
        result_dt = pd.concat([result_dt,result],axis=0)            # merge result data across stocks (as new rows)

    result_dt.columns = ['stock_id','profit(100%)','buy_hold(100%)','sp','mdd(100%)','romad']
    trade_dt.to_csv('out_dt/trade_'+MODEL_PATH+'.csv',index=False)
    result_dt.to_csv('out_dt/result_'+MODEL_PATH+'.csv',index=False)
Example 5
def main():
    args = parse_arguments()
    load_path = os.path.join("logs", args.env, args.agent, "best_model.zip")
    stats_path = os.path.join(args.log_dir, args.env, args.agent, "vec_normalize.pkl")

    if args.agent == 'ddpg':
        from stable_baselines3 import DDPG
        model = DDPG.load(load_path)
    elif args.agent == 'td3':
        from stable_baselines3 import TD3
        model = TD3.load(load_path)
    elif args.agent == 'ppo':
        from stable_baselines3 import PPO
        model = PPO.load(load_path)

    env = make_vec_env(args.env, n_envs=1)
    env = VecNormalize.load(stats_path, env)
    #  do not update them at test time
    env.training = False
    # reward normalization is not needed at test time
    env.norm_reward = False
    
    # env = gym.make(args.env)
    img = []
    if args.render:
        env.render('human')
    done = False
    obs = env.reset()
    action = model.predict(obs)
    if args.gif:
        img.append(env.render('rgb_array'))

    if args.timesteps is None:
        while not done: 
            action, _= model.predict(obs)
            obs, reward, done, info = env.step(action)
            if args.gif:
                img.append(env.render('rgb_array'))
            else:
                env.render()
    else:
        for i in range(args.timesteps): 
            action, _= model.predict(obs)
            obs, reward, done, info = env.step(action)
            if args.gif:
                img.append(env.render('rgb_array'))
            else:
                env.render()

    if args.gif:
        imageio.mimsave(os.path.join("logs", args.env, args.agent, "recording.gif"),
                        [np.array(frame) for i, frame in enumerate(img) if i % 2 == 0],
                        fps=29)
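The script above assumes that VecNormalize statistics were written to vec_normalize.pkl during training. A minimal sketch of that training-side step, with a placeholder environment id and the same file-name conventions:

# Sketch only: producing the files the test script expects (env id is a placeholder)
from stable_baselines3 import DDPG
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import VecNormalize

train_env = make_vec_env("Pendulum-v1", n_envs=1)
train_env = VecNormalize(train_env, norm_obs=True, norm_reward=True)
model = DDPG("MlpPolicy", train_env, verbose=1)
model.learn(total_timesteps=10_000)
model.save("best_model")              # loaded above as best_model.zip
train_env.save("vec_normalize.pkl")   # loaded above with VecNormalize.load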
Example 6
def test_ddpg():
    log_dir = f"model_save/best_model_ddpg_cnn"
    env = ENV(istest=True)
    env.render = True
    env = Monitor(env, log_dir)
    model = DDPG.load(log_dir)
    plot_results(f"model_save/")
    for i in range(10):
        state = env.reset()
        while True:
            action = model.predict(state)
            next_state, reward, done, info = env.step(action[0])
            state = next_state
            # print("trying:",i,"action:", action,"now profit:",env.profit)
            if done:
                print('stock',i,' total profit=',env.profit,' buy hold=',env.buy_hold)
                break
Example 7
def run(env, algname, filename):
    if algname == "TD3":
        model = TD3.load(f"{algname}_pkl")
    elif algname == "SAC":
        if filename:
            model = SAC.load(f"{filename}")
        else:
            model = SAC.load(f"{algname}_pkl")
    elif algname == "DDPG":
        model = DDPG.load(f"{algname}_pkl")
    else:
        raise "Wrong algorithm name provided."

    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        env.render()
        if done:
            break
Example 8
def test_ddpg():
    log_dir = f"model_save/best_model_ddpg_sp2"
    env = ENV(istest=True)
    env.render = True
    env = Monitor(env, log_dir)
    model = DDPG.load(log_dir)
    plot_results(f"model_save/")
    for i in range(10):
        state = env.reset()
        day = 0
        while True:
            action = model.predict(state)
            next_state, reward, done, info = env.step(action[0])
            state = next_state
            # print("trying:",day,"reward:", reward,"now profit:",env.profit)
            day += 1
            if done:
                print(
                    'stock: {}, total profit: {:.2f}%, buy hold: {:.2f}%, sp: {:.4f}, mdd: {:.2f}%, romad: {:.4f}'
                    .format(i, env.profit * 100, env.buy_hold * 100, env.sp,
                            env.mdd * 100, env.romad))
                break
Example 9
import os
import time
import pdb
import math
import numpy as np
import pybullet as p
import gym
from gym import error, spaces, utils
from gym.utils import seeding
from stable_baselines3 import DDPG
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.evaluation import evaluate_policy

from gym_pybullet_drones.envs.RLTetherAviary import RLTetherAviary

if __name__ == "__main__":

    #### Check the environment's spaces ################################################################
    env = RLTetherAviary(gui=1, record=1)

    # env.USE_GUI_RPM = True

    model_name = "<path-to-model>"
    model = DDPG.load(model_name)

    obs = env.reset()

    for i in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        # env.render()
Example 10
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                            sigma=float(0.5) *
                                            np.ones(n_actions))

# model = DDPG(MlpPolicy, env, verbose=1, param_noise=param_noise, action_noise=action_noise)
model = DDPG(MlpPolicy, env, verbose=1, action_noise=action_noise)
model.learn(total_timesteps=400000)
model.save(
    "/home/nightmareforev/git/bullet_stuff/multi_kuka_sim/kukakr5Arc/envs/saved_policies/kukakr5Arc_reacher"
)
print('Saving model.... Model saved')

del model  # remove to demonstrate saving and loading

model = DDPG.load(
    "/home/nightmareforev/git/bullet_stuff/multi_kuka_sim/kukakr5Arc/envs/saved_policies/kukakr5Arc_reacher",
    env=env)
print('Loading model.....Model loaded')

#env.render() goes before env.reset() for the render to work
#env.render()

#obs = env.reset()
#while True:
#    action, _states = model.predict(obs)
#    obs, rewards, dones, info = env.step(action)
#    env.render()

while True:
    print("running saved policy")
    obs = env.reset()
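    # NOTE: the listing is cut off here; the loop presumably continues with the
    # rollout pattern from the commented-out block above (a sketch, not the original code):
    #     action, _states = model.predict(obs)
    #     obs, rewards, dones, info = env.step(action)
    #     env.render()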
Example 11
    env = gym.make('gym_spm:spm-v0')
    # env = make_vec_env(env_id, n_envs=1, seed=0)
    # env = VecCheckNan(env, raise_exception=True)
    # env = check_env(env)

    # The noise objects for DDPG
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=25.67 * np.ones(n_actions))

    # model = TD3(MlpPolicy, env, action_noise=action_noise, verbose=1, tensorboard_log="./TD3_spm_v2_SOC_point5_two_state/")
    model = DDPG(MlpPolicy, env, action_noise=action_noise, verbose=1, tensorboard_log="./DDPG_spm_v2_SOC_point5_two_state/")
    model.learn(total_timesteps=25000, tb_log_name='DDPG_test_run_3_SOCpoint5_two_state')
    model.save('DDPG_test_3_SOC_point5_two_states')


    model = DDPG.load('DDPG_test_2_SOC_point5_two_states', env=env)  # DDPG.load is a classmethod; reassign the result and keep the env
    mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
    
    print("Mean Reward = ", mean_reward)
    
    epsi_sp_list = []
    action_list = []
    soc_list = []
    Concentration_list = []
    Concentration_list1 = []
    
    obs = env.reset()
    for _ in range(3600):
    
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, done, info = env.step(action)
Example 12
def load_weights(self, weights_file):
    """ Load the model from a zip archive """
    logger.info(f"load weight from file: {weights_file}")
    self.model = DDPG.load(weights_file, env=self.env)
Example 13
    # Instantiate Model
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=25.67 * np.ones(n_actions))
    model = DDPG(MlpPolicy,
                 env,
                 action_noise=action_noise,
                 verbose=1,
                 tensorboard_log=log_dir)

    # Train OR Load Model
    if train_model:
        model.learn(total_timesteps=25000, tb_log_name=details)
        model.save(model_dir_description)
    else:
        model = DDPG.load(model_dir_description, env=env)  # DDPG.load is a classmethod; reassign the result and keep the env

    mean_reward, std_reward = evaluate_policy(model,
                                              model.get_env(),
                                              n_eval_episodes=10)

    print("Mean Reward = ", mean_reward)

    epsi_sp_list = []
    action_list = []
    soc_list = []
    Concentration_list = []
    Concentration_list1 = []

    obs = env.reset()
    for _ in range(3600):
Example 14
# If a model is from a bucket, download model
if 'gs://' in args.model:
    # Download from given bucket (gcloud configured with privileges)
    client = gcloud.init_storage_client()
    bucket_name = args.model.split('/')[2]
    model_path = args.model.split(bucket_name + '/')[-1]
    gcloud.read_from_bucket(client, bucket_name, model_path)
    model_path = './' + model_path
else:
    model_path = args.model

model = None
if args.algorithm == 'DQN':
    model = DQN.load(model_path)
elif args.algorithm == 'DDPG':
    model = DDPG.load(model_path)
elif args.algorithm == 'A2C':
    model = A2C.load(model_path)
elif args.algorithm == 'PPO':
    model = PPO.load(model_path)
elif args.algorithm == 'SAC':
    model = SAC.load(model_path)
elif args.algorithm == 'TD3':
    model = TD3.load(model_path)
else:
    raise RuntimeError('Algorithm specified is not registered.')

# ---------------------------------------------------------------------------- #
#                             Execute loaded agent                             #
# ---------------------------------------------------------------------------- #
for i in range(args.episodes):
Example 15
import gym
import numpy as np

from stable_baselines3 import DDPG
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise

env = gym.make('MountainCarContinuous-v0')

# The noise objects for DDPG
n_actions = env.action_space.shape[-1]
action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                 sigma=0.1 * np.ones(n_actions))

model = DDPG('MlpPolicy', env, action_noise=action_noise, verbose=1)
model.learn(total_timesteps=1000000, log_interval=10)
model.save("ddpg_pendulum")
env = model.get_env()

del model  # remove to demonstrate saving and loading

model = DDPG.load("ddpg_pendulum")

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
Example 16
#
input_path = "./Model/DDGP_1.pt"
#
policy_path = "./Model/Policy.pt"
#
# value_func_path = "./Model/Value_Func.pt"
#
# n_actions = env.action_space.shape[-1]
# action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=.75 * np.ones(n_actions))
#
# model = DDPG(MlpPolicy, env, action_noise=action_noise, verbose=1)
#
#
# model.load(input_path)

model = DDPG.load("./Model/DDGP_1.pt")

# torch.save(model, "./Model/Full_Model.pt")

model.policy.save(policy_path)

dict_thing = model.policy.state_dict()

x = torch.randn(2)

print(x.shape)
print(x)

print(model.actor)

# print(dict_thing)
Example 17
#!/usr/bin/env python
import gym
from stable_baselines3 import PPO
from pathlib import Path
from util import Evaluate, plot2, plot3, plot_picture

env = gym.make("reference_environment:reference-environment-v0")

# agent = PPO.load("MODEL_0.zip")
from stable_baselines3 import DDPG

agent = DDPG.load("MODEL_ALPHA_GENERATION.zip")

evaluate = Evaluate(env, agent)
seeds = evaluate.read_seeds(fname="seeds.csv")
# mean_reward = evaluate.RL_agent(seeds) # Add your agent to the Evaluate class and call it here e.g. evaluate.my_agent(seeds)
# mean_reward = evaluate.matching_agent(seeds) # Add your agent to the Evaluate class and call it here e.g. evaluate.my_agent(seeds)
# mean_reward = evaluate.min_agent(seeds) # Add your agent to the Evaluate class and call it here e.g. evaluate.my_agent(seeds)

mean_reward = evaluate.transformed_agent(seeds, H=7, transform="Standard")

# ### Plot the last episode
# plot2(env.state, "fixed_policy")
# plot3(env.state, "fixed_policy.png")
#
#
# ## Plot the last episode
# plot2(env.state, "fixed_policy_h=4")
# plot3(env.state, "fixed_policy_h=4.png")

plot_picture(env.state, "fixed_policy")
Example 18
        model_path = ''
        if 'gs://' in args.model:
            # Download from given bucket (gcloud configured with privileges)
            client = gcloud.init_storage_client()
            bucket_name = args.model.split('/')[2]
            model_path = args.model.split(bucket_name + '/')[-1]
            gcloud.read_from_bucket(client, bucket_name, model_path)
            model_path = './' + model_path
        else:
            model_path = args.model

        model = None
        if args.algorithm == 'DQN':
            model = DQN.load(model_path, tensorboard_log=args.tensorboard)
        elif args.algorithm == 'DDPG':
            model = DDPG.load(model_path, tensorboard_log=args.tensorboard)
        elif args.algorithm == 'A2C':
            model = A2C.load(model_path, tensorboard_log=args.tensorboard)
        elif args.algorithm == 'PPO':
            model = PPO.load(model_path, tensorboard_log=args.tensorboard)
        elif args.algorithm == 'SAC':
            model = SAC.load(model_path, tensorboard_log=args.tensorboard)
        elif args.algorithm == 'TD3':
            model = TD3.load(model_path, tensorboard_log=args.tensorboard)
        else:
            raise RuntimeError('Algorithm specified is not registered.')

        model.set_env(env)

    # ---------------------------------------------------------------------------- #
    #       Calculating total training timesteps based on number of episodes       #
Example 19
client = Client(remote_base)

env_id = "reference-environment-v0"
seed = int(os.getenv("RANGL_SEED", 123456))

instance_id = client.env_create(env_id, seed)

client.env_monitor_start(
    instance_id,
    directory=f"monitor/{instance_id}",
    force=True,
    resume=False,
    video_callable=False,
)

model = DDPG.load("MODEL_ALPHA_GENERATION.zip")

observation = client.env_reset(instance_id)

print(observation)

import numpy as np


def ObservationTransform(obs, H, transform, steps_per_episode=int(96)):
    step_count, generator_1_level, generator_2_level = obs[:3]
    agent_prediction = np.array(obs[3:])

    agent_horizon_prediction = agent_prediction[-1] * np.ones(
        steps_per_episode)
    agent_horizon_prediction[:int(steps_per_episode -
Example 20
    if os.path.isfile(ARGS.exp + '/success_model.zip'):
        path = ARGS.exp + '/success_model.zip'
    elif os.path.isfile(ARGS.exp + '/best_model.zip'):
        path = ARGS.exp + '/best_model.zip'
    else:
        print("[ERROR]: no model under the specified path", ARGS.exp)
    if algo == 'a2c':
        model = A2C.load(path)
    if algo == 'ppo':
        model = PPO.load(path)
    if algo == 'sac':
        model = SAC.load(path)
    if algo == 'td3':
        model = TD3.load(path)
    if algo == 'ddpg':
        model = DDPG.load(path)

    #### Parameters to recreate the environment ################
    env_name = ARGS.exp.split("-")[1] + "-aviary-v0"
    OBS = ObservationType.KIN if ARGS.exp.split(
        "-")[3] == 'kin' else ObservationType.RGB
    if ARGS.exp.split("-")[4] == 'rpm':
        ACT = ActionType.RPM
    elif ARGS.exp.split("-")[4] == 'dyn':
        ACT = ActionType.DYN
    elif ARGS.exp.split("-")[4] == 'pid':
        ACT = ActionType.PID
    elif ARGS.exp.split("-")[4] == 'vel':
        ACT = ActionType.VEL
    elif ARGS.exp.split("-")[4] == 'tun':
        ACT = ActionType.TUN
    train = True

    env = gym.make('gym_spm:spm-v0')

    # The noise objects for DDPG
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=25.67 * np.ones(n_actions))
    model = DDPG(MlpPolicy, env, action_noise=action_noise, verbose=1, tensorboard_log="./DDPG_spm_v2_SOC_point5_two_state/")
    # model = TD3(MlpPolicy, env, action_noise=action_noise, verbose=1, tensorboard_log="./TD3_spm_v2_SOC_point5_two_state/")

    if train:
        model.learn(total_timesteps=2500000, tb_log_name='test_run_3_SOCpoint5_two_state')
        model.save('TD3_test_3_SOC_point5_two_states')
    else:
        model = DDPG.load('TD3_test_2_SOC_point5_two_states', env=env)  # DDPG.load is a classmethod; reassign the result and keep the env

    mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
    
    print("Mean Reward = ", mean_reward)
    
    epsi_sp_list = []
    action_list = []
    soc_list = []
    Concentration_list = []
    Concentration_list1 = []
    
    obs = env.reset()
    for _ in range(3600):
    
        action, _states = model.predict(obs, deterministic=True)