Code Example #1
def test_identity_ddpg():
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)
    """
    env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

    std = 0.2
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(std),
                                         desired_action_stddev=float(std))

    model = DDPG("MlpPolicy",
                 env,
                 gamma=0.0,
                 param_noise=param_noise,
                 memory_limit=int(1e6))
    model.learn(total_timesteps=20000, seed=0)

    n_trials = 1000
    reward_sum = 0
    set_global_seeds(0)
    obs = env.reset()
    for _ in range(n_trials):
        action, _ = model.predict(obs)
        obs, reward, _, _ = env.step(action)
        reward_sum += reward
    assert reward_sum > 0.9 * n_trials
    # Free memory
    del model, env
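
For reference, the imports that this test assumes (a sketch based on the stable-baselines 2.x module layout; exact paths may vary slightly between releases):

from stable_baselines import DDPG
from stable_baselines.common import set_global_seeds
from stable_baselines.common.identity_env import IdentityEnvBox
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.ddpg.noise import AdaptiveParamNoiseSpec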
Code Example #2
def DDPGAgent(multi_stock_env, num_episodes):
    models_folder = 'saved_models'
    rewards_folder = 'saved_rewards'

    env = DummyVecEnv([lambda: multi_stock_env])
    
    # the noise objects for DDPG
    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))
    
    # Hyper parameters
    GAMMA = 0.99
    TAU = 0.001
    BATCH_SIZE = 16
    ACTOR_LEARNING_RATE = 0.0001
    CRITIC_LEARNING_RATE = 0.001
    BUFFER_SIZE = 500

    print("\nRunning DDPG Agent...\n")
    model = DDPG(MlpPolicy, env,
                 gamma=GAMMA, tau=TAU, batch_size=BATCH_SIZE,
                 actor_lr=ACTOR_LEARNING_RATE, critic_lr=CRITIC_LEARNING_RATE,
                 buffer_size=BUFFER_SIZE, verbose=1,
                 param_noise=param_noise, action_noise=action_noise)
    model.learn(total_timesteps=50000)
    model.save(f'{models_folder}/rl/ddpg.h5')

    del model
    
    model = DDPG.load(f'{models_folder}/rl/ddpg.h5')
    obs = env.reset()
    portfolio_value = []

    for e in range(num_episodes):
        # Play one full episode; DummyVecEnv auto-resets on done, so obs
        # already holds the first observation of the next episode afterwards
        done = [False]
        while not done[0]:
            action, _states = model.predict(obs)
            obs, reward, done, info = env.step(action)
        print(f"episode: {e + 1}/{num_episodes}, episode end value: {info[0]['cur_val']:.2f}")
        portfolio_value.append(round(info[0]['cur_val'], 3))

    # save portfolio value for each episode
    np.save(f'{rewards_folder}/rl/ddpg.npy', portfolio_value)

    print("\nDDPG Agent run complete and saved!")

    a = np.load(f'{rewards_folder}/rl/ddpg.npy')

    print(f"\nCumulative Portfolio Value Average reward: {a.mean():.2f}, Min: {a.min():.2f}, Max: {a.max():.2f}")
    plt.plot(a)
    plt.title("Portfolio Value Per Episode (DDPG)")
    plt.ylabel("Portfolio Value")
    plt.xlabel("Episodes")
    plt.show()
Code Example #3
    def __call__(self, trial):
        # Calculate an objective value by using the extra arguments.
        env_id = 'gym_custom:fooCont-v0'
        env = gym.make(env_id, data=self.train_data)
        env = DummyVecEnv([lambda: env])
        algo = trial.suggest_categorical('algo', ['TD3'])
        model = 0
        if algo == 'PPO2':

            policy_choice = trial.suggest_categorical('policy', [False, True])
            policy = commonMlp if policy_choice else commonMlpLstm
            model_params = optimize_ppo2(trial)

            model = PPO2(policy, env, verbose=0, nminibatches=1, **model_params)
            model.learn(276*7000)

        elif algo == 'DDPG':
            policy_choice = trial.suggest_categorical('policy', [False, True])
            policy = ddpgLnMlp
            model_params = sample_ddpg_params(trial)

            model= DDPG(policy, env, verbose=0, **model_params)
            model.learn(276*7000)

        elif algo == 'TD3':
            policy_choice = trial.suggest_categorical('policy', [False, True])
            policy = td3MLP if policy_choice else td3LnMlp
            model_params = sample_td3_params(trial)

            model = TD3(policy, env, verbose=0, **model_params)
            model.learn(276*7000*3)

        rewards = []
        reward_sum = 0.0
        env = gym.make(env_id, data=self.test_data)
        env = DummyVecEnv([lambda: env])

        obs = env.reset()
        for ep in range(1000):
            for step in range(276):
                action, _ = model.predict(obs)
                obs, reward, done, _ = env.step(action)
                reward_sum += reward

                if done:
                    rewards.append(reward_sum)
                    reward_sum = 0.0
                    obs = env.reset()
Code Example #4
def train_identity_ddpg():
    env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])
    std = 0.2

    param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(std), desired_action_stddev=float(std))
    model = DDPG("MlpPolicy", env, gamma=0.0, param_noise=param_noise, memory_limit=int(1e6))
    model.learn(total_timesteps=20000, seed=0)

    n_trials = 1000
    reward_sum = 0
    set_global_seeds(0)
    obs = env.reset()
    for _ in range(n_trials):
        action, _ = model.predict(obs)
        obs, reward, _, _ = env.step(action)
        reward_sum += reward
    assert reward_sum > 0.9 * n_trials

    del model, env
Code Example #5
def run_baseline_ddpg(env_name, train=True):
    import numpy as np
    # from stable_baselines.ddpg.policies import MlpPolicy
    from stable_baselines.common.vec_env import DummyVecEnv
    from stable_baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise
    from stable_baselines import DDPG

    env = gym.make(env_name)
    env = DummyVecEnv([lambda: env])

    if train:
        # mlp
        from stable_baselines.ddpg.policies import FeedForwardPolicy
        class CustomPolicy(FeedForwardPolicy):
            def __init__(self, *args, **kwargs):
                super(CustomPolicy, self).__init__(*args, **kwargs,
                                                layers=[64, 64, 64],
                                                layer_norm=True,
                                                feature_extraction="mlp")

        # the noise objects for DDPG
        n_actions = env.action_space.shape[-1]
        param_noise = None
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions)+0.15, sigma=0.3 * np.ones(n_actions))
        model = DDPG(CustomPolicy, env, verbose=1, param_noise=param_noise, action_noise=action_noise, 
            tau=0.01, observation_range=(env.observation_space.low, env.observation_space.high),
            critic_l2_reg=0, actor_lr=1e-3, critic_lr=1e-3, memory_limit=100000)
        model.learn(total_timesteps=int(1e5))
        model.save("checkpoints/ddpg_" + env_name)

    else:
        model = DDPG.load("checkpoints/ddpg_" + env_name)

        obs = env.reset()
        while True:
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            env.render()
            print("state: ", obs, " reward: ", rewards, " done: ", dones, "info: ", info)

    del model # remove to demonstrate saving and loading
Code Example #6
def optimize_agent(trial):
    """ Train the model and optimise
        Optuna maximises the negative log likelihood, so we
        need to negate the reward here
    """
    model_params = optimize_ddpg(trial)
    seed = trial.suggest_int('numpyseed', 1, 429496729)
    np.random.seed(seed)
    original_env = gym.make('rustyblocks-v0')
    original_env.max_invalid_tries = 3
    env = DummyVecEnv([lambda: original_env])
    model = DDPG("MlpPolicy", env, verbose=0, observation_range=(-126,126), **model_params)
    print("DOING LEARING a2c")
    original_env.force_progression = False
    model.learn(int(2e4*5), seed=seed)
    print("DONE LEARING a2c")
    original_env.max_invalid_tries = -1

    rewards = []
    n_episodes, reward_sum = 0, 0.0

    obs = env.reset()
    original_env.force_progression = True
    original_env.invalid_try_limit = 5000
    while n_episodes < 4:
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        reward_sum += reward

        if done:
            rewards.append(reward_sum)
            reward_sum = 0.0
            n_episodes += 1
            obs = env.reset()

    last_reward = np.mean(rewards)
    trial.report(last_reward)

    return last_reward
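
The optimize_ddpg helper called above is defined elsewhere in the project. A plausible Optuna sampler for DDPG hyperparameters might look like the sketch below; the parameter names match the stable-baselines DDPG constructor, while the ranges are illustrative assumptions:

def optimize_ddpg(trial):
    """Sample DDPG hyperparameters for one Optuna trial (illustrative ranges)."""
    return {
        'gamma': trial.suggest_categorical('gamma', [0.95, 0.99, 0.999]),
        'actor_lr': trial.suggest_loguniform('actor_lr', 1e-5, 1e-3),
        'critic_lr': trial.suggest_loguniform('critic_lr', 1e-5, 1e-3),
        'batch_size': trial.suggest_categorical('batch_size', [64, 128, 256]),
        'tau': trial.suggest_loguniform('tau', 1e-3, 1e-1),
    }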
Code Example #7
def DDPGgive_results(files, balance, shares=None):
    env = create_stock_env(files, train=False, balance=balance, shares=shares)
    max_steps = env.max_steps - env.num_prev
    env = DummyVecEnv([lambda: env])
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=2.0 * np.ones(n_actions))
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=1,
                                         desired_action_stddev=0.1,
                                         adoption_coefficient=1.01)
    model = DDPG(CustomDDPGPolicy,
                 env,
                 verbose=0,
                 param_noise=param_noise,
                 action_noise=action_noise)

    # model = DDPG.load("/home/harshit/Documents/itsp-trade agent/Reinforcement-Learning-Stock-Trader/WebPortal/StockApp/Stock_stable.zip",env=env)
    model.learn(total_timesteps=100)
    profit = 0
    profitst = np.zeros((max_steps - 1, 2))
    actionst = np.zeros((n_actions // 2, max_steps - 1, 2))
    shares = np.zeros((len(files), max_steps - 1, 2))
    obs = env.reset()
    for i in range(max_steps - 1):  # profitst/actionst/shares are sized max_steps - 1
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        actionst[:, i, 1] = -info[0]['action'][0][0:n_actions // 2] + info[0][
            'action'][0][n_actions // 2:]
        actionst[:, i, 0] = i
        shares[:, i, 1] = info[0]['shares_held']
        shares[:, i, 0] = i
        #         print('a',action)
        profit += rewards[0]
        profitst[i] = [i, profit]
        if dones[0]:
            break
    print(info[0]['action'][0])
    print(actionst)
    return profitst.tolist(), shares.tolist(), actionst.tolist()
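
CustomDDPGPolicy is defined elsewhere in this project. A sketch of such a policy, following the FeedForwardPolicy subclassing pattern shown in Code Example #5 (the layer sizes here are assumptions), could look like:

from stable_baselines.ddpg.policies import FeedForwardPolicy

class CustomDDPGPolicy(FeedForwardPolicy):
    def __init__(self, *args, **kwargs):
        # Two hidden layers with layer normalization, so parameter noise behaves well
        super(CustomDDPGPolicy, self).__init__(*args, **kwargs,
                                               layers=[64, 64],
                                               layer_norm=True,
                                               feature_extraction="mlp")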
Code Example #8
def train_decision(config=None,
                   save=False,
                   load=False,
                   calender=None,
                   history=None,
                   predict_results_dict=None,
                   test_mode=False,
                   start_date=None,
                   stop_date=None,
                   episode_steps=1000,
                   model='DDPG'):
    """
    训练决策模型,从数据库读取数据并进行决策训练

    参数:
        config:配置文件, 
        save:保存结果, 
        calender:交易日日历, 
        history:行情信息, 
        all_quotes:拼接之后的行情信息
        predict_results_dict:预测结果信息
    """
    # 首先处理预测数据中字符串日期

    MODEL = model

    predict_dict = {}
    for k, v in predict_results_dict.items():
        assert isinstance(v['predict_date'].iloc[0], str)
        tmp = v['predict_date'].apply(
            lambda x: arrow.get(x, 'YYYY-MM-DD').date())
        predict_dict[k] = v.rename(index=tmp)

    env = Portfolio_Prediction_Env(config=config,
                                   calender=calender,
                                   stock_history=history,
                                   window_len=1,
                                   prediction_history=predict_dict,
                                   start_trade_date=start_date,
                                   stop_trade_date=stop_date,
                                   save=save)

    # Test mode
    if test_mode:
        obs = env.reset()
        # check_env(env)
        for i in range(1000):
            W = np.random.uniform(0.0, 1.0, size=(6, ))
            offer = np.random.uniform(-10.0, 10.0, size=(6, ))
            obs, reward, done, infos = env.step(np.hstack((W, offer)))
            # env.render()
            if done:
                env.save_history()
                break
        env.close()

    # Training mode
    if MODEL == "DDPG":
        # Add exploration noise
        n_actions = env.action_space.shape[-1]
        param_noise = None
        # OU noise, well suited to controlling inertial systems
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                    sigma=float(0.5) *
                                                    np.ones(n_actions))

        model_path = search_file(
            os.path.join(sys.path[0], 'saved_models', MODEL), MODEL)
        if len(model_path) > 0 and load:
            model = DDPG.load(
                model_path[0],
                env=env,
                policy=CustomDDPGPolicy,
                param_noise=param_noise,
                action_noise=action_noise,
                # tensorboard_log='./tb_log',
            )
        else:
            model = DDPG(
                policy=CustomDDPGPolicy,
                env=env,
                verbose=1,
                param_noise=param_noise,
                action_noise=action_noise,
                # tensorboard_log='./tb_log',
            )
        # Number of training timesteps
        model.learn(total_timesteps=episode_steps, )
        model.save(
            os.path.join(sys.path[0], 'saved_models', MODEL, MODEL + '.h5'))

    elif MODEL == 'TD3':
        n_actions = env.action_space.shape[-1]
        # OU noise, well suited to controlling inertial systems
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                    sigma=float(0.5) *
                                                    np.ones(n_actions))

        model_path = search_file(
            os.path.join(sys.path[0], 'saved_models', MODEL), MODEL)
        if len(model_path) > 0 and load:
            model = TD3.load(
                model_path[0],
                env=env,
                policy=CustomTD3Policy,
                action_noise=action_noise,
                # tensorboard_log='./tb_log',
            )
        else:
            model = TD3(
                policy=CustomTD3Policy,
                env=env,
                verbose=1,
                action_noise=action_noise,
                # tensorboard_log='./tb_log',
            )
        # Number of training timesteps
        model.learn(total_timesteps=episode_steps, )
        model.save(
            os.path.join(sys.path[0], 'saved_models', MODEL, MODEL + '.h5'))

    elif MODEL == "HER":
        """
        env必须是GoalEnv
        """
        model_class = DDPG

        # Available strategies (cf paper): future, final, episode, random
        goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

        # Wrap the model
        model = HER(policy=CustomDDPGPolicy,
                    env=env,
                    model_class=model_class,
                    n_sampled_goal=4,
                    goal_selection_strategy=goal_selection_strategy,
                    verbose=1)
        model.learn(total_timesteps=episode_steps, )
        model.save(
            os.path.join(sys.path[0], 'saved_models', MODEL, MODEL + '.h5'))

    obs = env.reset()
    # Live evaluation mode
    for i in range(1000):
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        # env.render(info=info)
        if done:
            if save:
                env.save_history()
            env.reset()
            break

    env.close()
Code Example #9
model = DDPG(MlpPolicy,
             env,
             verbose=1,
             param_noise=None,
             action_noise=action_noise)
# Train the model
model.learn(1000)

model.save("./hideandseek")

# Reload the trained model; since the model above is a plain DDPG
# (not wrapped with HER), load it with DDPG.load and pass the env
model = DDPG.load('./hideandseek', env=env)

obs = env.reset()
for _ in range(100):
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)

    if done:
        obs = env.reset()
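
If HER had actually been intended here, the model would have to be created as a HER wrapper around DDPG in the first place, and env would have to be a GoalEnv (or be wrapped with HERGoalEnvWrapper). A sketch, following the HER usage shown in Code Example #8:

from stable_baselines import DDPG, HER

# Assumes env is a GoalEnv-compatible environment
model = HER('MlpPolicy', env, model_class=DDPG,
            n_sampled_goal=4, goal_selection_strategy='future', verbose=1)
model.learn(1000)
model.save("./hideandseek")
model = HER.load('./hideandseek', env=env)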

Code Example #10
File: DDPG_baselines.py  Project: WHY-David/repo
#        The replay buffer is used to store experience, because DDPG is an off-policy algorithm.
#        A target network is used when minimizing the MSBE loss.
#        A target policy network computes an action which approximately maximizes Q_{\phi_{\text{targ}}}.
#        An Ornstein-Uhlenbeck process is applied to add exploration noise during training so that DDPG policies explore better.

model = DDPG(MlpPolicy,
             env,
             verbose=1,
             tau=tau,
             gamma=gamma,
             batch_size=batch_size,
             actor_lr=alr,
             critic_lr=clr,
             param_noise=param_noise,
             action_noise=action_noise)

if __name__ == '__main__':
    # train
    model.learn(total_timesteps=10000)
    model.save("DDPG_baselines")

    # play
    env = OsmoEnv()
    for i in range(10):
        observation = env.reset()
        done = False
        while not done:
            action, _ = model.predict(observation)
            observation, reward, done, info = env.step(action)
            # print(reward)
        print(info)
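
The snippet above assumes that env, the hyperparameters (tau, gamma, batch_size, alr, clr) and the noise objects are defined earlier in DDPG_baselines.py. A sketch of those definitions, with illustrative values:

import numpy as np

from stable_baselines import DDPG
from stable_baselines.ddpg.policies import MlpPolicy
from stable_baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise

env = OsmoEnv()                            # the project's custom environment, as in the play loop
n_actions = env.action_space.shape[-1]

tau, gamma, batch_size = 0.001, 0.99, 64   # illustrative hyperparameter values
alr, clr = 1e-4, 1e-3                      # actor / critic learning rates

param_noise = None
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                            sigma=0.2 * np.ones(n_actions))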
Code Example #11
del model_3
del model_4

print(
    "*************************************\n        Model 1 Result        \n*************************************"
)

model = DDPG.load("ddpg_copter")
n_episode = 10
episode_reward = np.zeros(n_episode)
for i in range(n_episode):
    obs = env.reset()
    sum_reward = 0
    while True:

        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        sum_reward += rewards
        if dones:
            break
        env.render()
    episode_reward[i] = sum_reward

model_1_result = np.mean(episode_reward)

print(
    "*************************************\n        Model 2 Result        \n*************************************"
)

model_2 = DDPG.load("ddpg_copter_2")
n_episode = 10
Code Example #12
import random
import numpy as np

import gym
from stable_baselines.ddpg.policies import MlpPolicy
from stable_baselines import DDPG

env = gym.make('JetBot-v0')

model = DDPG(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=100)
model.save('ddpg_jetbot')

#model = DDPG.load('ddpg_jetbot')

episodes = 50
env.reset()

for episode in range(episodes):
    observation = env.reset()
    score = 0
    done = False
    while not done:
        action, _states = model.predict(observation, deterministic=True)
        observation, reward, done, info = env.step(action)
        print('obs=', observation, ' | reward=', reward, ' | done=', done)
        score += reward
        if done:
            GPIO.cleanup()
            print("Episode ", episode + 1, "/", episodes,
                  " finished with a score of: ", score)
            break
Code Example #13
File: ddpg_test.py  Project: Deastan/rl
def main():
    # Params
    global train, predict, log_dir, total_timesteps

    # config = tf.ConfigProto()
    # config.gpu_options.allow_growth = True
    # config = tf.ConfigProto(allow_soft_placement=True)
    # sess = tf.Session(tf.ConfigProto(allow_soft_placement=True))
    # sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=True))

    env = gym.make('MountainCarContinuous-v0')
    # env = DummyVecEnv([lambda: env])
    # Create and wrap the environment
    # env = gym.make('MountainCarContinuous-v0')
    # env = gym.make('FetchReach-v0')
    # env = DummyVecEnv([lambda: env])
    # Create and wrap the environment
    # env = gym.make('LunarLanderContinuous-v2')

    # env = Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    # Logs will be saved in log_dir/monitor.csv
    # env = Monitor(env, log_dir, allow_early_resets=True)

    if train == True:
        # the noise objects for DDPG
        n_actions = env.action_space.shape[-1]
        # print(n_actions)
        param_noise = None
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                    sigma=float(0.5) *
                                                    np.ones(n_actions))

        # tensorboard_log="/tmp/ddpg_MountainCarContinious_tensorboard/",
        model = DDPG(MlpPolicy,
                     env,
                     verbose=1,
                     param_noise=param_noise,
                     action_noise=action_noise,
                     render=True)
        # model = DDPG(MlpPolicy, env, verbose=1)
        # model.learn(total_timesteps=total_timesteps, callback=callback)
        model.learn(total_timesteps=total_timesteps)
        # model = PPO2(MlpPolicy, env, verbose=1)
        # model.learn(total_timesteps=total_timesteps)
        model.save("MountainCarContinuous")
        # model.save("ddpg_mountain")
        # model.save("ppo2_mountain")

        # Show that we have finished learning
        print("Finished learning")

        # plot_results(log_dir)
        del model  # remove to demonstrate saving and loading
    # End if

    if predict == True:
        # model = DDPG.load("ddpg_mountain_40000") # Best one!
        model = DDPG.load("MountainCarContinuous")
        # model = DDPG.load("ddpg_mountain")
        # model = PPO2.load("ppo2_mountain")

        obs = env.reset()
        while True:
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            env.render()
Code Example #14
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--generate_pretrain",
        type=int,
        default=0,
        help="If true, launch an interface to generate an expert trajectory")

    parser.add_argument(
        "--train",
        type=int,
        default=1,
        help="True: training, False: using a trained model")

    parser.add_argument(
        "--algo",
        type=str,
        default="ppo2",
        help="The learning algorithm to be used (ppo2 or ddpg)")

    parser.add_argument(
        "--model",
        type=str,
        default="",
        help="The version name of the model")

    parser.add_argument(
        "--gui",
        type=int,
        default=1,
        help="Wether the GUI of the simulation should be used or not. 0 or 1")

    args = parser.parse_args()
    algo = args.algo.lower()

    try:
        assert args.gui == 0 or args.gui == 1
        assert algo == "ppo2" or algo == "ddpg"

    except AssertionError as e:
        print(str(e))
        return

    env = RobotEnv(gui=args.gui)
    vec_env = DummyVecEnv([lambda: env])

    # Generate an expert trajectory
    if args.generate_pretrain:
        pass
    
    # Train a model
    elif args.train == 1:
        while True:
            req = Request(
                "https://frightanic.com/goodies_content/docker-names.php",
                headers={'User-Agent': 'Mozilla/5.0'})

            webpage = str(urlopen(req).read())
            word = webpage.split("b\'")[1]
            word = word.split("\\")[0]
            word.replace(" ", "_")

            try:
                assert os.path.isfile(
                    "models/" + algo + "_throw_" + word + ".pkl")

            except AssertionError:
                break

        log_name = "./logs/throw/" + word

        if algo == "ppo2":
            # For recurrent policies, nminibatches should be a multiple of the 
            # nb of env used in parallel (so for LSTM, 1)
            model = PPO2(
                MlpLstmPolicy,
                vec_env,
                nminibatches=1,
                verbose=0,
                tensorboard_log=log_name)

        elif algo == "ddpg":
            action_noise = OrnsteinUhlenbeckActionNoise(
                mean=np.zeros(env.action_space.shape[-1]),
                sigma=float(0.5) * np.ones(env.action_space.shape[-1]))

            model = DDPG(
                stable_baselines.ddpg.LnMlpPolicy,
                env,
                verbose=0,
                param_noise=None,
                action_noise=action_noise,
                tensorboard_log=log_name)

        try:
            model.learn(total_timesteps=1000000)
        
        except KeyboardInterrupt:
            print("#---------------------------------#")
            print("Training \'" + word + "\' interrupted")
            print("#---------------------------------#")
            sys.exit(1)

        model.save("models/" + algo + "_throw_" + word)

    # Use a trained model
    else:
        if args.model == "":
            print("Specify the version of the model using --model")
            return

        if algo == "ppo2":
            model = PPO2.load("models/" + algo + "_throw_" + args.model)
        elif algo == "ddpg":
            model = DDPG.load("models/" + algo + "_throw_" + args.model)

        for test in range(10):
            dones = False
            obs = env.reset()

            while not dones:
                action, _states = model.predict(obs)
                obs, rewards, dones, info = env.step(action)

    time.sleep(2)
    env._termination()
Code Example #15
        path = 'models/' + model_name + '.pkl'
    powermodel.save('models/' + model_name)
    with open('models/' + model_name + '_params.p', 'wb') as f:
        pickle.dump(env.params, f)


model_name = '800k_full'
path = 'models/' + model_name + '.pkl'
i = 2
while os.path.isfile(path):
    model_name += '_' + str(i)
    i += 1
    path = 'models/' + model_name + '.pkl'
powermodel.save('models/' + model_name)
with open('models/' + model_name + '_params.p', 'wb') as f:
    pickle.dump(env.params, f)

for i in range(100):
    action, _ = powermodel.predict(obs)
    obs, rewards, dones, info = powerenv.step(action)
    line = {}
    for j, act in enumerate(action[0]):
        line[j] = act
    data.append(line)

df = pd.DataFrame(data)
df['demand'] = env.get_episode_demand_forecast()[0][:100]
df['sol'] = env.get_episode_solar_forecast()[:100]
df.loc[:, ['demand', 'sol', 3]].plot()
plt.show()
Code Example #16
env = Monitor(env, log_dir, allow_early_resets=True)
# env = SubprocVecEnv([make_mujoco_env(env_id, i) for i in range(num_cpu)])
# env = SubprocVecEnv([lambda: env])
env = DummyVecEnv([lambda: env])

# env = SubprocVecEnv([lambda: gym.make('UR5Gripper-v0') for i in range(num_cpu)])

# Add some param noise for exploration
param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.1,
                                     desired_action_stddev=0.1)
# Because we use parameter noise, we should use a MlpPolicy with layer normalization
model = DDPG(MlpPolicy,
             env,
             param_noise=param_noise,
             verbose=1,
             tensorboard_log=log_dir)
# model = PPO2(MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
# model = SAC(MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
# Random Agent, before training
mean_reward_before_train = evaluate(model, num_steps=1000)

# Train the agent
model.learn(total_timesteps=int(1e7), callback=callback)

mean_reward_after_train = evaluate(model, num_steps=1000)

obs = env.reset()
for _ in range(1000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
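
As the comment above notes, parameter noise works best with a layer-normalized policy; in stable-baselines 2.x that is LnMlpPolicy rather than the plain MlpPolicy. A sketch of the corresponding model construction, reusing the env, param_noise and log_dir defined above:

from stable_baselines import DDPG
from stable_baselines.ddpg.policies import LnMlpPolicy

model = DDPG(LnMlpPolicy,                  # layer-normalized MLP policy for parameter noise
             env,
             param_noise=param_noise,
             verbose=1,
             tensorboard_log=log_dir)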
Code Example #17
sc = []
rew = []
done = []

rc2 = []
sc2 = []
rew2 = []
done2 = []

dones = False
i = 0

while dones == False:
    i = i + 1
    x.append(i)
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    #obs=np.array(obs)
    print(obs)
    rc.append(obs[0:2])
    A = np.array([[obs[2], obs[3]], [obs[4], obs[5]]])
    rew.append(rewards)
    done.append(dones)
    print(dones, info)
    #print(type(obs),obs.shape)
    sc.append(A)
    #env.render(mode='human')

    action2, _states2 = model.predict(obs)
    obs2, rewards2, dones2, info2 = env.step_agent()
    #obs=np.array(obs)