def test_identity_ddpg():
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)
    """
    env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

    std = 0.2
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(std),
                                         desired_action_stddev=float(std))

    model = DDPG("MlpPolicy", env, gamma=0.0, param_noise=param_noise, memory_limit=int(1e6))
    model.learn(total_timesteps=20000, seed=0)

    n_trials = 1000
    reward_sum = 0
    set_global_seeds(0)
    obs = env.reset()
    for _ in range(n_trials):
        action, _ = model.predict(obs)
        obs, reward, _, _ = env.step(action)
        reward_sum += reward

    assert reward_sum > 0.9 * n_trials
    # Free memory
    del model, env
def DDPGAgent(multi_stock_env, num_episodes):
    models_folder = 'saved_models'
    rewards_folder = 'saved_rewards'

    env = DummyVecEnv([lambda: multi_stock_env])

    # the noise objects for DDPG
    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) * np.ones(n_actions))

    # Hyperparameters
    GAMMA = 0.99
    TAU = 0.001
    BATCH_SIZE = 16
    ACTOR_LEARNING_RATE = 0.0001
    CRITIC_LEARNING_RATE = 0.001
    BUFFER_SIZE = 500

    print("\nRunning DDPG Agent...\n")

    model = DDPG(MlpPolicy, env, gamma=GAMMA, tau=TAU, batch_size=BATCH_SIZE,
                 actor_lr=ACTOR_LEARNING_RATE, critic_lr=CRITIC_LEARNING_RATE,
                 buffer_size=BUFFER_SIZE, verbose=1,
                 param_noise=param_noise, action_noise=action_noise)
    model.learn(total_timesteps=50000)
    model.save(f'{models_folder}/rl/ddpg.h5')

    del model

    model = DDPG.load(f'{models_folder}/rl/ddpg.h5')

    obs = env.reset()
    portfolio_value = []

    for e in range(num_episodes):
        action, _states = model.predict(obs)
        next_state, reward, done, info = env.step(action)
        print(f"episode: {e + 1}/{num_episodes}, episode end value: {info[0]['cur_val']:.2f}")
        portfolio_value.append(round(info[0]['cur_val'], 3))

    # save portfolio value for each episode
    np.save(f'{rewards_folder}/rl/ddpg.npy', portfolio_value)

    print("\nDDPG Agent run complete and saved!")

    a = np.load(f'./saved_rewards/rl/ddpg.npy')
    print(f"\nCumulative Portfolio Value Average reward: {a.mean():.2f}, "
          f"Min: {a.min():.2f}, Max: {a.max():.2f}")
    plt.plot(a)
    plt.title("Portfolio Value Per Episode (DDPG)")
    plt.ylabel("Portfolio Value")
    plt.xlabel("Episodes")
    plt.show()
def __call__(self, trial):
    # Calculate an objective value by using the extra arguments.
    env_id = 'gym_custom:fooCont-v0'
    env = gym.make(env_id, data=self.train_data)
    env = DummyVecEnv([lambda: env])

    algo = trial.suggest_categorical('algo', ['TD3'])
    model = None

    if algo == 'PPO2':
        policy_choice = trial.suggest_categorical('policy', [False, True])
        policy = commonMlp if policy_choice else commonMlpLstm
        model_params = optimize_ppo2(trial)
        model = PPO2(policy, env, verbose=0, nminibatches=1, **model_params)
        model.learn(276 * 7000)
    elif algo == 'DDPG':
        policy_choice = trial.suggest_categorical('policy', [False, True])
        policy = ddpgLnMlp
        model_params = sample_ddpg_params(trial)
        model = DDPG(policy, env, verbose=0, **model_params)
        model.learn(276 * 7000)
    elif algo == 'TD3':
        policy_choice = trial.suggest_categorical('policy', [False, True])
        policy = td3MLP if policy_choice else td3LnMlp
        model_params = sample_td3_params(trial)
        model = TD3(policy, env, verbose=0, **model_params)
        model.learn(276 * 7000 * 3)

    rewards = []
    reward_sum = 0.0

    env = gym.make(env_id, data=self.test_data)
    env = DummyVecEnv([lambda: env])
    obs = env.reset()
    for ep in range(1000):
        for step in range(276):
            action, _ = model.predict(obs)
            obs, reward, done, _ = env.step(action)
            reward_sum += reward
            if done:
                rewards.append(reward_sum)
                reward_sum = 0.0
                obs = env.reset()
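# The objective above relies on project-specific samplers such as sample_td3_params(trial)
# and sample_ddpg_params(trial) that are defined elsewhere. A minimal sketch of what such a
# sampler could look like with the (older) Optuna trial API; the parameter names match
# stable-baselines' TD3 keyword arguments, but the ranges are illustrative assumptions:
def sample_td3_params(trial):
    """Sample TD3 hyperparameters for one Optuna trial (illustrative sketch)."""
    return {
        'gamma': trial.suggest_categorical('gamma', [0.95, 0.99, 0.999]),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-3),
        'batch_size': trial.suggest_categorical('batch_size', [64, 128, 256]),
        'buffer_size': trial.suggest_categorical('buffer_size', [int(1e4), int(1e5), int(1e6)]),
        'tau': trial.suggest_categorical('tau', [0.001, 0.005, 0.01]),
    }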
def train_identity_ddpg():
    env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

    std = 0.2
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(std),
                                         desired_action_stddev=float(std))

    model = DDPG("MlpPolicy", env, gamma=0.0, param_noise=param_noise, memory_limit=int(1e6))
    model.learn(total_timesteps=20000, seed=0)

    n_trials = 1000
    reward_sum = 0
    set_global_seeds(0)
    obs = env.reset()
    for _ in range(n_trials):
        action, _ = model.predict(obs)
        obs, reward, _, _ = env.step(action)
        reward_sum += reward

    assert reward_sum > 0.9 * n_trials
    del model, env
def run_baseline_ddpg(env_name, train=True):
    import numpy as np
    # from stable_baselines.ddpg.policies import MlpPolicy
    from stable_baselines.common.vec_env import DummyVecEnv
    from stable_baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise
    from stable_baselines import DDPG

    env = gym.make(env_name)
    env = DummyVecEnv([lambda: env])

    if train:
        # mlp
        from stable_baselines.ddpg.policies import FeedForwardPolicy

        class CustomPolicy(FeedForwardPolicy):
            def __init__(self, *args, **kwargs):
                super(CustomPolicy, self).__init__(*args, **kwargs,
                                                   layers=[64, 64, 64],
                                                   layer_norm=True,
                                                   feature_extraction="mlp")

        # the noise objects for DDPG
        n_actions = env.action_space.shape[-1]
        param_noise = None
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions) + 0.15,
                                                    sigma=0.3 * np.ones(n_actions))

        model = DDPG(CustomPolicy, env, verbose=1,
                     param_noise=param_noise, action_noise=action_noise,
                     tau=0.01,
                     observation_range=(env.observation_space.low, env.observation_space.high),
                     critic_l2_reg=0, actor_lr=1e-3, critic_lr=1e-3,
                     memory_limit=100000)
        model.learn(total_timesteps=int(1e5))  # total_timesteps must be an int
        model.save("checkpoints/ddpg_" + env_name)
    else:
        model = DDPG.load("checkpoints/ddpg_" + env_name)

    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
        print("state: ", obs, " reward: ", rewards, " done: ", dones, "info: ", info)

    del model  # remove to demonstrate saving and loading
def optimize_agent(trial):
    """
    Train the model and optimise.
    Optuna maximises the negative log likelihood, so we need to negate the reward here.
    """
    model_params = optimize_ddpg(trial)
    seed = trial.suggest_int('numpyseed', 1, 429496729)
    np.random.seed(seed)

    original_env = gym.make('rustyblocks-v0')
    original_env.max_invalid_tries = 3
    env = DummyVecEnv([lambda: original_env])
    model = DDPG("MlpPolicy", env, verbose=0, observation_range=(-126, 126), **model_params)

    print("STARTING DDPG LEARNING")
    original_env.force_progression = False
    model.learn(int(2e4 * 5), seed=seed)
    print("DONE DDPG LEARNING")
    original_env.max_invalid_tries = -1

    rewards = []
    n_episodes, reward_sum = 0, 0.0

    obs = env.reset()
    original_env.force_progression = True
    original_env.invalid_try_limit = 5000
    while n_episodes < 4:
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        reward_sum += reward

        if done:
            rewards.append(reward_sum)
            reward_sum = 0.0
            n_episodes += 1
            obs = env.reset()

    last_reward = np.mean(rewards)
    trial.report(last_reward)

    return last_reward
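# optimize_ddpg(trial) above is a project-specific helper that is not shown in this snippet.
# A minimal sketch of a DDPG hyperparameter sampler under that assumption; the keys match
# keyword arguments accepted by stable-baselines' DDPG, while the ranges are illustrative:
def optimize_ddpg(trial):
    """Sample DDPG hyperparameters for one Optuna trial (illustrative sketch)."""
    return {
        'gamma': trial.suggest_categorical('gamma', [0.9, 0.99, 0.999]),
        'tau': trial.suggest_loguniform('tau', 1e-3, 1e-1),
        'batch_size': trial.suggest_categorical('batch_size', [32, 64, 128]),
        'actor_lr': trial.suggest_loguniform('actor_lr', 1e-5, 1e-3),
        'critic_lr': trial.suggest_loguniform('critic_lr', 1e-5, 1e-3),
    }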
def DDPGgive_results(files, balance, shares=None):
    env = create_stock_env(files, train=False, balance=balance, shares=shares)
    max_steps = env.max_steps - env.num_prev
    env = DummyVecEnv([lambda: env])

    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(0, 2)
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=1,
                                         desired_action_stddev=0.1,
                                         adoption_coefficient=1.01)
    model = DDPG(CustomDDPGPolicy, env, verbose=0,
                 param_noise=param_noise, action_noise=action_noise)
    # model = DDPG.load("/home/harshit/Documents/itsp-trade agent/Reinforcement-Learning-Stock-Trader/WebPortal/StockApp/Stock_stable.zip", env=env)
    model.learn(total_timesteps=100)

    profit = 0
    profitst = np.zeros((max_steps - 1, 2))
    actionst = np.zeros((n_actions // 2, max_steps - 1, 2))
    shares = np.zeros((len(files), max_steps - 1, 2))
    obs = env.reset()
    for i in range(max_steps - 1):  # result arrays are sized max_steps - 1
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        actionst[:, i, 1] = (-info[0]['action'][0][0:n_actions // 2]
                             + info[0]['action'][0][n_actions // 2:])
        actionst[:, i, 0] = i
        shares[:, i, 1] = info[0]['shares_held']
        shares[:, i, 0] = i
        # print('a', action)
        profit += rewards
        profitst[i] = [i, profit]
        if dones:
            break

    print(info[0]['action'][0])
    print(actionst)
    return profitst.tolist(), shares.tolist(), actionst.tolist()
def train_decision(config=None, save=False, load=False, calender=None,
                   history=None, predict_results_dict=None, test_mode=False,
                   start_date=None, stop_date=None, episode_steps=1000,
                   model='DDPG'):
    """
    Train the decision model: read data from the database and run decision training.

    Parameters:
        config: configuration file
        save: whether to save results
        calender: trading-day calendar
        history: market-quote history
        all_quotes: concatenated quote data
        predict_results_dict: prediction results
    """
    # First convert the string dates in the prediction data
    MODEL = model
    predict_dict = {}
    for k, v in predict_results_dict.items():
        assert isinstance(v['predict_date'].iloc[0], str)
        tmp = v['predict_date'].apply(
            lambda x: arrow.get(x, 'YYYY-MM-DD').date())
        predict_dict[k] = v.rename(index=tmp)

    env = Portfolio_Prediction_Env(config=config,
                                   calender=calender,
                                   stock_history=history,
                                   window_len=1,
                                   prediction_history=predict_dict,
                                   start_trade_date=start_date,
                                   stop_trade_date=stop_date,
                                   save=save)

    # Test mode
    if test_mode:
        obs = env.reset()
        # check_env(env)
        for i in range(1000):
            W = np.random.uniform(0.0, 1.0, size=(6, ))
            offer = np.random.uniform(-10.0, 10.0, size=(6, ))
            obs, reward, done, infos = env.step(np.hstack((W, offer)))
            # env.render()
            if done:
                env.save_history()
                break
        env.close()

    # Training mode
    if MODEL == "DDPG":
        # Add exploration noise
        n_actions = env.action_space.shape
        param_noise = None
        # OU noise, well suited to controlling inertial systems
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                    sigma=float(0.5) * np.ones(n_actions))
        model_path = search_file(
            os.path.join(sys.path[0], 'saved_models', MODEL), MODEL)
        if len(model_path) > 0 and load:
            model = DDPG.load(
                model_path[0],
                env=env,
                policy=CustomDDPGPolicy,
                param_noise=param_noise,
                action_noise=action_noise,
                # tensorboard_log='./tb_log',
            )
        else:
            model = DDPG(
                policy=CustomDDPGPolicy,
                env=env,
                verbose=1,
                param_noise=param_noise,
                action_noise=action_noise,
                # tensorboard_log='./tb_log',
            )
        # Number of training timesteps
        model.learn(total_timesteps=episode_steps)
        model.save(
            os.path.join(sys.path[0], 'saved_models', MODEL, MODEL + '.h5'))
    elif MODEL == 'TD3':
        n_actions = env.action_space.shape[-1]
        # OU noise, well suited to controlling inertial systems
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                    sigma=float(0.5) * np.ones(n_actions))
        model_path = search_file(
            os.path.join(sys.path[0], 'saved_models', MODEL), MODEL)
        if len(model_path) > 0 and load:
            model = TD3.load(
                model_path[0],
                env=env,
                policy=CustomTD3Policy,
                action_noise=action_noise,
                # tensorboard_log='./tb_log',
            )
        else:
            model = TD3(
                policy=CustomTD3Policy,
                env=env,
                verbose=1,
                action_noise=action_noise,
                # tensorboard_log='./tb_log',
            )
        # Number of training timesteps
        model.learn(total_timesteps=episode_steps)
        model.save(
            os.path.join(sys.path[0], 'saved_models', MODEL, MODEL + '.h5'))
    elif MODEL == "HER":
        # The environment must be a GoalEnv
        model_class = DDPG
        # Available strategies (cf paper): future, final, episode, random
        goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE
        # Wrap the model
        model = HER(policy=CustomDDPGPolicy,
                    env=env,
                    model_class=model_class,
                    n_sampled_goal=4,
                    goal_selection_strategy=goal_selection_strategy,
                    verbose=1)
        model.learn(total_timesteps=episode_steps)
        model.save(
            os.path.join(sys.path[0], 'saved_models', MODEL, MODEL + '.h5'))

    # Evaluation run
    obs = env.reset()
    for i in range(1000):
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        # env.render(info=info)
        if done:
            if save:
                env.save_history()
            env.reset()
            break
    env.close()
model = DDPG(MlpPolicy, env, verbose=1, param_noise=None, action_noise=action_noise)

# Train the model
model.learn(1000)
model.save("./hideandseek")

# WARNING: you must pass an env
# or wrap your environment with HERGoalEnvWrapper to use the predict method
model = HER.load('./hideandseek', env=env)

obs = env.reset()
for _ in range(100):
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)

    if done:
        obs = env.reset()

# print(main.__doc__)
# if __name__ == '__main__':
#     logging.getLogger('').handlers = []
#     logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
#     main()
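# The WARNING above comes from the stable-baselines HER documentation: a loaded HER model can
# only convert dict observations if it has an env attached. A minimal sketch of the alternative
# the warning mentions, assuming stable-baselines v2 and a goal-based env such as gym's
# 'FetchReach-v1' (the env id and the saved-model path are placeholders, not from this script):
import gym
from stable_baselines import HER
from stable_baselines.her import HERGoalEnvWrapper

# Wrapping the GoalEnv flattens its dict observations, so predict() can be used
# even when the loaded model has no env attached.
wrapped_env = HERGoalEnvWrapper(gym.make('FetchReach-v1'))

her_model = HER.load('./hideandseek')   # no env passed here
obs = wrapped_env.reset()
action, _ = her_model.predict(obs)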
# The replay buffer is used to store experience, because DDPG is an off-policy algorithm.
# A target network is designed to minimize the MSBE loss.
# A target policy network is used to compute an action that approximately maximizes Q_{\phi_{\text{targ}}}.
# An Ornstein-Uhlenbeck process adds exploration noise during training so DDPG policies explore better.
model = DDPG(MlpPolicy, env, verbose=1, tau=tau, gamma=gamma, batch_size=batch_size,
             actor_lr=alr, critic_lr=clr,
             param_noise=param_noise, action_noise=action_noise)

if __name__ == '__main__':
    # train
    model.learn(total_timesteps=10000)
    model.save("DDPG_baselines")

    # play
    env = OsmoEnv()
    for i in range(10):
        observation = env.reset()
        done = False
        while not done:
            action, _ = model.predict(observation)
            observation, reward, done, info = env.step(action)
            # print(reward)
        print(info)
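# The snippet above assumes that env, the noise objects, and the hyperparameters
# (tau, gamma, batch_size, alr, clr) were defined earlier in the original script.
# A minimal sketch of that setup with stable-baselines v2; the import path for OsmoEnv
# is hypothetical and all values below are illustrative assumptions, not the author's:
import numpy as np
from stable_baselines import DDPG
from stable_baselines.ddpg.policies import MlpPolicy
from stable_baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise
from osmo_env import OsmoEnv  # hypothetical import; OsmoEnv is the project's custom env

env = OsmoEnv()
n_actions = env.action_space.shape[-1]

param_noise = None  # no parameter-space noise
action_noise = OrnsteinUhlenbeckActionNoise(  # OU exploration noise on the actions
    mean=np.zeros(n_actions),
    sigma=0.2 * np.ones(n_actions))

tau = 0.001         # soft target-network update coefficient
gamma = 0.99        # discount factor
batch_size = 64     # replay-buffer minibatch size
alr = 1e-4          # actor learning rate
clr = 1e-3          # critic learning rate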
del model_3
del model_4

print("*************************************\n Model 1 Result \n*************************************")

model = DDPG.load("ddpg_copter")
n_episode = 10
episode_reward = np.zeros(n_episode)

for i in range(n_episode):
    obs = env.reset()
    sum_reward = 0
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        sum_reward += rewards
        if dones:
            break
        env.render()
    episode_reward[i] = sum_reward

model_1_result = np.mean(episode_reward)

print("*************************************\n Model 2 Result \n*************************************")

model_2 = DDPG.load("ddpg_copter_2")
n_episode = 10
import random

import gym
from stable_baselines.ddpg.policies import MlpPolicy
from stable_baselines import DDPG

env = gym.make('JetBot-v0')

model = DDPG(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=100)
model.save('ddpg_jetbot')
# model = DDPG.load('ddpg_jetbot')

episodes = 50
env.reset()

for episode in range(episodes):
    observation = env.reset()
    score = 0
    done = False
    while not done:
        # predict() returns (action, state); only the action is passed to the env
        action, _states = model.predict(observation, deterministic=True)
        observation, reward, done, info = env.step(action)
        print('obs=', observation, ' | reward=', reward, ' | done=', done)
        score += reward
        if done:
            # GPIO is assumed to be imported elsewhere in the original script (e.g. Jetson.GPIO)
            GPIO.cleanup()
            print("Episode ", episode + 1, "/", episodes,
                  " finished with a score of: ", score)
            break
def main():
    # Params
    global train, predict, log_dir, total_timesteps

    # config = tf.ConfigProto()
    # config.gpu_options.allow_growth = True
    # config = tf.ConfigProto(allow_soft_placement=True)
    # sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=True))

    # Create and wrap the environment
    env = gym.make('MountainCarContinuous-v0')
    # env = gym.make('FetchReach-v0')
    # env = gym.make('LunarLanderContinuous-v2')
    # env = Monitor(env, log_dir, allow_early_resets=True)  # Logs will be saved in log_dir/monitor.csv
    env = DummyVecEnv([lambda: env])

    if train == True:
        # the noise objects for DDPG
        n_actions = env.action_space.shape[-1]
        # print(n_actions)
        param_noise = None
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                    sigma=float(0.5) * np.ones(n_actions))

        # tensorboard_log="/tmp/ddpg_MountainCarContinious_tensorboard/",
        model = DDPG(MlpPolicy, env, verbose=1, param_noise=param_noise,
                     action_noise=action_noise, render=True)
        # model = DDPG(MlpPolicy, env, verbose=1)
        # model.learn(total_timesteps=total_timesteps, callback=callback)
        model.learn(total_timesteps=total_timesteps)
        # model = PPO2(MlpPolicy, env, verbose=1)
        # model.learn(total_timesteps=total_timesteps)

        model.save("MountainCarContinuous")
        # model.save("ddpg_mountain")
        # model.save("ppo2_mountain")

        # Show that we have finished with the learning
        print("Finished with learning")
        # plot_results(log_dir)

        del model  # remove to demonstrate saving and loading

    if predict == True:
        # model = DDPG.load("ddpg_mountain_40000")  # Best one!
        model = DDPG.load("MountainCarContinuous")
        # model = DDPG.load("ddpg_mountain")
        # model = PPO2.load("ppo2_mountain")

        obs = env.reset()
        while True:
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            env.render()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--generate_pretrain",
        type=int,
        default=0,
        help="If true, launch an interface to generate an expert trajectory")
    parser.add_argument(
        "--train",
        type=int,
        default=1,
        help="True: training, False: using a trained model")
    parser.add_argument(
        "--algo",
        type=str,
        default="ppo2",
        help="The learning algorithm to be used (ppo2 or ddpg)")
    parser.add_argument(
        "--model",
        type=str,
        default="",
        help="The version name of the model")
    parser.add_argument(
        "--gui",
        type=int,
        default=1,
        help="Whether the GUI of the simulation should be used or not. 0 or 1")

    args = parser.parse_args()
    algo = args.algo.lower()

    try:
        assert args.gui == 0 or args.gui == 1
        assert algo == "ppo2" or algo == "ddpg"
    except AssertionError as e:
        print(str(e))
        return

    env = RobotEnv(gui=args.gui)
    vec_env = DummyVecEnv([lambda: env])

    # Generate an expert trajectory
    if args.generate_pretrain:
        pass

    # Train a model
    elif args.train == 1:
        # Pick an unused random code name from the docker-names generator
        while True:
            req = Request(
                "https://frightanic.com/goodies_content/docker-names.php",
                headers={'User-Agent': 'Mozilla/5.0'})
            webpage = str(urlopen(req).read())
            word = webpage.split("b\'")[1]
            word = word.split("\\")[0]
            word = word.replace(" ", "_")

            try:
                assert os.path.isfile("models/" + algo + "_throw_" + word + ".pkl")
            except AssertionError:
                break

        log_name = "./logs/throw/" + word

        if algo == "ppo2":
            # For recurrent policies, nminibatches should be a multiple of the
            # nb of env used in parallel (so for LSTM, 1)
            model = PPO2(
                MlpLstmPolicy,
                vec_env,
                nminibatches=1,
                verbose=0,
                tensorboard_log=log_name)
        elif algo == "ddpg":
            action_noise = OrnsteinUhlenbeckActionNoise(
                mean=np.zeros(env.action_space.shape[-1]),
                sigma=float(0.5) * np.ones(env.action_space.shape[-1]))
            model = DDPG(
                stable_baselines.ddpg.LnMlpPolicy,
                env,
                verbose=0,
                param_noise=None,
                action_noise=action_noise,
                tensorboard_log=log_name)

        try:
            model.learn(total_timesteps=1000000)
        except KeyboardInterrupt:
            print("#---------------------------------#")
            print("Training \'" + word + "\' interrupted")
            print("#---------------------------------#")
            sys.exit(1)

        model.save("models/" + algo + "_throw_" + word)

    # Use a trained model
    else:
        if args.model == "":
            print("Specify the version of the model using --model")
            return

        if algo == "ppo2":
            model = PPO2.load("models/" + algo + "_throw_" + args.model)
        elif algo == "ddpg":
            model = DDPG.load("models/" + algo + "_throw_" + args.model)

        for test in range(10):
            dones = False
            obs = env.reset()
            while not dones:
                action, _states = model.predict(obs)
                obs, rewards, dones, info = env.step(action)
            time.sleep(2)
            env._termination()
path = 'models/' + model_name + '.pkl'

powermodel.save('models/' + model_name)
with open('models/' + model_name + '_params.p', 'wb') as f:
    pickle.dump(env.params, f)

model_name = '800k_full'
path = 'models/' + model_name + '.pkl'
i = 2
while os.path.isfile(path):
    model_name += '_' + str(i)
    i += 1
    path = 'models/' + model_name + '.pkl'

powermodel.save('models/' + model_name)
with open('models/' + model_name + '_params.p', 'wb') as f:
    pickle.dump(env.params, f)

for i in range(100):
    action, _ = powermodel.predict(obs)
    obs, rewards, dones, info = powerenv.step(action)
    line = {}
    for i, act in enumerate(action[0]):
        line[i] = act
    data.append(line)

df = pd.DataFrame(data)
df['demand'] = env.get_episode_demand_forecast()[0][:100]
df['sol'] = env.get_episode_solar_forecast()[:100]
df.loc[:, ['demand', 'sol', 3]].plot()
plt.show()
env = Monitor(env, log_dir, allow_early_resets=True)
# env = SubprocVecEnv([make_mujoco_env(env_id, i) for i in range(num_cpu)])
# env = SubprocVecEnv([lambda: env])
env = DummyVecEnv([lambda: env])
# env = SubprocVecEnv([lambda: gym.make('UR5Gripper-v0') for i in range(num_cpu)])

# Add some param noise for exploration
param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.1, desired_action_stddev=0.1)
# Because we use parameter noise, we should use a MlpPolicy with layer normalization
model = DDPG(MlpPolicy, env, param_noise=param_noise, verbose=1, tensorboard_log=log_dir)
# model = PPO2(MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
# model = SAC(MlpPolicy, env, verbose=1, tensorboard_log=log_dir)

# Random Agent, before training
mean_reward_before_train = evaluate(model, num_steps=1000)

# Train the agent
model.learn(total_timesteps=int(1e7), callback=callback)
mean_reward_after_train = evaluate(model, num_steps=1000)

obs = env.reset()
for _ in range(1000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
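# The comment above ("we should use a MlpPolicy with layer normalization") refers to the
# layer-normalized DDPG policy: parameter-space noise is usually paired with LnMlpPolicy in
# stable-baselines v2, since layer normalization keeps the perturbed and unperturbed networks
# comparable. A minimal sketch of that variant, assuming the same env, param_noise and
# log_dir as above:
from stable_baselines import DDPG
from stable_baselines.ddpg.policies import LnMlpPolicy

model = DDPG(LnMlpPolicy, env, param_noise=param_noise, verbose=1, tensorboard_log=log_dir)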
sc = []
rew = []
done = []
rc2 = []
sc2 = []
rew2 = []
done2 = []

dones = False
i = 0
while dones == False:
    i = i + 1
    x.append(i)
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    # obs = np.array(obs)
    print(obs)
    rc.append(obs[0:2])
    A = np.array([[obs[2], obs[3]], [obs[4], obs[5]]])
    rew.append(rewards)
    done.append(dones)
    print(dones, info)
    # print(type(obs), obs.shape)
    sc.append(A)
    # env.render(mode='human')
    action2, _states2 = model.predict(obs)
    obs2, rewards2, dones2, info2 = env.step_agent()
    # obs = np.array(obs)