import numpy as np
import matplotlib.pyplot as plt
from stable_baselines3 import DDPG
from stable_baselines3.common.noise import NormalActionNoise
from stable_baselines3.common.evaluation import evaluate_policy

# Size the exploration noise to the action space.
# get_action_meanings() only exists on discrete (Atari-style) envs; for the
# continuous Box action space DDPG requires, use the action dimension instead.
# n_actions = len(env.get_action_meanings())
n_actions = env.action_space.shape[-1]
print(n_actions)
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

# Train DDPG with Gaussian action noise for exploration.
model = DDPG('MlpPolicy', env, action_noise=action_noise, verbose=1)
# model = DDPG('MlpPolicy', env, verbose=1)
model.learn(total_timesteps=10000, log_interval=10)
# model.save("ddpg_pendulum")
# env = model.get_env()
# del model  # remove to demonstrate saving and loading
# model = DDPG.load("ddpg_pendulum")

# Roll out one episode with the trained policy and count its length.
score = 0
obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
    score = score + 1
    print(dones)
    if dones:
        # obs = env.reset()
        print('finished', score)
        break
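# A minimal sketch (not part of the original script) of persisting and reloading
# the trained agent, mirroring the commented-out save/load lines above. It uses
# Stable-Baselines3's DDPG.save / DDPG.load; the file name "ddpg_soc_agent" is
# an assumed placeholder.
model.save("ddpg_soc_agent")
del model  # drop the in-memory agent to demonstrate loading from disk
model = DDPG.load("ddpg_soc_agent", env=env)
obs = env.reset()
action, _states = model.predict(obs, deterministic=True)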
# model.load('DDPG_test_2_SOC_point5_two_states')

# Evaluate the trained policy over several episodes.
mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
print("Mean Reward = ", mean_reward)

# Run one deterministic episode and log the quantities of interest.
epsi_sp_list = []
action_list = []
soc_list = []
Concentration_list = []
Concentration_list1 = []
obs = env.reset()
for _ in range(3600):
    action, _states = model.predict(obs, deterministic=True)
    obs, rewards, done, info = env.step(action)
    epsi_sp_list.append(env.epsi_sp.item(0))
    # Concentration_list.append(env.state_output['yp'].item())
    # Concentration_list.append(env.state_output['yn'].item())
    soc_list.append(env.state_of_charge.item())
    action_list.append(action)
    if done:
        break
        # obs = env.reset()

# Plot the state-of-charge trajectory from the deterministic rollout.
plt.figure()
plt.plot(soc_list)
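# A minimal sketch (assumed, not from the original script) of finishing the
# figure: axis labels for the SOC plot, a second figure for the applied actions,
# and plt.show(). The label text is an assumption about what the logged lists hold.
plt.xlabel('time step')
plt.ylabel('state of charge')
plt.title('SOC trajectory under the learned DDPG policy')

plt.figure()
plt.plot(np.asarray(action_list).squeeze())
plt.xlabel('time step')
plt.ylabel('action')
plt.show()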