def run_adhdp_offpolicy(rounds=1000, seed=random.randint(0, 1000000), name='CoQL',
                        train_rounds=100, train_step_in_round=100):
    print('seed :', seed)
    random.seed(seed)
    np.random.seed(seed)
    from Control_Exp1001.demo.thickener.adhdp_make import new_adhdp
    adhdp = new_adhdp(capacity=4, off_policy=True)
    penalty = Quadratic(**penalty_para)
    env_adhdp = Thickener(
        penalty_calculator=penalty,
        **thickner_para,
    )
    env_adhdp.reset()
    res1 = OffLineExp(controller=adhdp, env=env_adhdp,
                      max_step=rounds, exp_name=name,
                      train_rounds=train_rounds,
                      train_step_in_round=train_step_in_round).run()
    return res1
def run():
    # Define the integral penalty term
    penalty = IntegralPenalty(weight_matrix=[200, 0.02], S=[0.1, 0.1])
    #penalty = IntegralPenalty(weight_matrix=[1, 1], S=[0.00001, 0.00001])

    # Parameters for initialising the env object
    env_para = {
        "dt": 1,
        "normalize": False,
        "noise_in": False,
        "penalty_calculator": penalty,
        "y_star": [1.55, 650],
        "y_start": [1.4, 680],
        #"y_star": np.array([17.32, 0.84], dtype=float)
    }
    env = Thickener(**env_para)
    env.reset()

    # Replay buffer of size 1, batch_size of 1
    replaybuff = ReplayBuffer(capacity=1)

    # Parameters taken from the reference paper
    controller = ILPL(env=env,
                      u_bounds=env.u_bounds,
                      replay_buffer=replaybuff,
                      Vm=np.diag([1, 0.01, 0.1, 0.1, 0.1, 0.1]),
                      Lm=np.diag([1, 0.01]),
                      Va=np.diag([1, 0.01, 1, 0.01, 0.1, 0.1]),
                      La=np.diag([1, 1]),
                      Vc=np.diag([1, 0.01, 1, 0.01, 0.1, 0.1]),
                      Lc=np.diag([0.1]),
                      predict_training_rounds=5000,
                      gamma=0.6,
                      batch_size=1,
                      predict_batch_size=32,
                      model_nn_error_limit=0.08,
                      critic_nn_error_limit=0.1,
                      actor_nn_loss=0.6,
                      u_iter=30,
                      u_begin=[80, 38],
                      indice_y=[2, 3],
                      indice_y_star=[0, 1],
                      u_first=[80, 38])

    # Define the experiment block
    exp = OneRoundExp(env=env, controller=controller, max_step=300, exp_name="ILPL")
    res = exp.run()
    eval_res = OneRoundEvaluation(res_list=[res])
    eval_res.plot_all()
def new_dhp_vi():
    capacity = 20
    predict_round = 6000
    gamma = 0.6
    replay_DhpVI = ReplayBuffer(capacity=capacity)
    env_DhpVI = Thickener(
        noise_p=0.03,
        noise_in=True,
    )
    exploration = No_Exploration()

    print('make new dhp_vi controller')
    dhp_vi = DhpVI(
        replay_buffer=replay_DhpVI,
        u_bounds=env_DhpVI.u_bounds,
        #exploration=None,
        exploration=exploration,
        env=env_DhpVI,
        predict_training_rounds=predict_round,
        gamma=gamma,
        batch_size=20,
        predict_batch_size=32,
        model_nn_error_limit=0.0008,
        critic_nn_error_limit=0.01,
        actor_nn_error_limit=0.001,  # 0.005
        actor_nn_lr=0.005,
        critic_nn_lr=0.001,
        model_nn_lr=0.01,
        indice_y=None,
        indice_y_star=None,
        indice_c=None,
        hidden_model=10,
        hidden_critic=12,
        hidden_actor=14,
        predict_epoch=30,
        Na=2000,
        Nc=100,
        test_period=3,
        max_u_iters=2000,
        policy_visual_period=400,
        img_path=EXP_NAME,
    )
    env_DhpVI.reset()
    dhp_vi.train_identification_model()
    return dhp_vi
def new_vi():
    capacity = 2        # replay-buffer size; must be >= batch_size
    predict_round = 3000
    u_optim = 'sgd'     # gradient-descent algorithm used when searching for u*
    gamma = 0.6
    replay_vi = ReplayBuffer(capacity=capacity)
    # This thickener instance is only used to generate data for training the prediction model
    env_VI = Thickener(
        noise_p=0.03,
        noise_in=True,
    )
    exploration = No_Exploration()

    print('make new vi controller')
    vi = VI(
        replay_buffer=replay_vi,
        u_bounds=env_VI.u_bounds,
        #exploration=None,
        exploration=exploration,
        env=env_VI,
        predict_training_rounds=predict_round,
        gamma=gamma,
        batch_size=capacity,
        predict_batch_size=32,
        model_nn_error_limit=0.0008,
        critic_nn_error_limit=0.001,
        actor_nn_error_limit=0.001,
        actor_nn_lr=0.005,
        critic_nn_lr=0.02,
        model_nn_lr=0.01,
        indice_y=None,
        indice_y_star=None,
        indice_c=None,
        hidden_model=10,
        hidden_critic=14,
        hidden_actor=14,
        predict_epoch=30,
        Nc=500,
        u_optim=u_optim,
        img_path=EXP_NAME,
    )
    env_VI.reset()
    vi.train_identification_model()
    #vi.test_predict_model(test_rounds=100)
    return vi
def new_dhp():
    capacity = 1
    predict_round = 6000
    gamma = 0.6
    replay_DHP = ReplayBuffer(capacity=capacity)
    env_DHP = Thickener(
        noise_p=0.03,
        noise_in=True,
    )
    exploration = No_Exploration()

    print('make new dhp controller')
    dhp = DHP(
        replay_buffer=replay_DHP,
        u_bounds=env_DHP.u_bounds,
        #exploration=None,
        exploration=exploration,
        env=env_DHP,
        predict_training_rounds=predict_round,
        gamma=gamma,
        batch_size=1,
        predict_batch_size=32,
        model_nn_error_limit=0.0008,
        critic_nn_error_limit=0.01,
        actor_nn_error_limit=0.001,  # 0.005
        actor_nn_lr=0.005,
        critic_nn_lr=0.001,
        model_nn_lr=0.01,
        indice_y=None,
        indice_y_star=None,
        indice_c=None,
        hidden_model=10,
        hidden_critic=12,
        hidden_actor=14,
        predict_epoch=30,
        Na=220,
        Nc=100,
        test_period=3,
        img_path=EXP_NAME,
    )
    env_DHP.reset()
    dhp.train_identification_model()
    return dhp
def run_hdp(rounds=1000, seed=random.randint(0, 1000000), name='HDP', predict_round=800):
    hdp_para = {'gamma': 0.9}
    print('seed :', seed)
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)
    from Control_Exp1001.demo.thickener.hdp_maker import new_hdp
    hdp = new_hdp(predict_round=predict_round, **hdp_para)
    penalty = Quadratic(**penalty_para)
    env_hdp = Thickener(
        penalty_calculator=penalty,
        random_seed=seed,
        **thickner_para,
    )
    res1 = OneRoundExp(controller=hdp, env=env_hdp, max_step=rounds, exp_name=name).run()
    return res1
def run_vi_ub(rounds=1000, seed=random.randint(0, 1000000), name='VI_uk',
              capacity=2, predict_round=3000, u_optim='sgd'):
    print('seed :', seed)
    torch.manual_seed(seed)
    from Control_Exp1001.demo.thickener.vi_ub_maker import new_vi_ub
    vi = new_vi_ub(capacity=capacity, predict_round=predict_round, u_optim=u_optim)
    penalty = Quadratic(**penalty_para)
    env_vi = Thickener(
        penalty_calculator=penalty,
        **thickner_para,
    )
    mse_vi_pre.append(vi.con_predict_mse)
    res1 = OneRoundExp(controller=vi, env=env_vi, max_step=rounds, exp_name=name).run()
    print(name, ':', vi.u_iter_times * 1.0 / rounds)
    return res1
def run_dhp_vi(rounds=1000, seed=random.randint(0, 1000000), name='DHPVI',
               capacity=2, predict_round=3000, u_optim='adam'):
    print('seed :', seed)
    torch.manual_seed(seed)
    dhp_vi_para = {
        #'gamma': 0.2
    }
    dhp_vi = new_dhp_vi()
    specific_penalty_para = copy.deepcopy(penalty_para)
    specific_penalty_para['S'] = [0.0001, 0.0008]
    penalty = Quadratic(**specific_penalty_para)
    env_dhp_vi = Thickener(
        penalty_calculator=penalty,
        **thickner_para,
    )
    res1 = OneRoundExp(controller=dhp_vi, env=env_dhp_vi, max_step=rounds, exp_name=name).run()
    return res1
def new_adhdp(capacity=2, off_policy=False):
    replay_hdp = ReplayBuffer(capacity=capacity)
    env_ADHDP = Thickener()
    #exploration = No_Exploration()
    exploration = EGreedy(env_ADHDP.external_u_bounds,
                          epsilon_start=0.5,
                          epsilon_final=0,
                          epsilon_decay=1000)

    adhdp = ADHDP(
        replay_buffer=replay_hdp,
        u_bounds=env_ADHDP.u_bounds,
        #exploration=None,
        exploration=exploration,
        env=env_ADHDP,
        gamma=0.7,
        batch_size=capacity,
        predict_batch_size=32,
        critic_nn_error_limit=0.02,
        actor_nn_error_limit=0.001,
        actor_nn_lr=0.01,
        critic_nn_lr=0.01,
        indice_y=None,
        indice_y_star=None,
        indice_c=None,
        hidden_critic=6,
        hidden_actor=6,
        max_iter_c=50,
        off_policy=off_policy,
    )
    return adhdp
def run_vi(rounds=1000, seed=random.randint(0, 1000000), name='VI',
           capacity=2, predict_round=3000, u_optim='adam'):
    print('seed :', seed)
    torch.manual_seed(seed)
    vi_para = {'gamma': 0.2}
    vi = new_vi()
    penalty = Quadratic(**penalty_para)
    env_vi = Thickener(
        penalty_calculator=penalty,
        **thickner_para,
    )
    res1 = OneRoundExp(controller=vi, env=env_vi, max_step=rounds, exp_name=name).run()
    print(name, ':', vi.u_iter_times * 1.0 / rounds)
    return res1
def new_hdp():
    predict_round = 3000
    gamma = 0.6
    replay_hdp = ReplayBuffer(capacity=2)
    env_HDP = Thickener(
        noise_p=0.03,
        noise_in=True,
    )
    exploration = No_Exploration()

    print('make new hdp controller')
    hdp = HDP(
        replay_buffer=replay_hdp,
        u_bounds=env_HDP.u_bounds,
        #exploration=None,
        exploration=exploration,
        env=env_HDP,
        predict_training_rounds=predict_round,
        gamma=gamma,
        batch_size=2,
        predict_batch_size=32,
        model_nn_error_limit=0.0008,
        critic_nn_error_limit=0.001,
        actor_nn_error_limit=0.001,  # 0.005
        actor_nn_lr=0.003,
        critic_nn_lr=0.02,
        model_nn_lr=0.01,
        indice_y=None,
        indice_y_star=None,
        indice_c=None,
        hidden_model=10,
        hidden_critic=14,
        hidden_actor=14,
        predict_epoch=30,
        Na=220,
        Nc=500,
        img_path=EXP_NAME,
    )
    env_HDP.reset()
    hdp.train_identification_model()
    return hdp
def new_vi_ub():
    capacity = 2
    predict_round = 3000
    u_optim = 'sgd'
    replay_vi = ReplayBuffer(capacity=capacity)
    env_VI = Thickener(
        noise_p=0.03,
        noise_in=True,
    )
    exploration = No_Exploration()

    print('make new viuk controller')
    vi = VIub(
        replay_buffer=replay_vi,
        u_bounds=env_VI.u_bounds,
        #exploration=None,
        exploration=exploration,
        env=env_VI,
        predict_training_rounds=predict_round,
        gamma=0.6,
        batch_size=capacity,
        predict_batch_size=32,
        model_nn_error_limit=0.0008,
        critic_nn_error_limit=0.001,
        actor_nn_error_limit=0.001,
        actor_nn_lr=0.005,
        critic_nn_lr=0.01,
        model_nn_lr=0.01,
        indice_y=None,
        indice_y_star=None,
        indice_c=None,
        hidden_model=10,
        hidden_critic=14,
        hidden_actor=14,
        predict_epoch=30,
        u_optim=u_optim,
        find_lr=0.4,
        find_time_max=20,
    )
    env_VI.reset()
    vi.train_identification_model()
    return vi
def run_adhdp(rounds=1000, seed=random.randint(0, 1000000), name='ADHDP'):
    print('seed :', seed)
    random.seed(seed)
    np.random.seed(seed)
    from Control_Exp1001.demo.thickener.adhdp_make import new_adhdp
    adhdp = new_adhdp(capacity=9)
    penalty = Quadratic(**penalty_para)
    env_adhdp = Thickener(
        penalty_calculator=penalty,
        **thickner_para,
    )
    env_adhdp.reset()
    res1 = OneRoundExp(controller=adhdp, env=env_adhdp, max_step=rounds, exp_name=name).run()
    return res1
def run_adhdp(rounds=1000, seed=random.randint(0, 1000000)):
    print('seed :', seed)
    random.seed(seed)
    np.random.seed(seed)
    penalty = Quadratic(**penalty_para)
    env_adhdp = Thickener(
        penalty_calculator=penalty,
        **thickner_para,
        random_seed=seed,
    )
    env_adhdp.reset()
    # 'adhdp' is not a local: it is expected to be a controller instance defined at module level
    res1 = OneRoundExp(controller=adhdp, env=env_adhdp, max_step=rounds, exp_name='ADHDP').run()
    eval_res = OneRoundEvaluation(res_list=[res1])
    eval_res.plot_all()
def test_model_hidden():
    env = Thickener(noise_in=True)
    env.reset()
    loss_list = []
    hid_size_list = []
    for hidden_size in range(6, 30, 2):
        controller = HDP(
            replay_buffer=None,
            u_bounds=env.u_bounds,
            env=env,
            predict_training_rounds=10000,
            gamma=0.6,
            batch_size=1,
            predict_batch_size=32,
            model_nn_error_limit=0.00008,
            critic_nn_error_limit=0.9,
            actor_nn_error_limit=0.1,
            actor_nn_lr=0.003,
            critic_nn_lr=0.2,
            model_nn_lr=0.01,
            indice_y=None,
            indice_y_star=None,
            indice_c=None,
            hidden_model=hidden_size,
            hidden_critic=10,
            hidden_actor=10,
            predict_epoch=40,
        )
        hid_size_list.append(hidden_size)
        controller.train_identification_model()
        loss = controller.cal_predict_mse(test_rounds=3000)
        loss_list.append(loss)
    plt.plot(hid_size_list, loss_list)
    plt.legend(['loss in test'])
    plt.show()
def run_dhp(rounds=800, seed=random.randint(0, 1000000), name='DHP',
            capacity=2, predict_round=3000, u_optim='adam'):
    print('seed :', seed)
    torch.manual_seed(seed)
    dhp = new_dhp()
    penalty = Quadratic(**penalty_para)
    env_dhp = Thickener(
        penalty_calculator=penalty,
        **thickner_para,
    )
    res1 = OneRoundExp(controller=dhp, env=env_dhp, max_step=rounds, exp_name=name).run()
    return res1
def run_hdp(rounds=1000, seed=random.randint(0, 1000000), name='HDP', predict_round=800):
    print('seed :', seed)
    hdp_para = {
        'gamma': 0.2
    }
    hdp = new_hdp()
    penalty = Quadratic(**penalty_para)
    env_hdp = Thickener(
        penalty_calculator=penalty,
        **thickner_para,
    )
    res1 = OneRoundExp(controller=hdp, env=env_hdp, max_step=rounds, exp_name=name).run()
    return res1
def new_vi_sample(capacity=2, predict_round=3000):
    replay_vi_sample = ReplayBuffer(capacity=capacity)
    env_VI_sample = Thickener(
        noise_p=0.03,
        noise_in=True,
    )
    exploration = No_Exploration()

    print('make new vi_sample controller')
    vi_sample = ViSample(
        replay_buffer=replay_vi_sample,
        u_bounds=env_VI_sample.u_bounds,
        #exploration=None,
        exploration=exploration,
        env=env_VI_sample,
        predict_training_rounds=predict_round,
        gamma=0.4,
        batch_size=capacity,
        predict_batch_size=32,
        model_nn_error_limit=0.0008,
        critic_nn_error_limit=0.001,
        actor_nn_error_limit=0.001,
        actor_nn_lr=0.005,
        critic_nn_lr=0.01,
        model_nn_lr=0.01,
        indice_y=None,
        indice_y_star=None,
        indice_c=None,
        hidden_model=10,
        hidden_critic=14,
        hidden_actor=14,
        predict_epoch=30,
    )
    env_VI_sample.reset()
    vi_sample.train_identification_model()
    vi_sample.test_predict_model(test_rounds=100)
    return vi_sample
def run_hdp_sample(rounds=1000, seed=random.randint(0, 1000000)):
    print('seed :', seed)
    print('hdp_sample')
    from Control_Exp1001.demo.thickener.hdp_sample_maker import hdp_sample
    penalty = Quadratic(**penalty_para)
    env_hdp = Thickener(
        penalty_calculator=penalty,
        **thickner_para,
    )
    res1 = OneRoundExp(controller=hdp_sample, env=env_hdp,
                       max_step=rounds, exp_name='HDP_sample').run()
    return res1
def new_adhdp(random_act=False):
    period = 20
    capacity = period
    train_period = period
    batch_size = period
    off_policy = False
    replay_hdp = ReplayBuffer(capacity=capacity)
    env_ADHDP = Thickener()
    #exploration = No_Exploration()
    #exploration = EGreedy(env_ADHDP.external_u_bounds, epsilon_start=0.6, epsilon_final=0, epsilon_decay=10)
    exploration = GaussianExploration(
        action_bounds=env_ADHDP.external_u_bounds,
        min_sigma=0.00,
        max_sigma=0.01,
        decay_period=600)
    if random_act:
        exploration = EGreedy(action_bounds=env_ADHDP.external_u_bounds,
                              epsilon_start=1,
                              epsilon_final=1,
                              epsilon_decay=100)
        train_period = 20

    adhdp = ADHDP(
        replay_buffer=replay_hdp,
        u_bounds=env_ADHDP.u_bounds,
        #exploration=None,
        exploration=exploration,
        env=env_ADHDP,
        gamma=0.8,
        batch_size=batch_size,
        predict_batch_size=32,
        critic_nn_error_limit=0.05,
        actor_nn_error_limit=0.001,
        actor_nn_lr=0.003,
        critic_nn_lr=0.05,
        indice_y=None,
        indice_y_star=None,
        indice_c=None,
        hidden_critic=16,
        hidden_actor=20,
        off_policy=off_policy,
        Nc=1000,
        Na=50,
        train_period=train_period,
        test_period=1,
    )
    return adhdp
def run_vi_sample(rounds=1000, seed=random.randint(0, 1000000), name='VI_sample',
                  capacity=2, predict_round=3000):
    print('seed :', seed)
    torch.manual_seed(seed)
    from Control_Exp1001.demo.thickener.vi_sample_maker import new_vi_sample
    vi_sample = new_vi_sample(capacity=capacity, predict_round=predict_round)
    penalty = Quadratic(**penalty_para)
    env_vi_sample = Thickener(
        penalty_calculator=penalty,
        **thickner_para,
    )
    res1 = OneRoundExp(controller=vi_sample, env=env_vi_sample,
                       max_step=rounds, exp_name=name).run()
    mse_vi_sample_pre.append(vi_sample.con_predict_mse)
    return res1
def run_adhdp(rounds=1000, seed=random.randint(0, 1000000), name='ADHDP',
              predict_round=800, random_act=False):
    print('seed :', seed)
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)
    adhdp = new_adhdp(random_act=random_act)
    penalty = Quadratic(**penalty_para)
    env_hdp = Thickener(
        penalty_calculator=penalty,
        random_seed=seed,
        **thickner_para,
    )
    res1 = OneRoundExp(controller=adhdp, env=env_hdp, max_step=rounds, exp_name=name).run()
    return res1
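# Minimal comparison driver (a sketch, not part of the original scripts). It assumes
# the run_hdp / run_vi / run_adhdp helpers above are available in one module and that
# OneRoundEvaluation.plot_all() can overlay several result objects passed in res_list,
# extrapolating from the single-result usage shown elsewhere in this demo.
def compare_controllers(rounds=1000, seed=0):
    results = [
        run_hdp(rounds=rounds, seed=seed, name='HDP'),
        run_vi(rounds=rounds, seed=seed, name='VI'),
        run_adhdp(rounds=rounds, seed=seed, name='ADHDP'),
    ]
    OneRoundEvaluation(res_list=results).plot_all()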
def __init__(self,
             gpu_id=1,
             replay_buffer=None,
             u_bounds=None,
             exploration=None,
             env=None,
             predict_training_rounds=10000,
             Vm=None, Lm=None,
             Va=None, La=None,
             Vc=None, Lc=None,
             gamma=0.6,
             batch_size=1,
             predict_batch_size=32,
             model_nn_error_limit=0.08,
             critic_nn_error_limit=1,
             actor_nn_loss=0.1,
             u_iter=30,
             u_begin=None,
             indice_y=None,
             indice_y_star=None,
             indice_c=None,
             u_first=None):
    """
    :param gpu_id:
    :param replay_buffer:
    :param u_bounds:
    :param exploration:
    :param env:
    :param predict_training_rounds: number of real data samples used to train the prediction model
    :param Vm:
    :param Lm:
    :param Va:
    :param La:
    :param Vc:
    :param Lc:
    :param gamma:
    :param batch_size:
    :param predict_batch_size: batch size used when training the prediction model
    :param model_nn_error_limit:
    :param critic_nn_error_limit: error limit for the critic network
    :param actor_nn_loss:
    :param u_iter: number of iterations when solving for u*
    :param u_begin: initial u(k) for the first iteration when solving for u*
    :param indice_y: position of y in the state
    :param indice_y_star: position of y* in the state
    :param u_first: control command applied on the first control step
    """
    super(ILPL, self).__init__(gpu_id=gpu_id, replay_buffer=replay_buffer,
                               u_bounds=u_bounds, exploration=exploration)
    if env is None:
        env = Thickener()
    self.env = env
    self.predict_training_rounds = predict_training_rounds

    self.device = None
    self.cuda_device(gpu_id)
    self.batch_size = batch_size
    self.predict_batch_size = predict_batch_size
    self.indice_c = [6, 7]

    self.predict_training_losses = []
    self.model_nn = None
    self.model_nn_error_limit = model_nn_error_limit
    self.critic_nn_error_limit = critic_nn_error_limit
    self.actor_nn_error_limit = actor_nn_loss
    self.u_iter = u_iter

    # Train the model (system identification) neural network
    self.train_identification_model(Vm=Vm, Lm=Lm)
    self.test_predict_model(test_rounds=400)

    # Actor network
    self.actor_nn = None
    self.actor_nn_init(Va=Va, La=La)

    # Critic network
    self.critic_nn = None
    self.critic_nn_init(Vc=Vc, Lc=Lc)

    self.gamma = gamma
    self.u_begin = u_begin
    if indice_y is None:
        indice_y = [2, 3]
    if indice_y_star is None:
        indice_y_star = [0, 1]
    self.indice_y = indice_y
    self.indice_y_star = indice_y_star
    if u_first is None:
        u_first = np.array([1.8, 19])
    self.u_first = u_first
    self.first_act = True

    # Used for plotting
    self.u0_plt = PltUtil()
    self.u1_plt = PltUtil()
    self.y0_plt = PltUtil()
    self.y1_plt = PltUtil()
    self.wa_plt = PltUtil()
    self.wm_plt = PltUtil()
    self.wc_plt = PltUtil()
def __init__(self,
             gpu_id=1,
             replay_buffer=None,
             u_bounds=None,
             exploration=None,
             env=None,
             predict_training_rounds=10000,
             gamma=0.6,
             batch_size=1,
             predict_batch_size=32,
             model_nn_error_limit=0.08,
             critic_nn_error_limit=1,
             actor_nn_error_limit=0.1,
             actor_nn_lr=0.01,
             critic_nn_lr=0.01,
             model_nn_lr=0.01,
             indice_y=None,
             indice_u=None,
             indice_y_star=None,
             indice_c=None,
             hidden_model=10,
             hidden_critic=10,
             hidden_actor=10,
             predict_epoch=35):
    """
    :param gpu_id:
    :param replay_buffer:
    :param u_bounds:
    :param exploration:
    :param env:
    :param predict_training_rounds: number of real data samples used to train the prediction model
    :param gamma:
    :param batch_size:
    :param predict_batch_size: batch size used when training the prediction model
    :param model_nn_error_limit:
    :param critic_nn_error_limit: error limit for the critic network
    :param actor_nn_error_limit:
    :param actor_nn_lr:
    :param critic_nn_lr:
    :param model_nn_lr:
    :param indice_y: position of y in the state
    :param indice_u: position of u in the state
    :param indice_y_star: position of y* in the state
    :param indice_c:
    :param hidden_model:
    :param hidden_critic:
    :param hidden_actor:
    :param predict_epoch:
    """
    super(HDP_sample, self).__init__(gpu_id=gpu_id, replay_buffer=replay_buffer,
                                     u_bounds=u_bounds, exploration=exploration)
    if env is None:
        env = Thickener()
    self.env = env
    self.predict_training_rounds = predict_training_rounds

    self.device = None
    self.cuda_device(gpu_id)
    self.batch_size = batch_size
    self.predict_batch_size = predict_batch_size

    self.predict_training_losses = []
    self.model_nn = None
    self.model_nn_error_limit = model_nn_error_limit
    self.critic_nn_error_limit = critic_nn_error_limit
    self.actor_nn_error_limit = actor_nn_error_limit

    dim_c = env.size_yudc[3]
    dim_y = env.size_yudc[0]
    dim_u = env.size_yudc[1]

    # Prediction (system identification) model network
    self.model_nn = nn.Sequential(
        nn.Linear(dim_y + dim_u + dim_c, hidden_model),
        nn.Tanh(),
        nn.Linear(hidden_model, dim_y),
    )
    self.model_nn_optim = torch.optim.Adam(self.model_nn.parameters(), lr=model_nn_lr)
    #self.train_identification_model()
    #mse = self.test_predict_model(test_rounds=400)

    # Actor network
    self.actor_nn = nn.Sequential(
        nn.Linear(2 * dim_y + dim_c, hidden_actor, bias=False),
        nn.Tanh(),
        nn.Linear(hidden_actor, dim_u),
        nn.Tanh(),
        #nn.Linear(dim_u, dim_u),
    )
    self.actor_nn_optim = torch.optim.Adam(self.actor_nn.parameters(), lr=actor_nn_lr)

    # Critic network (HDP-style: input is y, y* and c)
    self.critic_nn = nn.Sequential(
        nn.Linear(dim_y + dim_y + dim_c, hidden_critic, bias=False),
        nn.Tanh(),
        nn.Linear(hidden_critic, 1),
    )
    self.critic_nn_optim = torch.optim.Adam(self.critic_nn.parameters(), lr=critic_nn_lr)
    self.critic_criterion = torch.nn.MSELoss()

    self.gamma = gamma

    if indice_y is None:
        indice_y = [2, 3]
    if indice_y_star is None:
        indice_y_star = [0, 1]
    if indice_u is None:
        indice_u = [4, 5]
    self.indice_y = indice_y
    self.indice_y_star = indice_y_star
    self.indice_c = [6, 7]
    self.indice_u = indice_u
    self.predict_epoch = predict_epoch
import math
import Control_Exp1001 as CE
import os
import json
from Control_Exp1001.demo.thickener.adhdp import ADHDP
from Control_Exp1001.simulation.thickener import Thickener
from Control_Exp1001.common.penaltys.demo_penalty import DemoPenalty
import matplotlib.pyplot as plt
from Control_Exp1001.demo.thickener.one_round_exp import OneRoundExp
from Control_Exp1001.demo.thickener.one_round_evaluation import OneRoundEvaluation
from Control_Exp1001.common.action_noise.e_greedy import EGreedy
from Control_Exp1001.common.replay.replay_buffer import ReplayBuffer

replay_hdp = ReplayBuffer(capacity=20)
env_ADHDP = Thickener()
exploration = EGreedy(epsilon_start=1,
                      epsilon_final=0.0001,
                      epsilon_decay=300,
                      action_bounds=env_ADHDP.u_bounds)

adhdp = ADHDP(
    replay_buffer=replay_hdp,
    u_bounds=env_ADHDP.u_bounds,
    #exploration=None,
    exploration=exploration,
    env=env_ADHDP,
    gamma=0.1,
    batch_size=10,
    predict_batch_size=32,
    critic_nn_error_limit=0.02,
def __init__(self,
             gpu_id=1,
             replay_buffer=None,
             u_bounds=None,
             exploration=None,
             env=None,
             gamma=0.6,
             batch_size=1,
             predict_batch_size=32,
             critic_nn_error_limit=1,
             actor_nn_error_limit=0.1,
             actor_nn_lr=0.01,
             critic_nn_lr=0.01,
             indice_y=None,
             indice_u=None,
             indice_y_star=None,
             indice_c=None,
             hidden_critic=10,
             hidden_actor=10,
             off_policy=False,
             Nc=500,
             Na=500,
             train_period=100,
             test_period=1):
    """
    :param gpu_id:
    :param replay_buffer:
    :param u_bounds:
    :param exploration:
    :param env:
    :param gamma:
    :param batch_size:
    :param predict_batch_size:
    :param critic_nn_error_limit: error limit for the critic network
    :param actor_nn_error_limit:
    :param actor_nn_lr:
    :param critic_nn_lr:
    :param indice_y: position of y in the state
    :param indice_u: position of u in the state
    :param indice_y_star: position of y* in the state
    :param indice_c:
    :param hidden_critic:
    :param hidden_actor:
    :param off_policy:
    :param Nc:
    :param Na:
    :param train_period:
    :param test_period:
    """
    super(ADHDP, self).__init__(gpu_id=gpu_id, replay_buffer=replay_buffer,
                                u_bounds=u_bounds, exploration=exploration)
    if env is None:
        env = Thickener()
    self.env = env
    self.device = None
    self.cuda_device(gpu_id)
    self.batch_size = batch_size
    self.critic_nn_error_limit = critic_nn_error_limit
    self.actor_nn_error_limit = actor_nn_error_limit

    dim_c = env.size_yudc[3]
    dim_y = env.size_yudc[0]
    dim_u = env.size_yudc[1]

    # Actor network
    self.actor_nn = nn.Sequential(
        nn.Linear(2 * dim_y + dim_c, hidden_actor, bias=False),
        #nn.Tanh(),
        #nn.Sigmoid(),
        nn.ReLU(),
        nn.Linear(hidden_actor, dim_u, bias=False),
        nn.Tanh(),
        #nn.Linear(dim_u, dim_u),
    )
    self.actor_nn_optim = torch.optim.SGD(self.actor_nn.parameters(), lr=actor_nn_lr)

    # Critic network (action-dependent: input also includes u)
    self.critic_nn = nn.Sequential(
        nn.Linear(dim_y + dim_y + dim_c + dim_u, hidden_critic, bias=False),
        nn.Tanh(),
        #nn.ReLU(),
        nn.Linear(hidden_critic, 1, bias=False),
    )
    self.critic_nn_optim = torch.optim.SGD(self.critic_nn.parameters(), lr=critic_nn_lr)
    self.critic_criterion = torch.nn.MSELoss()

    self.gamma = gamma

    if indice_y is None:
        indice_y = [2, 3]
    if indice_y_star is None:
        indice_y_star = [0, 1]
    if indice_u is None:
        indice_u = [4, 5]
    self.indice_y = indice_y
    self.indice_y_star = indice_y_star
    self.indice_c = [6, 7]
    self.indice_u = indice_u
    self.off_policy = off_policy
    self.Nc = Nc
    self.Na = Na
    self.train_period = train_period
    self.test_period = test_period
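# Sketch only (not taken from this repository): the standard action-dependent HDP
# critic update that the fields above (gamma, critic_criterion, critic_nn_optim) are
# set up for. The class's real training loop is not shown in this file; the tensor
# names and the concatenation order of (y, y*, c, u) below are illustrative assumptions.
def _critic_update_sketch(self, y, y_star, c, u, penalty_u, y_next, u_next):
    # Q(x_k, u_k) for the current transition
    q_now = self.critic_nn(torch.cat([y, y_star, c, u], dim=1))
    # Bootstrapped target: r_k + gamma * Q(x_{k+1}, u_{k+1})
    with torch.no_grad():
        q_next = self.critic_nn(torch.cat([y_next, y_star, c, u_next], dim=1))
        target = penalty_u + self.gamma * q_next
    loss = self.critic_criterion(q_now, target)
    self.critic_nn_optim.zero_grad()
    loss.backward()
    self.critic_nn_optim.step()
    return loss.item()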
import math
import Control_Exp1001 as CE
import os
import json
from Control_Exp1001.demo.thickener.hdp_sample import HDP_sample
from Control_Exp1001.simulation.thickener import Thickener
from Control_Exp1001.common.penaltys.demo_penalty import DemoPenalty
import matplotlib.pyplot as plt
from Control_Exp1001.demo.thickener.one_round_exp import OneRoundExp
from Control_Exp1001.demo.thickener.one_round_evaluation import OneRoundEvaluation
from Control_Exp1001.common.action_noise.e_greedy import EGreedy
from Control_Exp1001.common.replay.replay_buffer import ReplayBuffer

replay_hdp_sample = ReplayBuffer(capacity=30)
env_HDP_sample = Thickener(noise_p=0.01, noise_in=True)
exploration = EGreedy(epsilon_start=0.0,
                      epsilon_final=0.0000,
                      epsilon_decay=100,
                      action_bounds=env_HDP_sample.u_bounds)

hdp_sample = HDP_sample(
    replay_buffer=replay_hdp_sample,
    u_bounds=env_HDP_sample.u_bounds,
    #exploration=None,
    exploration=exploration,
    env=env_HDP_sample,
    predict_training_rounds=3000,
    gamma=0.1,
    batch_size=10,
    predict_batch_size=32,