def t_main_1(data_manager, plot_time_reward):
    envManager = EnvironmentManager(data_manager)
    envManager.auto_create_multi_singleprocess_envs()
    plot_data = {
        "time": [],
        "rewards_max": [],
        "rewards_mean": [],
        "reward_min": []
    }
    for i in range(1):
        Env, params = envManager.next_environment()
        agent = LSTM(params)
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            baseline_reward = 0
            a = [0, 1]
            # init_input: [[0 1]]
            init_input = np.array(a).reshape(1, 2)
            start_time = time.time()
            summary_writter = tf.summary.FileWriter("Mylog/test1", sess.graph)
            # Train for MainConfig.num_train (1000) steps
            for j in range(MainConfig.num_train):
                x, agr_params, action, _ = agent.getArgParams(sess, init_input)
                init_input = x[-1]
                # No prediction network: evaluate the action in the real environment
                rewards = Env.run(action)
                # 5: write the reward statistics to the log stream
                summarize(summary_writter, np.max(rewards), j, 'max_reward')
                summarize(summary_writter, np.mean(rewards), j, 'mean_reward')
                step_time = time.time()
                one_time = step_time - start_time
                plot_data["time"].append(one_time)
                plot_data["rewards_max"].append(np.max(rewards))
                plot_data["rewards_mean"].append(np.mean(rewards))
                plot_data["reward_min"].append(np.min(rewards))
                if j % 100 == 0:
                    plot = pd.DataFrame(data=plot_data)
                    plot.to_csv(plot_time_reward, index=False)
                if j == 0:
                    baseline_reward = np.mean(rewards)
                print("else: normal training, rewards:", rewards)
                loss, ratio = agent.learn(False, sess, x, agr_params, rewards,
                                          baseline_reward, j)
                print("i=", i, " j=", j, "average_reward=", np.mean(rewards),
                      " baseline_reward=", baseline_reward,
                      " loss=", loss, "\n")
                summarize(summary_writter, loss, j, 'loss')
                summarize(summary_writter, ratio, j, 'ratio')
                reward_c = np.mean(rewards)
                baseline_reward = baseline_reward * AgentConfig.dr + (
                    1 - AgentConfig.dr) * reward_c
            plot = pd.DataFrame(data=plot_data)
            plot.to_csv(plot_time_reward, index=False)
            print("--------- Training finished! ----------")
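# The t_main_* functions in this file call a module-level `summarize` helper
# that is not shown in this section. The sketch below is an assumption based
# only on its call sites (writer, value, step, tag) and the standard TF1
# tf.Summary / FileWriter API; it is given a distinct name so it does not
# shadow the project's own helper, and should be read as an illustration, not
# the actual implementation.
def _summarize_sketch(writer, value, step, tag):
    """Write one scalar `value` under `tag` at global step `step` (TF1 API)."""
    summary = tf.Summary(
        value=[tf.Summary.Value(tag=tag, simple_value=float(value))])
    writer.add_summary(summary, step)
    writer.flush()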
def t_main_meta_diff():
    # Environment initialization
    nnet = NNet(22)
    data_manager_img = DataManager(6)
    data_manager_crowd = DataManager(12)
    data_manager_pr = DataManager(14)
    data_manager_opt = DataManager(9)
    envManagerImg = EnvironmentManager(data_manager_img)
    envManagerCrowd = EnvironmentManager(data_manager_crowd)
    envManagerPr = EnvironmentManager(data_manager_pr)
    envManagerOpt = EnvironmentManager(data_manager_opt)
    envManagerImg.auto_create_multi_singleprocess_envs()
    envManagerCrowd.auto_create_multi_singleprocess_envs()
    envManagerPr.auto_create_multi_singleprocess_envs()
    envManagerOpt.auto_create_multi_singleprocess_envs()
    env_img, _ = envManagerImg.next_environment()
    env_crowd, _ = envManagerCrowd.next_environment()
    env_pr, _ = envManagerPr.next_environment()
    env_opt, _ = envManagerOpt.next_environment()

    # Extract the meta-features of the image-segmentation dataset
    m = MetaFeatureExtractor(0)
    meta_vec_img = [
        m.num_ins, m.log_num_ins, m.num_feature, m.log_num_feature, m.dimen,
        m.log_dimen, m.inv_dimen, m.log_inv_dimen, m.kurtosis_min,
        m.kurtosis_max, m.kurtosis_mean, m.kurtosis_std, m.skewness_min,
        m.skewness_max, m.skewness_mean, m.skewness_std, m.entropy
    ]
    # Standardize the meta-features (zero mean, unit variance)
    meta_vec_img = (meta_vec_img - np.mean(meta_vec_img)) / np.std(meta_vec_img)
    meta_vec_img = [meta_vec_img] * 8
    meta_vec_img = np.array(meta_vec_img)

    # Extract the meta-features of the Crowdsourced dataset
    m1 = MetaFeatureExtractor(2)
    meta_vec_crowd = [
        m1.num_ins, m1.log_num_ins, m1.num_feature, m1.log_num_feature,
        m1.dimen, m1.log_dimen, m1.inv_dimen, m1.log_inv_dimen,
        m1.kurtosis_min, m1.kurtosis_max, m1.kurtosis_mean, m1.kurtosis_std,
        m1.skewness_min, m1.skewness_max, m1.skewness_mean, m1.skewness_std,
        m1.entropy
    ]
    # Standardize the meta-features (zero mean, unit variance)
    meta_vec_crowd = (meta_vec_crowd - np.mean(meta_vec_crowd)) / np.std(meta_vec_crowd)
    meta_vec_crowd = [meta_vec_crowd] * 8
    meta_vec_crowd = np.array(meta_vec_crowd)

    # Extract the meta-features of the pr-handwritten dataset
    m2 = MetaFeatureExtractor(3)
    meta_vec_pr = [
        m2.num_ins, m2.log_num_ins, m2.num_feature, m2.log_num_feature,
        m2.dimen, m2.log_dimen, m2.inv_dimen, m2.log_inv_dimen,
        m2.kurtosis_min, m2.kurtosis_max, m2.kurtosis_mean, m2.kurtosis_std,
        m2.skewness_min, m2.skewness_max, m2.skewness_mean, m2.skewness_std,
        m2.entropy
    ]
    # Standardize the meta-features (zero mean, unit variance)
    meta_vec_pr = (meta_vec_pr - np.mean(meta_vec_pr)) / np.std(meta_vec_pr)
    meta_vec_pr = [meta_vec_pr] * 8
    meta_vec_pr = np.array(meta_vec_pr)

    # Extract the meta-features of the optdigits dataset
    m3 = MetaFeatureExtractor(4)
    meta_vec_opt = [
        m3.num_ins, m3.log_num_ins, m3.num_feature, m3.log_num_feature,
        m3.dimen, m3.log_dimen, m3.inv_dimen, m3.log_inv_dimen,
        m3.kurtosis_min, m3.kurtosis_max, m3.kurtosis_mean, m3.kurtosis_std,
        m3.skewness_min, m3.skewness_max, m3.skewness_mean, m3.skewness_std,
        m3.entropy
    ]
    # Standardize the meta-features (zero mean, unit variance)
    meta_vec_opt = (meta_vec_opt - np.mean(meta_vec_opt)) / np.std(meta_vec_opt)
    meta_vec_opt = [meta_vec_opt] * 8
    meta_vec_opt = np.array(meta_vec_opt)

    img_agr = np.loadtxt("../MyValidate_time/test-meta3/img_sample.csv",
                         delimiter=",")
    crowd_agr = np.loadtxt(
        "../MyValidate_time/test-meta3/crowdsourced_sample.csv",
        delimiter=",")
    pr_agr = np.loadtxt("../MyValidate_time/test-meta3/pr_sample.csv",
                        delimiter=",")
    opt_agr = np.loadtxt("../MyValidate_time/test-meta3/optdigits_sample.csv",
                         delimiter=",")

    # Train the prediction network (all four datasets are trained together)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # summary_writter = tf.summary.FileWriter(path, sess.graph)
        # Train the prediction network on datasets 1-4 (4 batches of 8 samples each)
        for i in range(4):
            img_agr_train = np.hstack(
                (img_agr[8 * i:8 * (i + 1), :5], meta_vec_img))
            img_agr_label = img_agr[8 * i:8 * (i + 1), 5]
            crowd_agr_train = np.hstack(
                (crowd_agr[8 * i:8 * (i + 1), :5], meta_vec_crowd))
            crowd_agr_label = crowd_agr[8 * i:8 * (i + 1), 5]
            pr_agr_train = np.hstack(
                (pr_agr[8 * i:8 * (i + 1), :5], meta_vec_pr))
            pr_agr_label = pr_agr[8 * i:8 * (i + 1), 5]
            opt_agr_train = np.hstack(
                (opt_agr[8 * i:8 * (i + 1), :5], meta_vec_opt))
            opt_agr_label = opt_agr[8 * i:8 * (i + 1), 5]
            nnet.store_transition(img_agr_train, img_agr_label)
            nnet.store_transition(crowd_agr_train, crowd_agr_label)
            nnet.store_transition(pr_agr_train, pr_agr_label)
            nnet.store_transition(opt_agr_train, opt_agr_label)
            nnet.train_net(sess, i)
        print('------ Prediction network training finished (datasets 1 2 3 4) ------')

        # Random actions for the comparison experiment; shuffle the samples
        action_sample = np.loadtxt(
            "../MyValidate_time/test-meta3/action_sample.csv", delimiter=",")
        agr_sample = np.loadtxt("../MyValidate_time/test-meta3/agr_sample.csv",
                                delimiter=",")
        # Use np.random.shuffle: random.shuffle can duplicate rows of a 2-D ndarray
        np.random.shuffle(action_sample)
        np.random.shuffle(agr_sample)

        # Record all prediction-network rewards
        reward_img_pre_total = []
        reward_crowd_pre_total = []
        reward_pr_pre_total = []
        reward_opt_pre_total = []
        # Number of evaluation rounds
        n_step = 2

        # Predict rewards with the prediction network
        start_time = time.time()
        for i in range(n_step):
            img_ran_action = np.hstack(
                (agr_sample[8 * i:8 * (i + 1), :], meta_vec_img))
            crowd_ran_action = np.hstack(
                (agr_sample[8 * i:8 * (i + 1), :], meta_vec_crowd))
            pr_ran_action = np.hstack(
                (agr_sample[8 * i:8 * (i + 1), :], meta_vec_pr))
            opt_ran_action = np.hstack(
                (agr_sample[8 * i:8 * (i + 1), :], meta_vec_opt))
            reward_img_pre = nnet.get_reward(sess, img_ran_action)
            reward_crowd_pre = nnet.get_reward(sess, crowd_ran_action)
            reward_pr_pre = nnet.get_reward(sess, pr_ran_action)
            reward_opt_pre = nnet.get_reward(sess, opt_ran_action)
            reward_img_pre_total.append(reward_img_pre)
            reward_crowd_pre_total.append(reward_crowd_pre)
            reward_pr_pre_total.append(reward_pr_pre)
            reward_opt_pre_total.append(reward_opt_pre)
        step_time = time.time()
        pre_time = step_time - start_time
        print('Prediction network time cost:', pre_time)
        print('------ Prediction-network reward prediction finished ------')

        # Record all real-environment rewards
        reward_img_true_total = []
        reward_crowd_true_total = []
        reward_pr_true_total = []
        reward_opt_true_total = []
        # Evaluate in the real environments
        start_time = time.time()
        for i in range(n_step):
            reward_img_true = env_img.run(action_sample[8 * i:8 * (i + 1), :])
            reward_crowd_true = env_crowd.run(action_sample[8 * i:8 * (i + 1), :])
            reward_pr_true = env_pr.run(action_sample[8 * i:8 * (i + 1), :])
            reward_opt_true = env_opt.run(action_sample[8 * i:8 * (i + 1), :])
            reward_img_true_total.append(reward_img_true)
            reward_crowd_true_total.append(reward_crowd_true)
            reward_pr_true_total.append(reward_pr_true)
            reward_opt_true_total.append(reward_opt_true)
        step_time = time.time()
        true_time = step_time - start_time
        print('Real environment time cost:', true_time)
        print('------ Real-environment rewards collected ------')

        reward_img_pre_total = np.array(reward_img_pre_total).reshape(n_step, 8)
        reward_img_true_total = np.array(reward_img_true_total).reshape(n_step, 8)
        reward_crowd_pre_total = np.array(reward_crowd_pre_total).reshape(n_step, 8)
        reward_crowd_true_total = np.array(reward_crowd_true_total).reshape(n_step, 8)
        reward_pr_pre_total = np.array(reward_pr_pre_total).reshape(n_step, 8)
        reward_pr_true_total = np.array(reward_pr_true_total).reshape(n_step, 8)
        reward_opt_pre_total = np.array(reward_opt_pre_total).reshape(n_step, 8)
        reward_opt_true_total = np.array(reward_opt_true_total).reshape(n_step, 8)

        # Distance between the real rewards and the rewards predicted by the
        # prediction network (square of the mean prediction error)
        dis_pre = np.square(
            np.mean(reward_img_pre_total - reward_img_true_total))
        dis_crowd = np.square(
            np.mean(reward_crowd_pre_total - reward_crowd_true_total))
        dis_pr = np.square(np.mean(reward_pr_pre_total - reward_pr_true_total))
        dis_opt = np.square(
            np.mean(reward_opt_pre_total - reward_opt_true_total))
        print('Error on dataset 1:', dis_pre)
        print('Error on dataset 2:', dis_crowd)
        print('Error on dataset 3:', dis_pr)
        print('Error on dataset 4:', dis_opt)
        print('------ Experiment finished ------')
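# The four meta-feature blocks in t_main_meta_diff are identical except for the
# MetaFeatureExtractor index. The helper below is a hypothetical refactoring
# sketch, not part of the original code: the name `_build_meta_vec` and the
# default batch size of 8 are assumptions taken from the usage above. It builds
# one standardized 17-dimensional meta-feature row and repeats it per sample.
def _build_meta_vec(extractor_id, batch_size=8):
    """Return a (batch_size, 17) array of standardized meta-features."""
    m = MetaFeatureExtractor(extractor_id)
    vec = np.array([
        m.num_ins, m.log_num_ins, m.num_feature, m.log_num_feature, m.dimen,
        m.log_dimen, m.inv_dimen, m.log_inv_dimen, m.kurtosis_min,
        m.kurtosis_max, m.kurtosis_mean, m.kurtosis_std, m.skewness_min,
        m.skewness_max, m.skewness_mean, m.skewness_std, m.entropy
    ], dtype=np.float64)
    vec = (vec - np.mean(vec)) / np.std(vec)   # z-score standardization
    return np.tile(vec, (batch_size, 1))       # one identical row per batch sample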
def t_main_pre(data_manager, plot_time_reward):
    m = MetaFeatureExtractor(4)
    meta_vec = [
        m.num_ins, m.log_num_ins, m.num_feature, m.log_num_feature, m.dimen,
        m.log_dimen, m.inv_dimen, m.log_inv_dimen, m.kurtosis_min,
        m.kurtosis_max, m.kurtosis_mean, m.kurtosis_std, m.skewness_min,
        m.skewness_max, m.skewness_mean, m.skewness_std, m.entropy
    ]
    meta_vec = [meta_vec] * 8
    meta_vec = np.array(meta_vec)
    envManager = EnvironmentManager(data_manager)
    envManager.auto_create_multi_singleprocess_envs()
    plot_data = {
        "time": [],
        "rewards_max": [],
        "rewards_mean": [],
        "reward_min": []
    }
    for i in range(1):
        # The current Env is a random forest; params holds the number of candidate
        # values per hyperparameter, e.g. [12 10 21 21 9]
        Env, params = envManager.next_environment()
        agent = LSTM(params)
        nnet = NNet(len(params) + 17)
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            baseline_reward = 0
            a = [0, 1]
            init_input = np.array(a).reshape(1, 2)
            start_time = time.time()
            summary_writter = tf.summary.FileWriter("Mylog/test-meta2",
                                                    sess.graph)
            # Sampling loop (5 rounds in this test)
            for j in range(5):
                # init_input: [[0 1]]
                # action is the selected action; agr_params are the corresponding
                # values under the normal distribution
                x, agr_params, action, _ = agent.getArgParams(sess, init_input)
                data1 = pd.DataFrame(agr_params)
                data2 = pd.DataFrame(action)
                data1.to_csv('../MyValidate_time/test-meta3/agr_sample.csv',
                             index=False, header=False, mode='a')
                data2.to_csv('../MyValidate_time/test-meta3/action_sample.csv',
                             index=False, header=False, mode='a')
                # print('Action obtained this round:', action)
                # stack_data = np.hstack((agr_params, meta_vec))
                # # Use the prediction network
                # # Train the prediction network for the first 25 rounds
                # if j <= MainConfig.t1:
                #     rewards = Env.run(action)
                #     # Append the samples to the sample file
                #     data1 = pd.DataFrame(np.c_[agr_params, rewards])
                #     # data1.to_csv('../MyValidate_time/test-meta3/pr_sample.csv', index=False, header=False, mode='a')
                #     # data1.to_csv('../MyValidate_time/test-meta3/img_sample.csv', index=False, header=False, mode='a')
                #     # data1.to_csv('../MyValidate_time/test-meta3/crowdsourced_sample.csv', index=False, header=False,
                #     #              mode='a')
                #     data1.to_csv('../MyValidate_time/test-meta3/optdigits_sample.csv', index=False, header=False, mode='a')
                #     summarize(summary_writter, np.max(rewards), j, 'max_reward')
                #     summarize(summary_writter, np.mean(rewards), j, 'mean_reward')
                #     step_time = time.time()
                #     one_time = step_time - start_time
                #     plot_data["time"].append(one_time)
                #     plot_data["rewards_max"].append(np.max(rewards))
                #     plot_data["rewards_mean"].append(np.mean(rewards))
                #     plot_data["reward_min"].append(np.min(rewards))
                #     nnet.store_transition(stack_data, rewards)
                #     nnet.train_net(sess, j)
                #     nnet.train_net(sess, j)
                # if j > MainConfig.t1 and j < MainConfig.t2:
                #     rewards = nnet.get_reward(sess, stack_data)
                #     rewards = np.array(rewards).reshape(AgentConfig.batch_size)
                #     summarize(summary_writter, np.max(rewards), j, 'max_reward')
                #     summarize(summary_writter, np.mean(rewards), j, 'mean_reward')
                # if j >= MainConfig.t2:
                #     rewards = Env.run(action)
                #     step_time = time.time()
                #     one_time = step_time - start_time
                #     plot_data["time"].append(one_time)
                #     plot_data["rewards_max"].append(np.max(rewards))
                #     plot_data["rewards_mean"].append(np.mean(rewards))
                #     plot_data["reward_min"].append(np.min(rewards))
                #     summarize(summary_writter, np.max(rewards), j, 'max_reward')
                #     summarize(summary_writter, np.mean(rewards), j, 'mean_reward')
                # if j % 100 == 0:
                #     plot = pd.DataFrame(data=plot_data)
                #     plot.to_csv(plot_time_reward, index=False)
                # if j == 0:
                #     baseline_reward = np.mean(rewards)
                # print("else: normal training, rewards:", rewards)
                # loss, ratio = agent.learn(False, sess, x, agr_params, rewards, baseline_reward, j)
                # print("i=", i, " j=", j, "average_reward=", np.mean(rewards), " baseline_reward=", baseline_reward,
                #       " loss=", loss, "\n")
                # summarize(summary_writter, loss, j, 'loss')
                # summarize(summary_writter, ratio, j, 'ratio')
                # reward_c = np.mean(rewards)
                # baseline_reward = baseline_reward * AgentConfig.dr + (1 - AgentConfig.dr) * reward_c
            # Save the real-time reward and time of every step
            # plot = pd.DataFrame(data=plot_data)
            # plot.to_csv(plot_time_reward, index=False)
            print("--------- Training finished! ----------")
def t_main_no_meta_diff():
    # Environment initialization
    nnet = NNet(5)
    data_manager_img = DataManager(6)
    data_manager_crowd = DataManager(12)
    data_manager_pr = DataManager(14)
    data_manager_opt = DataManager(9)
    envManagerImg = EnvironmentManager(data_manager_img)
    envManagerCrowd = EnvironmentManager(data_manager_crowd)
    envManagerPr = EnvironmentManager(data_manager_pr)
    envManagerOpt = EnvironmentManager(data_manager_opt)
    envManagerImg.auto_create_multi_singleprocess_envs()
    envManagerCrowd.auto_create_multi_singleprocess_envs()
    envManagerPr.auto_create_multi_singleprocess_envs()
    envManagerOpt.auto_create_multi_singleprocess_envs()
    env_img, _ = envManagerImg.next_environment()
    env_crowd, _ = envManagerCrowd.next_environment()
    env_pr, _ = envManagerPr.next_environment()
    env_opt, _ = envManagerOpt.next_environment()

    img_agr = np.loadtxt("../MyValidate_time/test-meta3/img_sample.csv",
                         delimiter=",")
    crowd_agr = np.loadtxt(
        "../MyValidate_time/test-meta3/crowdsourced_sample.csv",
        delimiter=",")
    pr_agr = np.loadtxt("../MyValidate_time/test-meta3/pr_sample.csv",
                        delimiter=",")
    opt_agr = np.loadtxt("../MyValidate_time/test-meta3/optdigits_sample.csv",
                         delimiter=",")

    # Train the prediction network (all four datasets are trained together)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # summary_writter = tf.summary.FileWriter(path, sess.graph)
        # Train the prediction network on datasets 1-4 (4 batches of 8 samples each)
        for i in range(4):
            img_agr_train = img_agr[8 * i:8 * (i + 1), :5]
            img_agr_label = img_agr[8 * i:8 * (i + 1), 5]
            crowd_agr_train = crowd_agr[8 * i:8 * (i + 1), :5]
            crowd_agr_label = crowd_agr[8 * i:8 * (i + 1), 5]
            pr_agr_train = pr_agr[8 * i:8 * (i + 1), :5]
            pr_agr_label = pr_agr[8 * i:8 * (i + 1), 5]
            opt_agr_train = opt_agr[8 * i:8 * (i + 1), :5]
            opt_agr_label = opt_agr[8 * i:8 * (i + 1), 5]
            nnet.store_transition(img_agr_train, img_agr_label)
            nnet.store_transition(crowd_agr_train, crowd_agr_label)
            nnet.store_transition(pr_agr_train, pr_agr_label)
            nnet.store_transition(opt_agr_train, opt_agr_label)
            nnet.train_net(sess, i)
        print('------ Prediction network training finished (datasets 1 2 3 4) ------')

        # Random actions for the comparison experiment; shuffle the samples
        action_sample = np.loadtxt(
            "../MyValidate_time/test-meta3/action_sample.csv", delimiter=",")
        agr_sample = np.loadtxt("../MyValidate_time/test-meta3/agr_sample.csv",
                                delimiter=",")
        # Use np.random.shuffle: random.shuffle can duplicate rows of a 2-D ndarray
        np.random.shuffle(action_sample)
        np.random.shuffle(agr_sample)

        # Record all prediction-network rewards
        reward_img_pre_total = []
        reward_crowd_pre_total = []
        reward_pr_pre_total = []
        reward_opt_pre_total = []
        # Number of evaluation rounds
        n_step = 2

        # Predict rewards with the prediction network (no meta-features, so the
        # same hyperparameter rows are fed for every dataset)
        start_time = time.time()
        for i in range(n_step):
            img_ran_action = agr_sample[8 * i:8 * (i + 1), :]
            crowd_ran_action = agr_sample[8 * i:8 * (i + 1), :]
            pr_ran_action = agr_sample[8 * i:8 * (i + 1), :]
            opt_ran_action = agr_sample[8 * i:8 * (i + 1), :]
            reward_img_pre = nnet.get_reward(sess, img_ran_action)
            reward_crowd_pre = nnet.get_reward(sess, crowd_ran_action)
            reward_pr_pre = nnet.get_reward(sess, pr_ran_action)
            reward_opt_pre = nnet.get_reward(sess, opt_ran_action)
            reward_img_pre_total.append(reward_img_pre)
            reward_crowd_pre_total.append(reward_crowd_pre)
            reward_pr_pre_total.append(reward_pr_pre)
            reward_opt_pre_total.append(reward_opt_pre)
        step_time = time.time()
        pre_time = step_time - start_time
        print('Prediction network time cost:', pre_time)
        print('------ Prediction-network reward prediction finished ------')

        # Record all real-environment rewards
        reward_img_true_total = []
        reward_crowd_true_total = []
        reward_pr_true_total = []
        reward_opt_true_total = []
        # Evaluate in the real environments
        start_time = time.time()
        for i in range(n_step):
            reward_img_true = env_img.run(action_sample[8 * i:8 * (i + 1), :])
            reward_crowd_true = env_crowd.run(action_sample[8 * i:8 * (i + 1), :])
            reward_pr_true = env_pr.run(action_sample[8 * i:8 * (i + 1), :])
            reward_opt_true = env_opt.run(action_sample[8 * i:8 * (i + 1), :])
            reward_img_true_total.append(reward_img_true)
            reward_crowd_true_total.append(reward_crowd_true)
            reward_pr_true_total.append(reward_pr_true)
            reward_opt_true_total.append(reward_opt_true)
        step_time = time.time()
        true_time = step_time - start_time
        print('Real environment time cost:', true_time)
        print('------ Real-environment rewards collected ------')

        reward_img_pre_total = np.array(reward_img_pre_total).reshape(n_step, 8)
        reward_img_true_total = np.array(reward_img_true_total).reshape(n_step, 8)
        reward_crowd_pre_total = np.array(reward_crowd_pre_total).reshape(n_step, 8)
        reward_crowd_true_total = np.array(reward_crowd_true_total).reshape(n_step, 8)
        reward_pr_pre_total = np.array(reward_pr_pre_total).reshape(n_step, 8)
        reward_pr_true_total = np.array(reward_pr_true_total).reshape(n_step, 8)
        reward_opt_pre_total = np.array(reward_opt_pre_total).reshape(n_step, 8)
        reward_opt_true_total = np.array(reward_opt_true_total).reshape(n_step, 8)

        # Distance between the real rewards and the rewards predicted by the
        # prediction network (square of the mean prediction error)
        dis_pre = np.square(
            np.mean(reward_img_pre_total - reward_img_true_total))
        dis_crowd = np.square(
            np.mean(reward_crowd_pre_total - reward_crowd_true_total))
        dis_pr = np.square(np.mean(reward_pr_pre_total - reward_pr_true_total))
        dis_opt = np.square(
            np.mean(reward_opt_pre_total - reward_opt_true_total))
        print('Error on dataset 1:', dis_pre)
        print('Error on dataset 2:', dis_crowd)
        print('Error on dataset 3:', dis_pr)
        print('Error on dataset 4:', dis_opt)
        print('------ Experiment finished ------')
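# Both t_main_meta_diff and t_main_no_meta_diff report the same error metric:
# the square of the mean difference between predicted and real rewards. The
# helper below is a hypothetical extraction of that computation (the name
# `_prediction_error` is not in the original code). Note that it is the squared
# mean error, not the mean squared error.
def _prediction_error(pred, true):
    """Square of the mean (pred - true) difference over all samples."""
    pred = np.asarray(pred, dtype=np.float64)
    true = np.asarray(true, dtype=np.float64)
    return np.square(np.mean(pred - true))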
def t_main_bp_pre(data_manager, plot_time_reward):
    envManager = EnvironmentManager(data_manager)
    envManager.auto_create_multi_singleprocess_envs()
    plot_data = {
        "time": [],
        "rewards_max": [],
        "rewards_mean": [],
        "reward_min": []
    }
    real_data = {"reward": [], "action": []}
    pre_data = {"reward": [], "action": []}
    for i in range(1):
        Env, params = envManager.next_environment()
        agent = LSTM(params)
        nnet = NNet(len(params))
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            baseline_reward = 0
            a = [0, 1]
            init_input = np.array(a).reshape(1, 2)
            start_time = time.time()
            for i in range(1):
                nnet._init_net(sess, len(params))
                # Phase 1: train the agent and the prediction network in the real environment
                for j in range(MainConfig.num_train):
                    x, agr_params, action, _ = agent.getArgParams(
                        sess, init_input)
                    np.set_printoptions(suppress=True)
                    # Evaluate the action in the real environment
                    rewards = Env.run(action)
                    for ii in range(8):
                        real_data["action"].append(agr_params[ii])
                        real_data["reward"].append(rewards[ii])
                    np.savetxt(
                        "../validate_time/params_data_agent(chen)/real_data_action.csv",
                        real_data["action"], delimiter=',')
                    np.savetxt(
                        "../validate_time/params_data_agent(chen)/real_data_reward.csv",
                        real_data["reward"], delimiter=',')
                    step_time = time.time()
                    one_time = step_time - start_time
                    plot_data["time"].append(one_time)
                    plot_data["rewards_max"].append(np.max(rewards))
                    plot_data["rewards_mean"].append(np.mean(rewards))
                    plot_data["reward_min"].append(np.min(rewards))
                    nnet.store_transition(agr_params, rewards)
                    nnet.train_net(sess, j)
                    nnet.train_net(sess, j)
                    # Write the intermediate curve (parentheses added: the original
                    # `j % MainConfig.num_train - 1 == 0` only fired at j == 1)
                    if j % (MainConfig.num_train - 1) == 0:
                        plot = pd.DataFrame(data=plot_data)
                        plot.to_csv(plot_time_reward, index=False)
                    agent.check_topData(x, agr_params, rewards)
                    if j == 0:
                        baseline_reward = np.mean(rewards)
                    # Every ten steps, use the guided data pool (the hyperparameter
                    # values with the best rewards) to reduce variance
                    if (j + 1) % 10 == 0:
                        x, agr_params, rewards = agent.getInput()
                        print("if: algorithm rectify, rewards:",
                              np.array(rewards).flatten())
                        loss, ratio = agent.learn(True, sess, x, agr_params,
                                                  rewards, baseline_reward, j)
                        print("i=", i, " j=", j, "average_reward=",
                              np.mean(rewards), " baseline_reward=",
                              baseline_reward, " loss=", loss, "\n")
                    else:
                        print("else: normal training, rewards:", rewards)
                        loss, ratio = agent.learn(False, sess, x, agr_params,
                                                  rewards, baseline_reward, j)
                        print("i=", i, " j=", j, "average_reward=",
                              np.mean(rewards), " baseline_reward=",
                              baseline_reward, " loss=", loss, "\n")
                    reward_c = np.mean(rewards)
                    baseline_reward = baseline_reward * AgentConfig.dr + (
                        1 - AgentConfig.dr) * reward_c
                # Phase 2: train the agent against the prediction network only
                for j in range(MainConfig.pre_num):
                    x, agr_params, action, _ = agent.getArgParams(
                        sess, init_input)
                    np.set_printoptions(suppress=True)
                    # Use the model (prediction network) instead of the real environment
                    rewards = nnet.get_reward(sess, agr_params)
                    for ii in range(8):
                        pre_data["action"].append(agr_params[ii])
                        pre_data["reward"].append(rewards[ii])
                    # if j % (MainConfig.pre_num - 1) == 0:
                    np.savetxt(
                        "../validate_time/params_data_agent(chen)/pre_data_action.csv",
                        pre_data["action"], delimiter=',')
                    np.savetxt(
                        "../validate_time/params_data_agent(chen)/pre_data_reward.csv",
                        pre_data["reward"], delimiter=',')
                    rewards = np.array(rewards).reshape(AgentConfig.batch_size)
                    loss, ratio = agent.learn(False, sess, x, agr_params,
                                              rewards, baseline_reward, j)
                    reward_c = np.mean(rewards)
                    baseline_reward = baseline_reward * AgentConfig.dr + (
                        1 - AgentConfig.dr) * reward_c
            # Save the real-time reward and time of every step
            plot = pd.DataFrame(data=plot_data)
            plot.to_csv(plot_time_reward, index=False)
            print("--------- Training finished! ----------")
def t_main_bp_pre_test(data_manager, plot_time_reward):
    envManager = EnvironmentManager(data_manager)
    envManager.auto_create_multi_singleprocess_envs()
    plot_data = {
        "time": [],
        "rewards_max": [],
        "rewards_mean": [],
        "reward_min": []
    }
    for i in range(1):
        Env, params = envManager.next_environment()
        agent = LSTM(params)
        nnet = NNet(len(params))
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            baseline_reward = 0
            a = [0, 1]
            init_input = np.array(a).reshape(1, 2)
            start_time = time.time()
            summary_writter_reward = tf.summary.FileWriter("log/reward", sess.graph)
            summary_writter_reward_real = tf.summary.FileWriter("log/reward_real", sess.graph)
            summary_writter_baseline = tf.summary.FileWriter("log/baseline", sess.graph)
            summary_writter_baseline_real = tf.summary.FileWriter("log/baseline_real", sess.graph)
            summary_writter_err = tf.summary.FileWriter("log/err", sess.graph)
            for j in range(MainConfig.num_train):
                x, agr_params, action, _ = agent.getArgParams(sess, init_input)
                # First 25 steps: real environment, and train the prediction network
                if j <= 25:
                    rewards = Env.run(action)
                    nnet.store_transition(agr_params, rewards)
                    nnet.train_net(sess, j)
                    nnet.train_net(sess, j)
                    rewards_real = rewards
                # Afterwards: predicted rewards, with real rewards kept for comparison
                if j > 25:
                    rewards = nnet.get_reward(sess, agr_params)
                    rewards = np.array(rewards).reshape(AgentConfig.batch_size)
                    rewards_real = Env.run(action)
                summarize(summary_writter_reward, np.mean(rewards), j, "reward")
                summarize(summary_writter_reward_real, np.mean(rewards_real), j, "reward")
                # if j >= 150:
                #     rewards = Env.run(action)
                # Record the top-ranked data and feed it back into training at a suitable time
                # if j <= 25 or j >= 150:
                #     agent.check_topData(x, agr_params, rewards)
                if j == 0:
                    baseline_reward = np.mean(rewards)
                    baseline_reward_real = np.mean(rewards_real)
                # # Every ten steps, use the guided data pool (the hyperparameter
                # # values with the best rewards) to reduce variance
                # if (j + 1) % 10 == 0:
                #     x, agr_params, rewards = agent.getInput()
                #     print("if: algorithm rectify, rewards:", np.array(rewards).flatten())
                #     loss, ratio = agent.learn(True, sess, x, agr_params, rewards, baseline_reward, j)
                #     print("i=", i, " j=", j, "average_reward=", np.mean(rewards), " baseline_reward=",
                #           baseline_reward,
                #           " loss=", loss, "\n")
                # else:
                print("else: normal training, rewards:", rewards)
                loss, ratio = agent.learn(False, sess, x, agr_params, rewards,
                                          baseline_reward, j)
                print("i=", i, " j=", j, "average_reward=", np.mean(rewards),
                      " baseline_reward=", baseline_reward,
                      " loss=", loss, "\n")
                reward_c = np.mean(rewards)
                reward_c_real = np.mean(rewards_real)
                baseline_reward = baseline_reward * AgentConfig.dr + (
                    1 - AgentConfig.dr) * reward_c
                baseline_reward_real = baseline_reward_real * AgentConfig.dr + (
                    1 - AgentConfig.dr) * reward_c_real
                # Gap between (predicted reward - predicted baseline) and
                # (real reward - real baseline)
                err_all = (reward_c - np.mean(baseline_reward)) - (
                    reward_c_real - np.mean(baseline_reward_real))
                summarize(summary_writter_baseline, np.mean(baseline_reward), j, "baseline")
                summarize(summary_writter_baseline_real, np.mean(baseline_reward_real), j, "baseline")
                summarize(summary_writter_err, np.mean(err_all), j, "err")
            # Save the real-time reward and time of every step
            plot = pd.DataFrame(data=plot_data)
            plot.to_csv(plot_time_reward, index=False)
            print("--------- Training finished! ----------")
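# Hypothetical entry point, not part of the original file: the t_main_* tests
# take a DataManager and a CSV path for the time/reward curve. The dataset
# index (9) and the output path below are assumptions chosen only to show how
# one of the tests could be launched.
if __name__ == "__main__":
    dm = DataManager(9)                                        # assumed dataset index
    t_main_1(dm, "../MyValidate_time/test1_time_reward.csv")   # assumed output path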