def test_action_mapping(self):
    origin_act = np.array([-1.0, 0.0, 1.0])

    mapped_act = action_mapping(origin_act, 0.0, 1.0)
    self.assertListEqual(list(mapped_act), [0.0, 0.5, 1.0])

    mapped_act = action_mapping(origin_act, -2.0, 2.0)
    self.assertListEqual(list(mapped_act), [-2.0, 0.0, 2.0])

    mapped_act = action_mapping(origin_act, -5.0, 10.0)
    self.assertListEqual(list(mapped_act), [-5.0, 2.5, 10.0])
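# The assertions above pin down what action_mapping is expected to do: linearly
# rescale an action from [-1.0, 1.0] into [low_bound, high_bound]. The real helper
# is imported from parl.utils; the re-implementation below is only an illustrative
# sketch consistent with those test cases.
import numpy as np

def action_mapping_sketch(model_output_act, low_bound, high_bound):
    """Linearly map an action from [-1.0, 1.0] to [low_bound, high_bound]."""
    assert high_bound > low_bound
    # Shift [-1, 1] to [0, 2], then scale by (high - low) / 2 and offset by low.
    return low_bound + (model_output_act + 1.0) * (high_bound - low_bound) / 2.0

# For example, action_mapping_sketch(np.array([-1.0, 0.0, 1.0]), -5.0, 10.0)
# returns array([-5. ,  2.5, 10. ]), matching the last assertion above.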
def run_evaluate_episode(env, agent, render):
    obs = env.reset()
    total_reward = 0
    episode_goal = np.expand_dims(obs[-3:], axis=0)
    steps = 0
    while steps < MAX_STEPS_PER_EPISODES:
        steps += 1
        batch_obs = np.expand_dims(obs[8:15], axis=0)
        batch_obs_with_goal = np.concatenate((batch_obs, episode_goal), axis=1)
        action = agent.predict(batch_obs_with_goal.astype('float32'))
        action = np.squeeze(action)
        action = action_mapping(action, env.action_space.low[0],
                                env.action_space.high[0])
        next_obs, reward, done, info = env.step(action)
        if render:
            env.render()
        # print(reward)
        obs = next_obs
        total_reward += reward
        if done:
            break
    return total_reward
def evaluate(env, agent, render=False):
    eval_reward = []
    for i in range(5):
        obs = env.reset()
        total_reward, steps = 0, 0
        while True:
            batch_obs = np.expand_dims(obs, axis=0)
            pred_action = agent.predict(batch_obs.astype('float32'))
            pred_action = np.squeeze(pred_action)
            env_action = pred_action[0] + 0.2 * pred_action[1:]
            env_action = np.clip(env_action, -1.0, 1.0)
            env_action = action_mapping(env_action, env.action_space.low[0],
                                        env.action_space.high[0])
            next_obs, reward, done, info = env.step(env_action)
            obs = next_obs
            total_reward += reward
            steps += 1
            if render:
                env.render()
            if done:
                break
        print("Test episode {}, reward: {}".format(i, total_reward))
        eval_reward.append(total_reward)
    return np.mean(eval_reward)
def evaluate(env, agent):
    eval_reward = []
    for i in range(5):
        obs = env.reset()
        total_reward, steps = 0, 0
        while True:
            batch_obs = np.expand_dims(obs, axis=0)
            pred_action = agent.predict(batch_obs.astype('float32'))
            pred_action = np.squeeze(pred_action)
            env_action = pred_action[0] + 0.2 * pred_action[1:]
            # CHANGE: evaluate with the same exploration noise used in training
            # instead of the deterministic clip; note that this overwrites the
            # main-plus-offset combination computed on the previous line.
            # env_action = np.clip(env_action, -1.0, 1.0)
            env_action = np.clip(
                np.random.normal(pred_action, EXPL_NOISE * max_action), -1.0, 1.0)
            env_action = action_mapping(env_action, env.action_space.low[0],
                                        env.action_space.high[0])
            next_obs, reward, done, info = env.step(env_action)
            obs = next_obs
            total_reward += reward
            steps += 1
            if done:
                break
        eval_reward.append(total_reward)
    return np.mean(eval_reward)
def evaluate1(env, agent, gm):
    eval_reward = []
    for i in range(5):
        obs = env.reset()
        total_reward, steps = 0, 0
        while True:
            batch_obs = np.expand_dims(obs, axis=0)
            action = agent.predict(batch_obs.astype('float32'))
            action = np.squeeze(action)
            # Added: take the 5th output as the mean voltage to stabilize the
            # action (same reasoning as above).
            mean_a = action[4]
            action = action[0:4]
            # gm controls how much each voltage may float around the mean.
            action = gm * action + (1 - gm) * mean_a
            # Added clip to avoid out-of-range errors.
            action = np.clip(action, -1.0, 1.0)
            action = action_mapping(action, env.action_space.low[0],
                                    env.action_space.high[0])
            next_obs, reward, done, info = env.step(action)
            obs = next_obs
            total_reward += reward
            steps += 1
            if done:
                break
        eval_reward.append(total_reward)
        print("Evaluation episode finished: gm = {}, total_reward = {}".format(
            gm, total_reward))
    return np.mean(eval_reward)
def evaluate(env, agent):
    eval_reward = []
    for i in range(5):
        obs = env.reset()
        total_reward, steps = 0, 0
        while True:
            batch_obs = np.expand_dims(obs, axis=0)
            action = agent.predict(batch_obs.astype('float32'))
            action = np.squeeze(action)
            # Added: take the 5th output as the mean voltage to stabilize the
            # action (same reasoning as above).
            mean_a = action[4]
            action = action[0:4]
            # GM (a global constant, set to 0.2) controls how much each voltage
            # may float around the mean.
            action = GM * action + mean_a
            # Added clip to avoid out-of-range errors.
            action = np.clip(action, -1.0, 1.0)
            action = action_mapping(action, env.action_space.low[0],
                                    env.action_space.high[0])
            next_obs, reward, done, info = env.step(action)
            obs = next_obs
            total_reward += reward
            steps += 1
            if done:
                break
        eval_reward.append(total_reward)
    return np.mean(eval_reward)
def evaluate(env, agent):
    eval_reward = []
    for i in range(3):
        obs = env.reset()
        total_reward, steps = 0, 0
        while True:
            batch_obs = np.expand_dims(obs, axis=0)
            action = agent.predict(batch_obs.astype('float32'))
            action = np.squeeze(action)
            main_action = action[0]
            sub_action = action[1:]
            # sub_action = np.random.normal(sub_action, 0.01)
            action = [main_action + 0.2 * x for x in sub_action]
            action = np.clip(action, -1.0, 1.0)
            action = action_mapping(action, env.action_space.low[0],
                                    env.action_space.high[0])
            next_obs, reward, done, info = env.step(action)
            obs = next_obs
            total_reward += reward
            steps += 1
            if done:
                break
        eval_reward.append(total_reward)
    return np.mean(eval_reward)
def run_episode(env, agent, rpm):
    total_reward, steps = 0, 0
    obs = env.reset()
    while True:
        steps += 1
        batch_obs = np.expand_dims(obs, axis=0)
        action = agent.predict(batch_obs.astype("float32"))
        action = np.squeeze(action)
        # Add Gaussian exploration noise and clip to [-1.0, 1.0].
        action = np.clip(np.random.normal(action, 1.0), -1.0, 1.0)
        # Map the action to the actual voltage range.
        action = action_mapping(action, env.action_space.low[0],
                                env.action_space.high[0])
        # Pull the four voltages toward their mean; this deliberate guidance
        # makes convergence easier.
        means = np.mean(action)
        action = action + gamma * (means - action)
        next_obs, reward, done, info = env.step(action)
        rpm.append(obs, action, REWARD_SCALE * reward, next_obs, done)
        if rpm.size() > MEMORY_WARMUP_SIZE:
            batch_obs, batch_action, batch_reward, batch_next_obs, \
                batch_terminal = rpm.sample_batch(BATCH_SIZE)
            critic_cost = agent.learn(batch_obs, batch_action, batch_reward,
                                      batch_next_obs, batch_terminal)
        obs = next_obs
        total_reward += reward
        if done:
            break
    return total_reward, steps
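# The mean-pulling step above is a convex combination of each voltage with the
# mean of all four: action + gamma * (means - action) == (1 - gamma) * action +
# gamma * means. A small self-contained check of that identity (gamma = 0.2 is
# an illustrative value; the actual constant is defined elsewhere in the
# original script):
import numpy as np

gamma_demo = 0.2
action_demo = np.array([0.9, -0.3, 0.1, 0.5])
means_demo = np.mean(action_demo)
pulled = action_demo + gamma_demo * (means_demo - action_demo)
blended = (1 - gamma_demo) * action_demo + gamma_demo * means_demo
assert np.allclose(pulled, blended)
# The spread between the four values shrinks by a factor of (1 - gamma), which
# is what "pulling the voltages toward their mean" achieves.
assert np.isclose(np.ptp(pulled), (1 - gamma_demo) * np.ptp(action_demo))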
def run_episode(env, agent, rpm, render=False):
    step = 0
    total_reward = 0
    obs = env.reset()
    while True:
        step += 1
        batch_obs = np.expand_dims(obs, axis=0)
        action = agent.predict(batch_obs)
        action = np.squeeze(action)
        action = np.random.normal(action, 1.0)
        action = np.clip(action, -1.0, 1.0)
        actuall = action
        actuall = action_mapping(actuall, env.action_space.low[0],
                                 env.action_space.high[0])
        next_obs, reward, done, info = env.step(actuall)
        # Shape the reward with the velocity-tracking error (absolute and
        # squared terms for each axis).
        vx_1 = abs(info['b_v_x'] - info['next_target_g_v_x'])
        vy_1 = abs(info['b_v_y'] - info['next_target_g_v_y'])
        vz_1 = abs(info['b_v_z'] - info['next_target_g_v_z'])
        vx_2 = pow(info['b_v_x'] - info['next_target_g_v_x'], 2)
        vy_2 = pow(info['b_v_y'] - info['next_target_g_v_y'], 2)
        vz_2 = pow(info['b_v_z'] - info['next_target_g_v_z'], 2)
        reward_adept = -0.1 * (vx_1 + vy_1 + vz_1 + vx_2 + vy_2 + vz_2)
        rpm.append(obs, action, Reward_Scale * reward_adept, next_obs, done)
        if rpm.size() > Memory_Warm_Up:
            batch_obs, batch_act, batch_reward, batch_next_obs, batch_done = \
                rpm.sample_batch(Batch_Size)
            C_cost = agent.learn(batch_obs, batch_act, batch_reward,
                                 batch_next_obs, batch_done)
        obs = next_obs
        total_reward += reward_adept
        if render:
            env.render()
        if done:
            break
    return step, total_reward
def run_train_episode(env, agent, rpm, reward_scale, warmup_size, batch_size,
                      expl_noise):
    obs = env.reset()
    total_reward, steps = 0, 0
    critic_cost, actor_cost = 0, 0
    low_act, high_act = env.action_space.low[0], env.action_space.high[0]
    while True:
        steps += 1
        batch_obs = np.expand_dims(obs, axis=0)
        if rpm.size() < warmup_size:
            action = env.action_space.sample()
        elif hasattr(agent, "sample_program"):
            action = agent.sample(batch_obs.astype('float32'))
        else:
            action = agent.predict(batch_obs.astype('float32'))
            action = np.clip(
                np.random.normal(action, high_act * expl_noise), -high_act,
                high_act)
        action = np.clip(action / high_act, -1.0, 1.0)
        action = action_mapping(action, low_act, high_act)
        next_obs, reward, done, info = env.step(action)
        rpm.append(obs, action, reward_scale * reward, next_obs, done)
        if rpm.size() > warmup_size:
            batch_obs, batch_action, batch_reward, batch_next_obs, batch_terminal = \
                rpm.sample_batch(batch_size)
            critic_cost, actor_cost = agent.learn(batch_obs, batch_action,
                                                  batch_reward, batch_next_obs,
                                                  batch_terminal)
        obs = next_obs
        total_reward += reward
        if done:
            break
    return total_reward, steps, critic_cost, actor_cost
def evaluate_episode(env, agent, render=False):
    eval_reward = []
    for i in range(5):
        obs = env.reset()
        total_reward = 0
        target_pose = np.expand_dims(obs[-3:], axis=0)
        for t in range(EPISODE_LENGTH):
            batch_obs = np.expand_dims(obs[8:15], axis=0)
            batch_obs_full = np.concatenate((batch_obs, target_pose), axis=1)
            action = agent.predict(batch_obs_full.astype('float32'))
            # Add gripper action again
            action = np.append(action, 0)
            action = np.squeeze(action)
            action = action_mapping(action, env.action_space.low[0],
                                    env.action_space.high[0])
            next_obs, reward, done, info = env.step(action)
            obs = next_obs
            total_reward += reward
            if render:
                env.render()
            if done:
                break
        eval_reward.append(total_reward)
    return np.mean(eval_reward)
def run_episode(env, agent, rpm):
    obs = env.reset()
    total_reward, steps = 0, 0
    while True:
        steps += 1
        batch_obs = np.expand_dims(obs, axis=0)
        action = agent.predict(batch_obs.astype('float32'))
        action = np.squeeze(action)
        # Add exploration noise, and clip to [-1.0, 1.0]
        action = np.clip(np.random.normal(action, 1.0), -1.0, 1.0)
        action = action_mapping(action, env.action_space.low[0],
                                env.action_space.high[0])
        next_obs, reward, done, info = env.step(action)
        rpm.append(obs, action, REWARD_SCALE * reward, next_obs, done)
        if rpm.size() > MEMORY_WARMUP_SIZE:
            batch_obs, batch_action, batch_reward, batch_next_obs, \
                batch_terminal = rpm.sample_batch(BATCH_SIZE)
            critic_cost = agent.learn(batch_obs, batch_action, batch_reward,
                                      batch_next_obs, batch_terminal)
        obs = next_obs
        total_reward += reward
        if done:
            break
    return total_reward, steps
def evaluate(env, agent):
    eval_reward = []
    for i in range(5):
        obs = env.reset()
        total_reward, steps = 0, 0
        while True:
            if obs.shape[0] == 19:
                # yaw = obs[14]
                # pitch = obs[12]
                # roll = obs[13]
                next_target_g_v_x = obs[16]
                next_target_g_v_y = obs[17]
                next_target_g_v_z = obs[18]
                # r_matrix = get_rotation_matrix(yaw, pitch, roll)
                r_matrix = env.simulator.get_coordination_converter_to_body()
                next_expected_v = np.squeeze(
                    np.matmul(
                        r_matrix,
                        np.array([[next_target_g_v_x], [next_target_g_v_y],
                                  [next_target_g_v_z]],
                                 dtype="float32")))
                obs = np.append(obs, next_expected_v)  # extend the obs
            batch_obs = np.expand_dims(obs, axis=0)
            action = agent.predict(batch_obs.astype('float32'))
            action = np.clip(action, -1.0, 1.0)
            action = np.squeeze(action)
            # action_main = action[0]
            # action_diff = action[1:] * OFFSET_SCALAR
            # action_new = action_diff + action_main
            # action_new = np.clip(action_new, -1.0, 1.0)
            action = action_mapping(action, env.action_space.low[0],
                                    env.action_space.high[0])
            next_obs, reward, done, info = env.step(action)
            # yaw = next_obs[14]
            # pitch = next_obs[12]
            # roll = next_obs[13]
            next_target_g_v_x = next_obs[16]
            next_target_g_v_y = next_obs[17]
            next_target_g_v_z = next_obs[18]
            # r_matrix = get_rotation_matrix(yaw, pitch, roll)
            r_matrix = env.simulator.get_coordination_converter_to_body()
            next_expected_v = np.squeeze(
                np.matmul(
                    r_matrix,
                    np.array([[next_target_g_v_x], [next_target_g_v_y],
                              [next_target_g_v_z]],
                             dtype="float32")))
            next_obs = np.append(next_obs, next_expected_v)  # extend the obs
            obs = next_obs
            total_reward += reward
            steps += 1
            if done:
                break
            env.render()
        eval_reward.append(total_reward)
    return np.mean(eval_reward)
def evaluate(env, agent):
    eval_reward = []
    for i in range(5):
        obs = env.reset()
        total_reward, steps = 0, 0
        while True:
            batch_obs = np.expand_dims(obs, axis=0)
            action = agent.predict(batch_obs.astype('float32'))
            action = np.squeeze(action)
            action_four = action[0] + 0.2 * action[1:]
            action_four = np.clip(action_four, -1.0, 1.0)
            action_four = action_mapping(action_four, env.action_space.low[0],
                                         env.action_space.high[0])
            next_obs, reward, done, info = env.step(action_four)
            obs = next_obs
            total_reward += reward
            steps += 1
            if done:
                break
        eval_reward.append(total_reward)
    return np.mean(eval_reward)
def run_episode(env, agent, rpm):
    obs = env.reset()
    total_reward, steps = 0, 0
    while True:
        steps += 1
        batch_obs = np.expand_dims(obs, axis=0)
        action = agent.predict(batch_obs.astype('float32'))
        action = np.squeeze(action)
        # Add exploration noise to the output action and clip it to [-1.0, 1.0].
        action = np.random.normal(action, 1.0)
        action = np.clip(action, -1.0, 1.0)
        # Map the action to the actual action range; action_mapping is imported
        # from parl.utils.
        action = action_mapping(action, env.action_space.low[0],
                                env.action_space.high[0])
        next_obs, reward, done, info = env.step(action)
        rpm.append(obs, action, REWARD_SCALE * reward, next_obs, done)
        if rpm.size() > MEMORY_WARMUP_SIZE:
            batch_obs, batch_action, batch_reward, batch_next_obs, \
                batch_terminal = rpm.sample_batch(BATCH_SIZE)
            critic_cost = agent.learn(batch_obs, batch_action, batch_reward,
                                      batch_next_obs, batch_terminal)
        obs = next_obs
        total_reward += reward
        if done:
            break
    return total_reward, steps
def run_train_episode(env, agent, scaler):
    obs = env.reset()
    observes, actions, rewards, unscaled_obs = [], [], [], []
    step = 0.0
    scale, offset = scaler.get()
    scale[-1] = 1.0  # don't scale time step feature
    offset[-1] = 0.0  # don't offset time step feature
    while True:
        obs = obs.reshape((1, -1))
        obs = np.append(obs, [[step]], axis=1)  # add time step feature
        unscaled_obs.append(obs)
        obs = (obs - offset) * scale  # center and scale observations
        obs = obs.astype('float32')
        observes.append(obs)
        action = agent.policy_sample(obs)
        action = np.clip(action, -1.0, 1.0)
        action = action_mapping(action, env.action_space.low[0],
                                env.action_space.high[0])
        action = action.reshape((1, -1)).astype('float32')
        env.render()
        actions.append(action)
        obs, reward, done, _ = env.step(np.squeeze(action))
        rewards.append(reward)
        step += 1e-3  # increment time step feature
        if done:
            break
    return (np.concatenate(observes), np.concatenate(actions),
            np.array(rewards, dtype='float32'), np.concatenate(unscaled_obs))
def evaluate_episode(env, agent, render=False):
    total_reward = []
    env_reward = []
    for j in range(5):
        obs = env.reset()
        c_r = 0
        d_r = 0
        while True:
            batch_obs = np.expand_dims(obs, axis=0)
            action = agent.predict(batch_obs)
            action = np.squeeze(action)
            action = np.clip(action, -1.0, 1.0)
            actuall = action
            actuall = action_mapping(actuall, env.action_space.low[0],
                                     env.action_space.high[0])
            next_obs, reward, done, info = env.step(actuall)
            obs = next_obs
            vx = pow(info['b_v_x'] - info['next_target_g_v_x'], 2)
            vy = pow(info['b_v_y'] - info['next_target_g_v_y'], 2)
            vz = pow(info['b_v_z'] - info['next_target_g_v_z'], 2)
            reward_adept = 0.01 * (vx + vy + vz)
            d_r += reward
            reward -= reward_adept
            c_r += reward
            if render:
                env.render()
            if done:
                break
        total_reward.append(c_r)
        env_reward.append(d_r)
    total_reward.append(np.mean(total_reward))
    env_reward.append(np.mean(env_reward))
    return total_reward, env_reward
def evaluate(env, agent):
    eval_reward = []
    for i in range(5):
        obs = env.reset()
        total_reward, steps = 0, 0
        while True:
            batch_obs = np.expand_dims(obs, axis=0)
            action = agent.predict(batch_obs.astype('float32'))
            # print("before:", action)
            action = np.squeeze(action)
            action = np.array([
                action[0] + 0.1 * action[1], action[0] + 0.1 * action[2],
                action[0] + 0.1 * action[3], action[0] + 0.1 * action[4]
            ])
            # print(action)
            action = np.clip(action, -1.0, 1.0)
            action = action_mapping(action, env.action_space.low[0],
                                    env.action_space.high[0])
            next_obs, reward, done, info = env.step(action)
            obs = next_obs
            total_reward += reward
            steps += 1
            env.render()
            if done:
                break
        eval_reward.append(total_reward)
    return np.mean(eval_reward)
def run_evaluate_episode(env, agent, scaler):
    obs = env.reset()
    rewards = []
    step = 0.0
    scale, offset = scaler.get()
    scale[-1] = 1.0  # don't scale time step feature
    offset[-1] = 0.0  # don't offset time step feature
    while True:
        obs = obs.reshape((1, -1))
        obs = np.append(obs, [[step]], axis=1)  # add time step feature
        obs = (obs - offset) * scale  # center and scale observations
        obs = obs.astype('float32')
        action = agent.policy_predict(obs)
        action = action_mapping(action, env.action_space.low[0],
                                env.action_space.high[0])
        obs, reward, done, _ = env.step(np.squeeze(action))
        env.render()
        rewards.append(reward)
        step += 1e-3  # increment time step feature
        if done:
            break
    return np.sum(rewards)
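# The Scaler used by run_train_episode / run_evaluate_episode above is not shown
# in this listing. Below is a minimal sketch of one compatible with the
# `(obs - offset) * scale` usage, assuming get() returns (scale, offset) as
# per-feature vectors; the update() interface and the 0.1 standard-deviation
# floor are illustrative assumptions, not the original implementation.
import numpy as np

class RunningScalerSketch(object):
    def __init__(self, obs_dim):
        self.means = np.zeros(obs_dim)
        self.vars = np.ones(obs_dim)
        self.count = 0

    def update(self, batch):
        # Fold a batch of unscaled observations into the running mean/variance
        # (batch-wise Welford merge).
        batch = np.asarray(batch, dtype='float64')
        n = batch.shape[0]
        batch_mean = batch.mean(axis=0)
        batch_var = batch.var(axis=0)
        total = self.count + n
        delta = batch_mean - self.means
        new_means = self.means + delta * n / total
        new_vars = (self.count * self.vars + n * batch_var +
                    delta ** 2 * self.count * n / total) / total
        self.means, self.vars, self.count = new_means, new_vars, total

    def get(self):
        # scale = 1 / (std + floor), offset = running mean, so that
        # obs_scaled = (obs - offset) * scale is roughly zero-mean, unit-scale.
        scale = 1.0 / (np.sqrt(self.vars) + 0.1)
        return scale, self.means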
def run_episode(env, agent, rpm, batch_size=64):
    obs = env.reset()
    total_reward, steps, a_loss, c_loss = 0, 0, 0, 0
    while True:
        steps += 1
        batch_obs = np.expand_dims(obs, axis=0)
        action = agent.predict(batch_obs.astype('float32'))
        action = np.squeeze(action)
        action = np.clip(np.random.normal(action, EXPL_NOISE), -1.0, 1.0)
        action = action_mapping(action, env.action_space.low[0],
                                env.action_space.high[0])
        next_obs, reward, done, info = env.step(action)
        rpm.append(obs, action, reward, next_obs, done)
        if rpm.size() > WARMUP_SIZE:
            batch_obs, batch_action, batch_reward, batch_next_obs, batch_terminal = \
                rpm.sample_batch(batch_size)
            a_loss, c_loss = agent.learn(batch_obs, batch_action, batch_reward,
                                         batch_next_obs, batch_terminal)
        obs = next_obs
        total_reward += reward
        if done:
            break
    return total_reward, steps, a_loss, c_loss
def evaluate(env, agent, render=True):
    eval_reward = []
    for i in range(1):
        obs = env.reset()
        total_reward, steps = 0, 0
        while True:
            batch_obs = np.expand_dims(obs, axis=0)
            action = agent.predict(batch_obs.astype('float32'))
            action = np.squeeze(action)
            action = np.clip(action, -1.0, 1.0)
            action = action_mapping(action, env.action_space.low[0],
                                    env.action_space.high[0])
            # Build the environment action from the main action plus scaled offsets.
            new_action = [0] * (action.shape[0] - 1)
            for j in range(len(new_action)):
                new_action[j] = action[0] + 0.3 * action[j + 1]
            new_action = np.array(new_action)
            # new_action = action[0] + 0.3 * action[1:]
            next_obs, reward, done, info = env.step(new_action)
            obs = next_obs
            total_reward += reward
            steps += 1
            if render:
                env.render()
            if done:
                break
        eval_reward.append(total_reward)
    return np.mean(eval_reward)
def evaluate(env, agent, render=False):
    eval_reward = []
    for i in range(5):
        obs = env.reset()
        total_reward, steps = 0, 0
        while True:
            batch_obs = np.expand_dims(obs, axis=0)
            action = agent.predict(batch_obs.astype('float32'))
            action = np.squeeze(action)
            action = np.clip(action, -1.0, 1.0)  ## special
            action = action_mapping(action, env.action_space.low[0],
                                    env.action_space.high[0])
            # action = np.clip(action, -1.0, 1.0)  ## special
            next_obs, reward, done, info = env.step(action)
            obs = next_obs
            total_reward += reward
            steps += 1
            if render:
                env.render()
            if done:
                break
        eval_reward.append(total_reward)
    return np.mean(eval_reward)
def evaluate(env, agent):
    eval_reward = []
    for i in range(5):
        obs = env.reset()
        total_reward, steps = 0, 0
        while True:
            batch_obs = np.expand_dims(obs, axis=0)
            action = agent.predict(batch_obs)
            action = np.clip(action, -1.0, 1.0)
            action = np.squeeze(action)
            action = action_mapping(action, env.action_space.low[0],
                                    env.action_space.high[0])
            means = np.mean(action)
            action = action + gamma * (means - action)
            next_obs, reward, done, info = env.step(action)
            obs = next_obs
            total_reward += reward
            steps += 1
            if done:
                break
        eval_reward.append(total_reward)
    return np.mean(eval_reward)
def evaluate(env, agent):
    eval_reward = []
    for i in range(5):
        obs = env.reset()
        total_reward, steps = 0, 0
        while True:
            batch_obs = np.expand_dims(obs, axis=0)
            action = agent.predict(batch_obs.astype('float32'))
            # action = [actions[0] * 0.8 + actions[1] * 0.2,
            #           actions[0] * 0.8 + actions[2] * 0.2,
            #           actions[0] * 0.8 + actions[3] * 0.2,
            #           actions[0] * 0.8 + actions[4] * 0.2]
            # action = np.squeeze(action)
            # print("============================", action)
            action = np.clip(action, -1.0, 1.0)
            action = action_mapping(action, env.action_space.low[0],
                                    env.action_space.high[0])
            next_obs, reward, done, info = env.step(action)
            obs = next_obs
            total_reward += reward
            steps += 1
            if done:
                break
        eval_reward.append(total_reward)
    return np.mean(eval_reward)
def run_episode(env, agent, rpm):
    obs = env.reset()
    total_reward, steps = 0, 0
    while True:
        steps += 1
        batch_obs = np.expand_dims(obs, axis=0)
        action = agent.predict(batch_obs.astype('float32'))
        # Add exploration noise to the output action.
        action = np.random.normal(action, 1.0)
        action = np.squeeze(action)
        # Compress the action from 5 values down to 4: action[0] is the shared
        # base value, action[1:] are scaled per-motor offsets.
        temp = np.zeros((1, 4))
        temp_1 = np.array([[1.0, 0.0, 0.0, 0.0]])
        temp_2 = np.array([[0.0, 1.0, 0.0, 0.0]])
        temp_3 = np.array([[0.0, 0.0, 1.0, 0.0]])
        temp_4 = np.array([[0.0, 0.0, 0.0, 1.0]])
        temp += list(action)[0]
        temp_1 *= list(action)[1]
        temp_2 *= list(action)[2]
        temp_3 *= list(action)[3]
        temp_4 *= list(action)[4]
        action_4 = temp + 0.1 * (temp_1 + temp_2 + temp_3 + temp_4)
        action_4 = np.squeeze(action_4)
        action_4 = np.clip(action_4, -1.0, 1.0)
        # Map the action to the actual action range; action_mapping is imported
        # from parl.utils.
        action_4 = action_mapping(action_4, env.action_space.low[0],
                                  env.action_space.high[0])
        next_obs, reward, done, info = env.step(action_4)
        rpm.append(obs, action, REWARD_SCALE * reward, next_obs, done)
        if rpm.size() > MEMORY_WARMUP_SIZE:
            batch_obs, batch_action, batch_reward, batch_next_obs, \
                batch_terminal = rpm.sample_batch(BATCH_SIZE)
            critic_cost = agent.learn(batch_obs, batch_action, batch_reward,
                                      batch_next_obs, batch_terminal)
        obs = next_obs
        total_reward += reward
        if done:
            break
    return total_reward, steps
def evaluate(env, agent, render=False):
    eval_reward = []
    for i in range(5):
        obs = env.reset()
        total_reward, steps = 0, 0
        while True:
            batch_obs = np.expand_dims(obs, axis=0)
            action = agent.predict(batch_obs.astype('float32'))
            action = np.squeeze(action)
            # Compress the action from 5 values down to 4 (same scheme as in
            # run_episode above).
            temp = np.zeros((1, 4))
            temp_1 = np.array([[1.0, 0.0, 0.0, 0.0]])
            temp_2 = np.array([[0.0, 1.0, 0.0, 0.0]])
            temp_3 = np.array([[0.0, 0.0, 1.0, 0.0]])
            temp_4 = np.array([[0.0, 0.0, 0.0, 1.0]])
            temp += list(action)[0]
            temp_1 *= list(action)[1]
            temp_2 *= list(action)[2]
            temp_3 *= list(action)[3]
            temp_4 *= list(action)[4]
            action_4 = temp + 0.1 * (temp_1 + temp_2 + temp_3 + temp_4)
            action_4 = np.squeeze(action_4)
            action_4 = np.clip(action_4, -1.0, 1.0)
            action_4 = action_mapping(action_4, env.action_space.low[0],
                                      env.action_space.high[0])
            next_obs, reward, done, info = env.step(action_4)
            obs = next_obs
            total_reward += reward
            steps += 1
            if render:
                env.render()
            if done:
                break
        eval_reward.append(total_reward)
    return np.mean(eval_reward)
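# The 5-to-4 compression built from the temp / temp_k identity rows in the two
# functions above is an element-wise "base value plus scaled offset":
# action_4[k] = action[0] + 0.1 * action[k + 1]. A short check of that
# equivalence with an arbitrary 5-dimensional action:
import numpy as np

action_demo = np.array([0.4, -0.2, 0.7, 0.1, -0.5])
temp_demo = np.zeros((1, 4)) + action_demo[0]
offsets_demo = np.array([[action_demo[1], action_demo[2],
                          action_demo[3], action_demo[4]]])
action_4_long = np.squeeze(temp_demo + 0.1 * offsets_demo)
action_4_short = action_demo[0] + 0.1 * action_demo[1:]
assert np.allclose(action_4_long, action_4_short)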
def run_evaluate_episode(env, agent):
    obs = env.reset()
    total_reward = 0
    while True:
        batch_obs = np.expand_dims(obs, axis=0)
        action = agent.predict(batch_obs.astype('float32'))
        action = action_mapping(action, env.action_space.low[0],
                                env.action_space.high[0])
        next_obs, reward, done, info = env.step(action)
        obs = next_obs
        total_reward += reward
        if done:
            break
    return total_reward
def evaluate_episode(env, render=False):
    env_reward = []
    for j in range(5):
        env.reset()
        d_r = 0
        while True:
            actuall = np.array([-1, -1, -1, -1], dtype='float32')
            actuall = action_mapping(actuall, env.action_space.low[0],
                                     env.action_space.high[0])
            next_obs, reward, done, info = env.step(actuall)
            d_r += reward
            if render:
                env.render()
            if done:
                break
        env_reward.append(d_r)
    env_reward.append(np.mean(env_reward))
    return env_reward
def run_episode(env, agent, rpm, render=False):
    obs = env.reset()
    total_reward, steps = 0, 0
    while True:
        steps += 1
        batch_obs = np.expand_dims(obs, axis=0)
        action = agent.predict(batch_obs.astype('float32'))
        action = np.squeeze(action)
        # Add exploration noise to the output action and clip it to [-1.0, 1.0].
        action = np.clip(np.random.normal(action, NOISE), -1.0, 1.0)
        # action_tmp = action[0] + action[-(len(action) - 1):, ] * OFFSET_FACTOR
        # action = np.append(action[0], action_tmp)
        # Map the action to the actual action range; action_mapping is imported
        # from parl.utils.
        # action = action_mapping(action, env.action_space.low[0],
        #                         env.action_space.high[0])
        # next_obs, reward, done, info = env.step(action)
        main_action = action[0]
        sub_action = action[1:]
        sub_action = main_action + sub_action * OFFSET_FACTOR
        sub_action = np.clip(sub_action, -1.0, 1.0)
        sub_action = action_mapping(sub_action, env.action_space.low[0],
                                    env.action_space.high[0])
        next_obs, reward, done, info = env.step(sub_action)
        rpm.append(obs, action, REWARD_SCALE * reward, next_obs, done)
        if rpm.size() > MEMORY_WARMUP_SIZE:
            batch_obs, batch_action, batch_reward, batch_next_obs, \
                batch_terminal = rpm.sample_batch(BATCH_SIZE)
            critic_cost = agent.learn(batch_obs, batch_action, batch_reward,
                                      batch_next_obs, batch_terminal)
        obs = next_obs
        total_reward += reward
        if render:
            env.render()
        if done:
            break
    return total_reward, steps
def evaluate(env, agent, render=False):
    obs = env.reset()
    total_reward = 0
    while True:
        batch_obs = np.expand_dims(obs, axis=0)
        action = agent.predict(batch_obs.astype('float32'))
        action = np.squeeze(action)
        action = np.clip(action, -1.0, 1.0)  ## special
        action = action_mapping(action, env.action_space.low[0],
                                env.action_space.high[0])
        next_obs, reward, done, info = env.step(action)
        obs = next_obs
        total_reward += reward
        if done:
            break
    return total_reward