def main():
    args = arg_parse_TRPO()
    env = gym.make(args.env_name)
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    running_state = ZFilter((num_inputs, ), clip=5)
    running_reward = ZFilter((1, ), demean=False, clip=10)
    policy_net = Policy(num_inputs, num_actions)
    value_net = Value(num_inputs)
    train(args, env, policy_net, value_net, running_state)
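Every snippet in this section relies on ZFilter for observation (and sometimes reward) normalization but never defines it. The following is a minimal sketch, assuming the common running mean/std implementation (a Welford-style RunningStat with optional demeaning, scaling, and clipping) whose rs._M, rs._S, and rs._n attributes are checkpointed later in this section; the exact class in any given repo may differ.

import numpy as np

class RunningStat:
    """Welford-style running mean/variance over a stream of arrays (assumed implementation)."""
    def __init__(self, shape):
        self._n = 0
        self._M = np.zeros(shape)
        self._S = np.zeros(shape)

    def push(self, x):
        x = np.asarray(x)
        self._n += 1
        if self._n == 1:
            self._M[...] = x
        else:
            old_M = self._M.copy()
            self._M[...] = old_M + (x - old_M) / self._n
            self._S[...] = self._S + (x - old_M) * (x - self._M)

    @property
    def mean(self):
        return self._M

    @property
    def std(self):
        var = self._S / (self._n - 1) if self._n > 1 else np.square(self._M)
        return np.sqrt(var)


class ZFilter:
    """y = (x - running_mean) / running_std, optionally clipped to [-clip, clip]."""
    def __init__(self, shape, demean=True, destd=True, clip=10.0):
        self.demean = demean
        self.destd = destd
        self.clip = clip
        self.rs = RunningStat(shape)

    def __call__(self, x, update=True):
        if update:
            self.rs.push(x)
        if self.demean:
            x = x - self.rs.mean
        if self.destd:
            x = x / (self.rs.std + 1e-8)
        if self.clip:
            x = np.clip(x, -self.clip, self.clip)
        return x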
def __init__(self, env, args):
    self.env = env
    self.args = args
    # define the network
    self.net = Network(self.env.observation_space.shape[0], self.env.action_space.shape[0])
    self.old_net = Network(self.env.observation_space.shape[0], self.env.action_space.shape[0])
    # make sure the net and old net have the same parameters
    self.old_net.load_state_dict(self.net.state_dict())
    # define the optimizer
    self.optimizer = torch.optim.Adam(self.net.critic.parameters(), lr=self.args.lr)
    # define the running mean filter
    self.running_state = ZFilter((self.env.observation_space.shape[0], ), clip=5)
    if not os.path.exists(self.args.save_dir):
        os.mkdir(self.args.save_dir)
    self.model_path = self.args.save_dir + self.args.env_name
    if not os.path.exists(self.model_path):
        os.mkdir(self.model_path)
    self.start_episode = 0
def sim_episode(env, policy, max_episode_steps, result_writer):
    """Simulate an episode and store the resulting render."""
    try:
        running_state = ZFilter((env.observation_space.shape[0], ), clip=5)
        state = env.reset()
        state = running_state(state)
        frames_store = []
        # Simulate one episode, i.e. until the agent reaches a terminal state
        # or has taken max_episode_steps steps in the environment.
        for t in range(max_episode_steps):  # Don't infinite loop while learning
            action_mean, action_log_std, action_std = policy(Variable(torch.Tensor([state])))
            action = torch.normal(action_mean, action_std).detach().data[0].cpu().numpy()
            next_state, reward, done, _ = env.step(action)
            next_state = running_state(next_state)
            state = next_state
            frames = env.render('rgb_array')
            frames_store.append(frames)
            if done:
                break
        return frames_store
    except Exception as e:
        print(f'Tried running simulation, but got error: {e}')
        return []
def __init__(self, envs, args, net, env_type='atari'):
    self.envs = envs
    self.args = args
    self.env_type = env_type
    # define the network...
    self.net = net
    self.old_net = copy.deepcopy(self.net)
    # if we use cuda...
    if self.args.cuda:
        self.net.cuda()
        self.old_net.cuda()
    # define the optimizer...
    self.optimizer = optim.Adam(self.net.parameters(), self.args.lr, eps=self.args.eps)
    # running filter...
    if self.env_type == 'mujoco':
        num_states = self.envs.observation_space.shape[0]
        self.running_state = ZFilter((num_states, ), clip=5)
    # check the saving folder..
    if not os.path.exists(self.args.save_dir):
        os.mkdir(self.args.save_dir)
    # env folder..
    self.model_path = os.path.join(self.args.save_dir, self.args.env_name)
    if not os.path.exists(self.model_path):
        os.mkdir(self.model_path)
    # get the observation
    self.batch_ob_shape = (self.args.num_workers * self.args.nsteps, ) + self.envs.observation_space.shape
    self.obs = np.zeros((self.args.num_workers, ) + self.envs.observation_space.shape,
                        dtype=self.envs.observation_space.dtype.name)
    if self.env_type == 'mujoco':
        self.obs[:] = np.expand_dims(self.running_state(self.envs.reset()), 0)
    else:
        self.obs[:] = self.envs.reset()
    self.dones = [False for _ in range(self.args.num_workers)]
def sample(self, policy, params=None, gamma=0.95, device='cpu'):
    episodes = BatchEpisodes(batch_size=self.batch_size, gamma=gamma, device=device)
    for i in range(self.batch_size):
        self.queue.put(i)
    for _ in range(self.num_workers):
        self.queue.put(None)
    observations, batch_ids = self.envs.reset()
    running_state = ZFilter((observations.shape[1], ), clip=5)
    for index in range(observations.shape[0]):
        observations[index, :] = running_state(observations[index, :])
    dones = [False]
    while (not all(dones)) or (not self.queue.empty()):
        with torch.no_grad():
            observations_tensor = torch.from_numpy(observations).to(device=device)
            actions_tensor = policy(observations_tensor, params=params).sample()
            actions = actions_tensor.cpu().numpy()
        new_observations, rewards, dones, new_batch_ids, _ = self.envs.step(actions)
        episodes.append(observations, actions, rewards, batch_ids)
        observations, batch_ids = new_observations, new_batch_ids
        for index in range(observations.shape[0]):
            observations[index, :] = running_state(observations[index, :])
    return episodes
def __init__(self, env, policy_network, value_network, gamma=0.99, policy_lr=0.0005,
             value_lr=0.0005, tau=0.95, value_update_step=10, policy_update_step=10,
             epsilon=0.2, batch_size=64, use_cuda=True,
             saved_path='saved_models/Walker2d-v1/'):
    # define the parameters...
    self.env = env
    self.gamma = gamma
    self.policy_lr = policy_lr
    self.value_lr = value_lr
    self.tau = tau
    self.value_update_step = value_update_step
    self.policy_update_step = policy_update_step
    self.epsilon = epsilon
    self.batch_size = batch_size
    self.saved_path = saved_path
    # check if cuda is available...
    self.use_cuda = torch.cuda.is_available() and use_cuda
    print('Using CUDA: ' + str(self.use_cuda))
    # define the networks...
    self.policy_network = policy_network
    self.value_network = value_network
    if self.use_cuda:
        self.policy_network.cuda()
        self.value_network.cuda()
    # define the optimizers
    self.optimizer_value = torch.optim.Adam(self.value_network.parameters(), lr=self.value_lr)
    self.optimizer_policy = torch.optim.Adam(self.policy_network.parameters(), lr=self.policy_lr)
    # init the filter...
    self.running_state = ZFilter((self.env.observation_space.shape[0],), clip=5)
def __init__(self, env, policy_lr, value_lr, tau, gamma, buffer_size, max_time_step,
             observate_time, batch_size, path, soft_update_step, use_cuda):
    self.env = env
    self.policy_lr = policy_lr
    self.value_lr = value_lr
    self.use_cuda = bool(use_cuda)
    self.tau = tau
    self.gamma = gamma
    self.buffer_size = buffer_size
    self.max_time_step = max_time_step
    self.observate_time = observate_time
    self.batch_size = batch_size
    self.global_time_step = 0
    self.path = path
    self.soft_update_step = soft_update_step
    print('IF USE CUDA: ' + str(self.use_cuda))
    num_inputs = self.env.observation_space.shape[0]
    self.num_actions = self.env.action_space.shape[0]
    # the scale of the action space...
    self.action_scale = self.env.action_space.high[0]
    # build up the networks...
    # build the actor network first...
    self.actor_net = models.Policy(num_inputs, self.num_actions)
    self.actor_target_net = models.Policy(num_inputs, self.num_actions)
    # build the critic network...
    self.critic_net = models.Critic(num_inputs, self.num_actions)
    self.critic_target_net = models.Critic(num_inputs, self.num_actions)
    # if we use cuda...
    if self.use_cuda:
        self.actor_net.cuda()
        self.actor_target_net.cuda()
        self.critic_net.cuda()
        self.critic_target_net.cuda()
    # initialize the target networks with the same parameters...
    self.actor_target_net.load_state_dict(self.actor_net.state_dict())
    self.critic_target_net.load_state_dict(self.critic_net.state_dict())
    # define the optimizers... add L2 regularization to the critic optimizer here...
    self.optimizer_actor = torch.optim.Adam(self.actor_net.parameters(), lr=self.policy_lr)
    self.optimizer_critic = torch.optim.Adam(self.critic_net.parameters(), lr=self.value_lr,
                                             weight_decay=1e-2)
    # init the filter...
    self.running_state = ZFilter((num_inputs, ), clip=5)
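The DDPG-style trainer above stores tau and soft_update_step and maintains target copies of the actor and critic, but the target-update step itself is not shown. Below is a minimal sketch of the Polyak (soft) update it presumably performs; the function name soft_update and the commented call site are assumptions, not part of the original code.

def soft_update(target_net, source_net, tau):
    """Blend source parameters into the target: theta_target <- (1 - tau) * theta_target + tau * theta_source."""
    for target_param, source_param in zip(target_net.parameters(), source_net.parameters()):
        target_param.data.copy_((1.0 - tau) * target_param.data + tau * source_param.data)

# Assumed usage, e.g. every `soft_update_step` environment steps:
# soft_update(self.actor_target_net, self.actor_net, self.tau)
# soft_update(self.critic_target_net, self.critic_net, self.tau)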
def __init__(self, args, env):
    # define the arguments and environments...
    self.args = args
    self.env = env
    # define the num of inputs and num of actions
    num_inputs = self.env.observation_space.shape[0]
    num_actions = self.env.action_space.shape[0]
    # define the model save dir...
    self.saved_path = self.args.save_dir + self.args.env_name + '/'
    # check the path
    if not os.path.exists(self.args.save_dir):
        os.mkdir(self.args.save_dir)
    if not os.path.exists(self.saved_path):
        os.mkdir(self.saved_path)
    # define the networks...
    self.policy_network = models.Policy(num_inputs, num_actions)
    self.value_network = models.Value(num_inputs)
    # define the optimizer
    self.optimizer_value = torch.optim.Adam(self.value_network.parameters(),
                                            lr=self.args.value_lr,
                                            weight_decay=self.args.l2_reg)
    # init the filter...
    self.running_state = ZFilter((num_inputs,), clip=5)
def __init__(self, env, args):
    # define the parameters...
    self.env = env
    # get the environment's input size and output size
    num_inputs = self.env.observation_space.shape[0]
    num_actions = self.env.action_space.shape[0]
    # get the parameters
    self.args = args
    self.saved_path = 'saved_models/' + str(self.args.env_name) + '/'
    # check the path
    if not os.path.exists(self.saved_path):
        os.mkdir(self.saved_path)
    # check if cuda is available...
    self.use_cuda = torch.cuda.is_available() and self.args.cuda
    print('CUDA is available: ' + str(torch.cuda.is_available()))
    print('Use CUDA: ' + str(self.args.cuda))
    # define the networks...
    self.policy_network = models.Policy(num_inputs, num_actions)
    self.value_network = models.Value(num_inputs)
    if self.use_cuda:
        self.policy_network.cuda()
        self.value_network.cuda()
    # define the optimizers
    self.optimizer_value = torch.optim.Adam(self.value_network.parameters(),
                                            lr=self.args.value_lr,
                                            weight_decay=self.args.l2_reg)
    self.optimizer_policy = torch.optim.Adam(self.policy_network.parameters(),
                                             lr=self.args.policy_lr,
                                             weight_decay=self.args.l2_reg)
    # init the filter...
    self.running_state = ZFilter((num_inputs, ), clip=5)
        return action_loss.mean()

    def get_kl():
        mean1, log_std1, std1 = policy_net(Variable(states))
        mean0 = Variable(mean1.data)
        log_std0 = Variable(log_std1.data)
        std0 = Variable(std1.data)
        kl = log_std1 - log_std0 + (std0.pow(2) + (mean0 - mean1).pow(2)) / (2.0 * std1.pow(2)) - 0.5
        return kl.sum(1, keepdim=True)

    trpo_step(policy_net, get_loss, get_kl, args.max_kl, args.damping)


running_state = ZFilter((num_inputs, ), clip=5)
running_reward = ZFilter((1, ), demean=False, clip=10)

for i_episode in count(1):
    memory = Memory()
    num_steps = 0
    reward_batch = 0
    num_episodes = 0
    while num_steps < args.batch_size:
        state = env.reset()
        state = running_state(state)
        reward_sum = 0
        for t in range(10000):  # Don't infinite loop while learning
            action = select_action(state)
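The fragment above opens with the tail of a surrogate-loss closure (return action_loss.mean()) that is paired with get_kl in trpo_step. The sketch below is only an assumption about what such a closure looks like: states, actions, advantages, and fixed_log_prob are tensors assumed to be prepared by the surrounding update code, and the Gaussian log-density is computed with torch.distributions.Normal rather than whatever helper the original repo uses.

def get_loss():
    # re-evaluate the current policy on the sampled states
    action_mean, action_log_std, action_std = policy_net(Variable(states))
    dist = torch.distributions.Normal(action_mean, action_std)
    log_prob = dist.log_prob(Variable(actions)).sum(1, keepdim=True)
    # negated importance-weighted surrogate objective (trpo_step is assumed to minimize this loss)
    action_loss = -Variable(advantages) * torch.exp(log_prob - Variable(fixed_log_prob))
    return action_loss.mean()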
def main(gamma=0.995, env_name='Walker2d-v2', tau=0.97, seed=543, number_of_batches=500,
         batch_size=5000, maximum_steps=10000, render=False, log_interval=1,
         entropy_coeff=0.0, clip_epsilon=0.2, use_joint_pol_val=False):
    torch.set_default_tensor_type('torch.DoubleTensor')
    PI = torch.DoubleTensor([3.1415926])

    env = gym.make(env_name)
    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    env.seed(seed)
    torch.manual_seed(seed)

    policy_net = Policy(num_inputs, num_actions)
    value_net = Value(num_inputs)
    opt_policy = optim.Adam(policy_net.parameters(), lr=0.001)
    opt_value = optim.Adam(value_net.parameters(), lr=0.001)

    running_state = ZFilter((num_inputs,), clip=5)
    running_reward = ZFilter((1,), demean=False, clip=10)

    episode_lengths = []
    plot_rew = []
    for i_episode in range(number_of_batches):
        memory = Memory()
        num_steps = 0
        reward_batch = 0
        num_episodes = 0
        while num_steps < batch_size:
            state = env.reset()
            state = running_state(state)
            reward_sum = 0
            for t in range(maximum_steps):  # Don't infinite loop while learning
                action = select_action(state, policy_net)
                action = action.data[0].numpy()
                next_state, reward, done, _ = env.step(action)
                reward_sum += reward
                next_state = running_state(next_state)
                mask = 1
                if done:
                    mask = 0
                memory.push(state, np.array([action]), mask, next_state, reward)
                if render:
                    env.render()
                if done:
                    break
                state = next_state
            num_steps += (t - 1)
            num_episodes += 1
            reward_batch += reward_sum
        reward_batch /= num_episodes
        batch = memory.sample()
        plot_rew.append(reward_batch)
        update_params(batch, policy_net, value_net, gamma, opt_policy, opt_value)
        if i_episode % log_interval == 0:
            print('Episode {}\tLast reward: {}\tAverage reward {:.2f}'.format(
                i_episode, reward_sum, reward_batch))

    plot_epi = list(range(number_of_batches))
    trace = go.Scatter(x=plot_epi, y=plot_rew)
    layout = go.Layout(
        title='PPO',
        xaxis=dict(title='Episodes',
                   titlefont=dict(family='Courier New, monospace', size=18, color='#7f7f7f')),
        yaxis=dict(title='Average Reward',
                   titlefont=dict(family='Courier New, monospace', size=18, color='#7f7f7f')))
    plotly.offline.plot({"data": [trace], "layout": layout}, filename='PPO.html', image='jpeg')
def main(gamma=0.995, env_name="Walker2d-v2", tau=0.97, number_of_batches=500,\ batch_size=5000, maximum_steps=10000, render=False,\ seed=543, log_interval=1, entropy_coeff=0.0, clip_epsilon=0.2): env = gym.make(env_name) #Get number of inputs for A3CActor num_inputs = env.observation_space.shape[0] #Get number of outputs required for describing action num_actions = env.action_space.shape[0] env.seed(seed) torch.manual_seed(seed) actor_net = A3CActor(num_inputs, num_actions) actor_optimizer = optim.Adam(actor_net.parameters(), lr=0.001) running_state = ZFilter((num_inputs, ), clip=5) running_reward = ZFilter((1, ), demean=False, clip=10) episode_lengths = [] for i_episode in range(number_of_batches): memory = Memory() num_steps = 0 reward_batch = 0 num_episodes = 0 while num_steps < batch_size: state = env.reset() state = running_state(state) reward_sum = 0 for t in range(maximum_steps): action = select_action(state, actor_net) action = action.data[0].numpy() next_state, reward, done, _ = env.step(action) reward_sum += reward next_state = running_state(next_state) mask = 1 if done: mask = 0 memory.push(state, np.array([action]), mask, next_state, reward) if render: env.render() if done: break state = next_state num_steps += (t - 1) num_episodes += 1 reward_batch += reward_sum reward_batch /= num_episodes batch = memory.sample() update_params(batch, actor_net, actor_optimizer, gamma, tau, clip_epsilon) if i_episode % log_interval == 0: print('Episode {}\t Last reward: {}\tAverage reward {:.2f}'.format( i_episode, reward_sum, reward_batch)) return
policy_net = Observations_Encoder(rows, cols, 3, z_dim, num_goals, num_actions,
                                  threshold, device=device).to(device)
policy_net.train()

#### Initializing Environment
env = env_BP_w_display(num_goals)

#### running normalization filters (z-score with clipping), which improve training
running_state = ZFilter((3, rows, cols), clip=5)
running_reward = ZFilter((1, ), demean=False, clip=10)

if model_path != "":
    print("MODEL LOADED")
    ckpt = torch.load(model_path)
    policy_net.load_state_dict(ckpt['policy_net'])
    value_net.load_state_dict(ckpt['value_net'])
    running_state.rs._M = ckpt['running_M']
    running_state.rs._S = ckpt['running_S']
    running_state.rs._n = ckpt['running_n']
else:
    print("NO RL MODEL LOADED")

##### Logging Folders #####
def test(): env_id="MotorEnv-v0" env = gym.make(env_id) #创造环境 num_inputs = env.observation_space.shape[0] net = torch.load('./models/model6.pkl') print(net) with torch.no_grad(): running_state = ZFilter((num_inputs,), clip=5) episodes =[] eval_rewards =[] eval_done = [] eval_states = [] eval_input = [] eval_delta = [] eval_L = [] state = env.reset() state = running_state(state) render = False for t in range(int(50 / env.steps)): # action = env.action_space.sample() #随机采样动作 # observation, reward, done, info = env.step(1) #与环境交互,获得下一步的时刻 action = select_action(state, net) #action action = action.data[0].numpy() observation, reward, done, info = env.step(action) #与环境交互,获得下一步的时刻 state = running_state(observation) C = np.array([0, 1, 0, 0]) L = np.dot(C, observation.reshape(4,1)) # if done: # break # pass # env.render() #绘制场景 # count+=1 # time.sleep(0.001) #每次等待0.2s # print(info['input'],env.state, env.counts) # print(env.counts) episodes.append(env.counts) eval_states.append(observation) eval_rewards.append(reward) eval_done.append(done) eval_L.append(L) eval_input.append(info['input']) eval_delta.append(info['delta']) episodes = np.array(episodes) eval_rewards = np.array(eval_rewards) eval_states = np.array(eval_states) eval_done = np.array(eval_done) eval_input = np.array(eval_input) eval_delta = np.array(eval_delta) fig = plt.figure("VibrationEnv-states") plt.plot(episodes, eval_L) plt.title("%s"%env_id) plt.xlabel("Episode") plt.ylabel("eval_states") plt.legend(["x","y","p","q"]) plt.grid() plt.show() fig = plt.figure("VibrationEnv-u") plt.plot(episodes, eval_input) plt.title("%s"%env_id) plt.xlabel("Episode") plt.ylabel("eval_states") plt.legend(["u"]) plt.grid() plt.show() env.close()
value_net = Value(num_inputs)
summary_writer = tensorboardX.SummaryWriter(log_dir)


def select_action(state, deterministic=False):
    state = torch.from_numpy(state).unsqueeze(0)
    action_mean, _, action_std = policy_net(Variable(state))
    if not deterministic:
        action = torch.normal(action_mean, action_std)
    else:
        action = action_mean  # action is the mode
    return action


running_state = ZFilter((num_inputs, ), clip=5)

ckpt = torch.load(args.checkpoint)
policy_net.load_state_dict(ckpt['policy_net'])
value_net.load_state_dict(ckpt['value_net'])
running_state.rs._M = ckpt['running_M']
running_state.rs._S = ckpt['running_S']
running_state.rs._n = ckpt['running_n']

eval_hole = []
eval_rewards = []
eval_completed = []
eval_touched = []
for eval_episode in range(args.eval_eps):
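Both checkpoint-loading snippets above restore the filter's internal statistics from running_M, running_S, and running_n so that evaluation sees the same input scaling as training. For completeness, a sketch of the matching save side, assuming the same key names and a hypothetical checkpoint path, might look like this:

torch.save({
    'policy_net': policy_net.state_dict(),
    'value_net': value_net.state_dict(),
    # persist the normalization statistics alongside the weights
    'running_M': running_state.rs._M,
    'running_S': running_state.rs._S,
    'running_n': running_state.rs._n,
}, 'checkpoint.pt')  # hypothetical path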
def test(rank, args, shared_model, opt_ac):
    best_result = -1000
    torch.manual_seed(args.seed + rank)
    torch.set_default_tensor_type('torch.DoubleTensor')
    num_inputs = args.feature
    num_actions = 9
    last_state = numpy.zeros(41)

    if args.render:
        env = RunEnv(visualize=True)
    else:
        env = RunEnv(visualize=False)

    running_state = ZFilter((num_inputs, ), clip=5)
    running_reward = ZFilter((1, ), demean=False, clip=10)
    episode_lengths = []
    PATH_TO_MODEL = '../models/' + str(args.bh)

    ac_net = ActorCritic(num_inputs, num_actions)

    start_time = time.time()

    for i_episode in count(1):
        memory = Memory()
        ac_net.load_state_dict(shared_model.state_dict())

        num_steps = 0
        reward_batch = 0
        num_episodes = 0
        while num_steps < args.batch_size:
            # state = env.reset()
            # print(num_steps)
            state = env.reset(difficulty=0)
            state = numpy.array(state)
            # global last_state
            # last_state = state
            # last_state, _ = update_observation(last_state, state)
            # last_state, state = update_observation(last_state, state)
            # print(state.shape[0])
            # print(state[41])
            state = running_state(state)

            reward_sum = 0
            for t in range(10000):  # Don't infinite loop while learning
                # print(t)
                # timer = time.time()
                if args.use_sep_pol_val:
                    action = select_action(state)
                else:
                    action = select_action_actor_critic(state, ac_net)
                # print(action)
                action = action.data[0].numpy()
                if numpy.any(numpy.isnan(action)):
                    print(action)
                    puts('ERROR')
                    return
                # print('NN take:')
                # print(time.time() - timer)
                # print(action)
                # print("------------------------")
                # timer = time.time()

                if args.skip:
                    # env.step(action)
                    _, reward, _, _ = env.step(action)
                    reward_sum += reward
                next_state, reward, done, _ = env.step(action)
                next_state = numpy.array(next_state)
                reward_sum += reward
                # print('env take:')
                # print(time.time() - timer)
                # timer = time.time()

                # last_state, next_state = update_observation(last_state, next_state)
                next_state = running_state(next_state)
                # print(next_state[41:82])

                mask = 1
                if done:
                    mask = 0
                # print('update take:')
                # print(time.time() - timer)
                # timer = time.time()
                memory.push(state, np.array([action]), mask, next_state, reward)
                # print('memory take:')
                # print(time.time() - timer)
                # if args.render:
                #     env.render()
                if done:
                    break

                state = next_state
            num_steps += (t - 1)
            num_episodes += 1
            # print(num_episodes)
            reward_batch += reward_sum

        # print(num_episodes)
        reward_batch /= num_episodes
        batch = memory.sample()
        # update_params_actor_critic(batch, args, shared_model, ac_net, opt_ac)
        time.sleep(60)

        if i_episode % args.log_interval == 0:
            File = open(PATH_TO_MODEL + '/record.txt', 'a+')
            File.write("Time {}, episode reward {}, Average reward {}".format(
                time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                reward_sum, reward_batch))
            File.close()
            # print('TestEpisode {}\tLast reward: {}\tAverage reward {:.2f}'.format(
            #     i_episode, reward_sum, reward_batch))
            print("Time {}, episode reward {}, Average reward {}".format(
                time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                reward_sum, reward_batch))
            # print('!!!!')

        epoch = i_episode
        if reward_batch > best_result:
            best_result = reward_batch
            save_model({
                'epoch': epoch,
                'bh': args.bh,
                'state_dict': shared_model.state_dict(),
                'optimizer': opt_ac.state_dict(),
            }, PATH_TO_MODEL, 'best')

        if epoch % 30 == 1:
            save_model({
                'epoch': epoch,
                'bh': args.bh,
                'state_dict': shared_model.state_dict(),
                'optimizer': opt_ac.state_dict(),
            }, PATH_TO_MODEL, epoch)
def main(gamma=0.995, env_name="Walker2d-v2", tau=0.97, number_of_batches=500,\ batch_size=5000, maximum_steps=10000, render=False,\ seed=543, log_interval=1, entropy_coeff=0.0, clip_epsilon=0.2): env = gym.make(env_name) #Get number of inputs for A3CActor num_inputs = env.observation_space.shape[0] #Get number of outputs required for describing action num_actions = env.action_space.shape[0] env.seed(seed) torch.manual_seed(seed) actor_net = A3CActor(num_inputs, num_actions) actor_optimizer = optim.Adam(actor_net.parameters(), lr=0.001) running_state = ZFilter((num_inputs,), clip=5) running_reward = ZFilter((1, ), demean=False, clip=10) episode_lengths = [] plot_rew = [] for i_episode in range(number_of_batches): memory = Memory() num_steps = 0 reward_batch = 0 num_episodes = 0 while num_steps < batch_size: state = env.reset() state = running_state(state) reward_sum = 0 for t in range(maximum_steps): action = select_action(state, actor_net) action = action.data[0].numpy() next_state, reward, done, _ = env.step(action) reward_sum += reward next_state = running_state(next_state) mask = 1 if done: mask = 0 memory.push(state, np.array([action]), mask, next_state, reward) if render: env.render() if done: break state = next_state num_steps += (t-1) num_episodes += 1 reward_batch += reward_sum reward_batch /= num_episodes batch = memory.sample() plot_rew.append(reward_batch) update_params(batch, actor_net, actor_optimizer, gamma, tau, clip_epsilon) if i_episode % log_interval == 0: print('Episode {}\t Last reward: {}\tAverage reward {:.2f}'.format( i_episode, reward_sum, reward_batch)) plot_epi = [] for i in range (number_of_batches): plot_epi.append(i) trace = go.Scatter( x = plot_epi, y = plot_rew) layout = go.Layout(title='A2C',xaxis=dict(title='Episodes', titlefont=dict(family='Courier New, monospace',size=18,color='#7f7f7f')), yaxis=dict(title='Average Reward', titlefont=dict(family='Courier New, monospace',size=18,color='#7f7f7f'))) plotly.offline.plot({"data": [trace], "layout": layout},filename='PPO.html',image='jpeg') return