def global_model_eval(global_model, global_count):
    temp_model = DDPG(obs_dim=obs_dim, act_dim=act_dim, critic_dist_info=critic_dist_info)
    env = NormalizeAction(gym.make(args.env).env)
    env._max_episode_steps = 500

    while True:
        counter = to_numpy(global_count)[0]
        if counter >= 1000000:
            break

        temp_model.actor.load_state_dict(global_model.actor.state_dict())
        temp_model.critic.load_state_dict(global_model.critic.state_dict())
        temp_model.actor.eval()
        # bp()

        global global_returns
        state = env.reset()
        curr_return = 0
        step_count = 0
        while True:
            action = temp_model.actor(to_tensor(state.reshape((1, -1))))
            next_state, reward, done, _ = env.step(to_numpy(action).reshape(-1))
            curr_return += reward
            step_count += 1
            # print("Step count: ", step_count)
            if done or step_count > args.max_steps:
                break
            else:
                state = next_state

        global_returns.append((counter, 0.95 * global_returns[-1][1] + 0.05 * curr_return, curr_return))
        print("Global Steps: ", counter, "Global return: ", global_returns[-1][1],
              "Current return: ", curr_return)

        time.sleep(10)
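# ---------------------------------------------------------------------------
# Hedged sketch (not part of the original code): `global_model_eval` above
# assumes the global DDPG networks live in shared memory and that
# `global_count` is a shared tensor the training workers increment.  The helper
# below only illustrates that launch pattern with torch.multiprocessing, using
# a stand-in nn.Linear in place of the real model; it is illustrative and is
# not called anywhere in this module.
def _background_evaluator_launch_sketch(evaluate_fn):
    import torch
    import torch.multiprocessing as mp

    shared_model = torch.nn.Linear(4, 2)           # stand-in for the global model
    shared_model.share_memory()                    # parameters become visible to child processes
    shared_count = torch.zeros(1).share_memory_()  # shared global step counter

    proc = mp.Process(target=evaluate_fn, args=(shared_model, shared_count))
    proc.start()
    return proc, shared_model, shared_count
# ---------------------------------------------------------------------------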
class Worker(object):
    def __init__(self, name, optimizer_global_actor, optimizer_global_critic):
        self.env = NormalizeAction(gym.make(args.env).env)
        self.env._max_episode_steps = args.max_steps
        self.name = name

        self.ddpg = DDPG(obs_dim=obs_dim, act_dim=act_dim, env=self.env, memory_size=args.rmsize,
                         batch_size=args.bsize, tau=args.tau, gamma=args.gamma, n_steps=args.n_steps)
        self.ddpg.assign_global_optimizer(optimizer_global_actor, optimizer_global_critic)
        print('Initialized worker :', self.name)

    # fill the replay buffer with random-action transitions before training
    def warmup(self):
        n_steps = 0
        self.ddpg.actor.eval()
        # for i in range(args.n_eps):
        #     state = self.env.reset()
        #     for j in range(args.max_steps):
        state = self.env.reset()  # initial state for the warmup rollout
        for n_steps in range(args.warmup):
            action = np.random.uniform(-1.0, 1.0, size=act_dim)
            next_state, reward, done, _ = self.env.step(action)
            self.ddpg.replayBuffer.append(state, action, reward, done)
            if done:
                state = self.env.reset()
            else:
                state = next_state

    def work(self, global_ddpg):
        avg_reward = 0.
        n_steps = 0
        # self.warmup()
        self.ddpg.sync_local_global(global_ddpg)
        self.ddpg.hard_update()

        # Logging variables
        self.start_time = datetime.datetime.utcnow()
        self.train_logs = {}
        self.train_logs['avg_reward'] = []
        self.train_logs['total_reward'] = []
        self.train_logs['time'] = []
        self.train_logs['x_val'] = []
        self.train_logs['info_summary'] = "DDPG"
        self.train_logs['x'] = 'steps'
        step_counter = 0

        for i in range(args.n_eps):
            state = self.env.reset()
            total_reward = 0.
            episode_states = []
            episode_rewards = []
            episode_actions = []
            for j in range(args.max_steps):
                self.ddpg.actor.eval()

                state = state.reshape(1, -1)
                noise = self.ddpg.noise.sample()
                action = np.clip(to_numpy(self.ddpg.actor(to_tensor(state))).reshape(-1, ) + noise, -1.0, 1.0)
                # action = to_numpy(self.ddpg.actor(to_tensor(state))).reshape(-1, ) + noise
                next_state, reward, done, _ = self.env.step(action)
                total_reward += reward

                #### n-steps buffer
                episode_states.append(state)
                episode_actions.append(action)
                episode_rewards.append(reward)

                if j >= args.n_steps - 1:
                    # discounted n-step return over the last n_steps rewards
                    cum_reward = 0.
                    exp_gamma = 1
                    for k in range(-args.n_steps, 0):
                        cum_reward += exp_gamma * episode_rewards[k]
                        exp_gamma *= args.gamma
                    self.ddpg.replayBuffer.add(episode_states[-args.n_steps].reshape(-1),
                                               episode_actions[-args.n_steps], cum_reward, next_state, done)

                # self.ddpg.replayBuffer.add_experience(state.reshape(-1), action, reward, next_state, done)
                # self.ddpg.replayBuffer.append(state.reshape(-1), action, reward, done)
                self.ddpg.actor.train()
                self.ddpg.train(global_ddpg)

                step_counter += 1
                n_steps += 1
                if done:
                    break
                state = next_state

            # print("Episode ", i, "\t Step count: ", n_steps)
            self.ddpg.noise.reset()
            avg_reward = 0.95 * avg_reward + 0.05 * total_reward
            if i % 1 == 0:
                print('Episode ', i, '\tWorker :', self.name, '\tAvg Reward :', avg_reward,
                      '\tTotal reward :', total_reward, '\tSteps :', n_steps)
                self.train_logs['avg_reward'].append(avg_reward)
                self.train_logs['total_reward'].append(total_reward)
                self.train_logs['time'].append(
                    (datetime.datetime.utcnow() - self.start_time).total_seconds() / 60)
                self.train_logs['x_val'].append(step_counter)
                with open(args.logfile, 'wb') as fHandle:
                    pickle.dump(self.train_logs, fHandle, protocol=pickle.HIGHEST_PROTOCOL)
                with open(args.logfile_latest, 'wb') as fHandle:
                    pickle.dump(self.train_logs, fHandle, protocol=pickle.HIGHEST_PROTOCOL)
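# ---------------------------------------------------------------------------
# Hedged sketch (not part of the original code): both Worker variants build an
# n-step discounted return over the last `n_steps` rewards before pushing a
# transition into the replay buffer.  The standalone helper below reproduces
# that inner loop for clarity; `n_step_return` is an illustrative name only and
# is not used elsewhere in this module.
def n_step_return(rewards, gamma, n_steps):
    """Discounted sum of the last `n_steps` rewards, oldest first.

    Example: n_step_return([1.0, 1.0, 1.0], gamma=0.9, n_steps=2) == 1.0 + 0.9 * 1.0
    """
    cum_reward, exp_gamma = 0.0, 1.0
    for r in rewards[-n_steps:]:
        cum_reward += exp_gamma * r
        exp_gamma *= gamma
    return cum_reward
# ---------------------------------------------------------------------------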
class Worker(object):
    def __init__(self, name, optimizer_global_actor, optimizer_global_critic):
        self.env = NormalizeAction(gym.make(args.env).env)
        self.env._max_episode_steps = args.max_steps
        self.name = name

        self.ddpg = DDPG(obs_dim=obs_dim, act_dim=act_dim, env=self.env, memory_size=args.rmsize,
                         batch_size=args.bsize, tau=args.tau, critic_dist_info=critic_dist_info,
                         prioritized_replay=args.p_replay, gamma=args.gamma, n_steps=args.n_steps)
        self.ddpg.assign_global_optimizer(optimizer_global_actor, optimizer_global_critic)
        print('Initialized worker :', self.name)

    # warmup function to fill replay buffer initially
    def warmup(self):
        self.ddpg.actor.eval()
        # bp()
        for i in range(5000 // args.max_steps):
            addExperienceToBuffer(self.ddpg, self.ddpg.replayBuffer, self.env, her=args.her, her_ratio=0.8)
        # bp()
        return

        # NOTE: the code below is unreachable (the return above exits warmup first);
        # it is an earlier noisy-policy warmup variant.
        counter = 0
        state = self.env.reset()
        episode_states = []
        episode_rewards = []
        episode_actions = []
        while counter < args.warmup:
            action = to_numpy(self.ddpg.actor(to_tensor(state.reshape(-1))))  # np.random.uniform(-1.0, 1.0, size=act_dim)
            next_state, reward, done, _ = self.env.step(np.clip(action + self.ddpg.noise.sample(), -1, 1))

            #### n-steps buffer
            episode_states.append(state)
            episode_actions.append(action)
            episode_rewards.append(reward)

            if len(episode_states) >= args.n_steps:
                cum_reward = 0.
                exp_gamma = 1
                for k in range(-args.n_steps, 0):
                    try:
                        cum_reward += exp_gamma * episode_rewards[k]
                    except:
                        bp()
                    exp_gamma *= args.gamma
                self.ddpg.replayBuffer.add(episode_states[-args.n_steps].reshape(-1),
                                           episode_actions[-1], cum_reward, next_state, done)

            if done:
                episode_states = []
                episode_rewards = []
                episode_actions = []
                state = self.env.reset()
            else:
                state = next_state
            counter += 1

    def work(self, global_ddpg, global_count):
        avg_reward_train = 0.
        avg_reward_test = 0.
        self.ddpg.sync_local_global(global_ddpg)
        self.ddpg.hard_update()

        self.start_time = datetime.datetime.utcnow()
        self.warmup()

        # Logging variables
        self.train_logs = {}
        self.train_logs['avg_reward_train'] = []
        self.train_logs['avg_reward_test'] = []
        self.train_logs['total_reward_train'] = []
        self.train_logs['total_reward_test'] = []
        self.train_logs['time'] = []
        self.train_logs['x_val'] = []
        self.train_logs['info_summary'] = "Distributional DDPG_" + str(args.n_steps) + 'N'
        if args.p_replay:
            self.train_logs['info_summary'] = self.train_logs['info_summary'] + ' + PER'
        self.train_logs['x'] = 'steps'
        step_counter = 0

        # state = self.env.reset()
        # total_reward_train = 0.
        # episode_states = []
        # episode_rewards = []
        # episode_actions = []
        #
        # for j in range(args.max_steps):
        #     self.ddpg.actor.eval()
        #
        #     state = state.reshape(1, -1)
        #     noise = self.ddpg.noise.sample()
        #     action = np.clip(to_numpy(self.ddpg.actor(to_tensor(state))).reshape(-1, ) + noise, -1.0, 1.0)
        #     next_state, reward, done, _ = self.env.step(action)
        #     total_reward_train += reward
        #
        #     #### n-steps buffer
        #     episode_states.append(state)
        #     episode_actions.append(action)
        #     episode_rewards.append(reward)
        #
        #     if j >= args.n_steps - 1:
        #         cum_reward = 0.
        #         exp_gamma = 1
        #         for k in range(-args.n_steps, 0):
        #             cum_reward += exp_gamma * episode_rewards[k]
        #             exp_gamma *= args.gamma
        #         self.ddpg.replayBuffer.add(episode_states[-args.n_steps].reshape(-1), episode_actions[-args.n_steps], cum_reward, next_state, done)
        #
        # # self.ddpg.replayBuffer.add(state.reshape(-1), action, reward, next_state, done)

        for i in range(args.n_eps):  # outer epoch loop, reported as "Epoch" in the print below
            for cycle in range(50):
                # collect 16 episodes of (optionally HER-augmented) experience
                for episode_count in range(16):
                    addExperienceToBuffer(self.ddpg, self.ddpg.replayBuffer, self.env, her=args.her, her_ratio=0.8)

                # 40 optimisation steps against the global networks
                for j in range(40):
                    self.ddpg.actor.train()
                    self.ddpg.train(global_ddpg)
                    step_counter += 1
                    global_count += 1

                # evaluation: greedy rollouts with the current actor
                success = 0
                success_steps = []
                nTrials = 10
                for k in range(nTrials):
                    total_reward_test = 0.
                    episode_rewards = []
                    episode_states = []
                    episode_success = []
                    state = self.env.reset()
                    cc = 0
                    for j in range(args.max_steps):
                        cc += 1
                        self.ddpg.actor.eval()

                        state = np.concatenate((state['observation'], state['desired_goal']))
                        state = state.reshape(1, -1)
                        action = to_numpy(self.ddpg.actor(to_tensor(state))).reshape(-1)
                        action = np.clip(action, -1.0, 1.0)
                        next_state, reward, done, info = self.env.step(action)
                        done = bool(info['is_success'])
                        total_reward_test += reward

                        episode_rewards.append((j, reward))
                        episode_states.append((j, state))
                        episode_success.append((j, info['is_success']))
                        # if reward == 0 and j != 49:
                        #     bp()

                        if done:
                            success += 1
                            success_steps.append(j)
                            break
                        else:
                            state = next_state

                    # if total_reward_test > -50:
                    #     print("Reward: ", total_reward_test, "\t Done: ", done, "\t success: ", success)
                    #     print("Episode rewards \n", episode_rewards, "\n")
                    #     print("Episode rewards \n", episode_states, "\n")
                    #     bp()
                    avg_reward_test = 0.95 * avg_reward_test + 0.05 * total_reward_test

                success_rate = float(success) / nTrials
                print("Epoch: ", i, "\t Cycle: ", cycle, '\tAvg Reward Test:', avg_reward_test,
                      '\tTest success steps :', success_steps, '\t Success Rate', success_rate,
                      '\tSteps :', step_counter)

                # writer.add_scalar('train_reward', total_reward_train, n_steps)
                writer.add_scalar('avg_test_reward', avg_reward_test, step_counter)
                writer.add_scalar('success_rate', success_rate, step_counter)

                # self.train_logs['avg_reward_train'].append(avg_reward_train)
                # self.train_logs['avg_reward_test'].append(avg_reward_test)
                #
                # self.train_logs['total_reward_train'].append(total_reward_train)
                # self.train_logs['total_reward_test'].append(total_reward_test)
                # self.train_logs['time'].append((datetime.datetime.utcnow() - self.start_time).total_seconds() / 60)
                # self.train_logs['x_val'].append(step_counter)
                # with open(args.logfile, 'wb') as fHandle:
                #     pickle.dump(self.train_logs, fHandle, protocol=pickle.HIGHEST_PROTOCOL)
                # with open(args.logfile_latest, 'wb') as fHandle:
                #     pickle.dump(self.train_logs, fHandle, protocol=pickle.HIGHEST_PROTOCOL)

                # self.ddpg.noise.reset()

        torch.save(self.ddpg.actor.state_dict(), path + '/actor.pth')
        torch.save(self.ddpg.critic.state_dict(), path + '/critic.pth')
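# ---------------------------------------------------------------------------
# Hedged sketch (not part of the original code): `addExperienceToBuffer(...,
# her=args.her, her_ratio=0.8)` is defined elsewhere in this repository.  The
# helper below only illustrates the basic hindsight-relabelling idea behind it,
# assuming a gym GoalEnv-style interface with dict observations
# {'observation', 'achieved_goal', 'desired_goal'} and an env.compute_reward
# method; it is not used anywhere in this module.
def _her_relabel_sketch(obs, next_obs, env):
    """Replay a transition as if the goal actually achieved had been the desired goal."""
    import numpy as np  # local import keeps this sketch self-contained

    new_goal = next_obs['achieved_goal']
    # goal-conditioned input, built the same way as in the test rollouts above
    state = np.concatenate((obs['observation'], new_goal))
    # reward recomputed against the substituted goal (info dict left empty)
    reward = env.compute_reward(next_obs['achieved_goal'], new_goal, {})
    return state, reward
# ---------------------------------------------------------------------------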