def collect_samples(pid, queue, env, policy, stochastic, render, running_state,
                    min_batch_size):
    torch.randn(pid)
    log = dict()
    memory = Memory()
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    num_episodes = 0

    while num_steps < min_batch_size:
        state = env.reset()
        if running_state is not None:
            state = running_state(state)
        reward_episode = 0

        for t in range(10000):
            state_var = tensor(state).unsqueeze(0)
            with torch.no_grad():
                if not stochastic:
                    action = policy(state_var)[0][0].numpy()
                else:
                    action = policy.select_action(state_var)[0].numpy()
            action = action.astype(np.float64)
            next_state, reward, done, _ = env.step(action)
            reward_episode += reward
            if running_state is not None:
                next_state = running_state(next_state)

            mask = 0 if done else 1
            memory.push(state, action, mask, next_state, reward)
            if render:
                env.render()
            if done:
                break
            state = next_state

        # log stats
        num_steps += (t + 1)
        num_episodes += 1
        total_reward += reward_episode
        min_reward = min(min_reward, reward_episode)
        max_reward = max(max_reward, reward_episode)

    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = total_reward / num_episodes
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward

    if queue is not None:
        queue.put([pid, memory, log])
    else:
        return memory, log
def collect_samples(pid, queue, env, policy, custom_reward, mean_action, render,
                    running_state, min_batch_size):
    if pid > 0:
        torch.manual_seed(torch.randint(0, 5000, (1,)) * pid)
        if hasattr(env, 'np_random'):
            env.np_random.seed(env.np_random.randint(5000) * pid)
        if hasattr(env, 'env') and hasattr(env.env, 'np_random'):
            env.env.np_random.seed(env.env.np_random.randint(5000) * pid)
    log = dict()
    memory = Memory()
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0

    while num_steps < min_batch_size:
        state = env.reset()
        if running_state is not None:
            state = running_state(state)
        reward_episode = 0

        for t in range(10000):
            state_var = tensor(state).unsqueeze(0)
            with torch.no_grad():
                if mean_action:
                    action = policy(state_var)[0][0].numpy()
                else:
                    action = policy.select_action(state_var)[0].numpy()
            action = int(action) if policy.is_disc_action else action.astype(np.float64)
            next_state, reward, done, _ = env.step(action)
            reward_episode += reward
            if running_state is not None:
                next_state = running_state(next_state)

            if custom_reward is not None:
                reward = custom_reward(state, action)
                total_c_reward += reward
                min_c_reward = min(min_c_reward, reward)
                max_c_reward = max(max_c_reward, reward)

            mask = 0 if done else 1
            memory.push(state, action, mask, next_state, reward)
            if render:
                env.render()
            if done:
                break
            state = next_state

        # log stats
        num_steps += (t + 1)
        num_episodes += 1
        total_reward += reward_episode
        min_reward = min(min_reward, reward_episode)
        max_reward = max(max_reward, reward_episode)

    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = total_reward / num_episodes
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    if custom_reward is not None:
        log['total_c_reward'] = total_c_reward
        log['avg_c_reward'] = total_c_reward / num_steps
        log['max_c_reward'] = max_c_reward
        log['min_c_reward'] = min_c_reward

    if queue is not None:
        queue.put([pid, memory, log])
    else:
        return memory, log
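# Every collect_samples variant in this file assumes a `Memory` object with
# push() and sample(). The sketch below is a minimal reconstruction of that
# interface following the common namedtuple-based pattern; the field names and
# the append() helper are assumptions, not necessarily the exact ones each
# variant was written against (several variants push extra fields).
from collections import namedtuple
import random

Transition = namedtuple('Transition',
                        ('state', 'action', 'mask', 'next_state', 'reward'))


class Memory(object):
    def __init__(self):
        self.memory = []

    def push(self, *args):
        # Store one transition; callers pass the fields positionally.
        self.memory.append(Transition(*args))

    def sample(self, batch_size=None):
        # Return all transitions (or a random subset) as a Transition of tuples.
        if batch_size is None:
            return Transition(*zip(*self.memory))
        random_batch = random.sample(self.memory, batch_size)
        return Transition(*zip(*random_batch))

    def append(self, other):
        # Merge transitions collected by another worker.
        self.memory += other.memory

    def __len__(self):
        return len(self.memory)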
def train_gail(self, expert):
    '''Train Info-GAIL.'''
    args, dtype = self.args, self.dtype
    results = {'average_reward': [], 'episode_reward': [],
               'true_traj': {}, 'pred_traj': {}}

    self.train_step_count, self.gail_step_count = 0, 0

    for ep_idx in range(args.num_epochs):
        memory = Memory()

        num_steps = 0
        reward_batch, true_reward_batch = [], []
        expert_true_reward_batch = []
        true_traj_curr_episode, gen_traj_curr_episode = [], []

        while num_steps < args.batch_size:
            traj_expert = expert.sample(size=1)
            state_expert, action_expert, _, _ = traj_expert

            # Expert state and actions
            state_expert = state_expert[0]
            action_expert = action_expert[0]
            expert_episode_len = len(state_expert)

            # Sample start state or should we just choose the start state
            # from the expert trajectory sampled above.
            # curr_state_obj = self.sample_start_state()
            curr_state_obj = State(state_expert[0], self.obstacles)
            curr_state_feat = self.get_state_features(
                curr_state_obj, self.args.use_state_features)

            # Add history to state
            if args.history_size > 1:
                curr_state = -1 * np.ones(
                    (args.history_size * curr_state_feat.shape[0]),
                    dtype=np.float32)
                curr_state[(args.history_size - 1)
                           * curr_state_feat.shape[0]:] = curr_state_feat
            else:
                curr_state = curr_state_feat

            # TODO: Make this a separate function. Can be parallelized.
            ep_reward, ep_true_reward, expert_true_reward = 0, 0, 0
            true_traj, gen_traj = [], []
            gen_traj_dict = {'features': [], 'actions': [], 'c': [], 'mask': []}
            disc_reward, posterior_reward = 0.0, 0.0

            # Use a hard-coded list for memory to gather experience since we
            # need to mutate it before finally creating a memory object.
            c_sampled = np.zeros((self.num_goals), dtype=np.float32)
            c_sampled[np.random.randint(0, self.num_goals)] = 1.0
            c_sampled_tensor = torch.zeros((1)).type(torch.LongTensor)
            c_sampled_tensor[0] = int(np.argmax(c_sampled))
            if self.args.cuda:
                c_sampled_tensor = torch.cuda.LongTensor(c_sampled_tensor)

            memory_list = []
            for t in range(expert_episode_len):
                action = self.select_action(
                    np.concatenate((curr_state, c_sampled)))
                action_numpy = action.data.cpu().numpy()

                # Save generated and true trajectories
                true_traj.append((state_expert[t], action_expert[t]))
                gen_traj.append((curr_state_obj.coordinates, action_numpy))
                gen_traj_dict['features'].append(
                    self.get_state_features(curr_state_obj,
                                            self.args.use_state_features))
                gen_traj_dict['actions'].append(action_numpy)
                gen_traj_dict['c'].append(c_sampled)

                action = epsilon_greedy_linear_decay(action_numpy,
                                                     args.num_epochs * 0.5,
                                                     ep_idx,
                                                     self.action_size,
                                                     low=0.05,
                                                     high=0.3)

                # Get the discriminator reward
                disc_reward_t = float(self.reward_net(torch.cat(
                    (Variable(torch.from_numpy(curr_state).unsqueeze(0)).type(dtype),
                     Variable(torch.from_numpy(
                         oned_to_onehot(action, self.action_size)).unsqueeze(0)).type(dtype)),
                    1)).data.cpu().numpy()[0, 0])

                if args.use_log_rewards and disc_reward_t < 1e-6:
                    disc_reward_t += 1e-6

                disc_reward_t = -math.log(disc_reward_t) \
                    if args.use_log_rewards else -disc_reward_t
                disc_reward += disc_reward_t

                # Predict c given (x_t)
                predicted_posterior = self.posterior_net(
                    Variable(torch.from_numpy(curr_state).unsqueeze(0)).type(dtype))
                posterior_reward_t = self.criterion_posterior(
                    predicted_posterior,
                    Variable(c_sampled_tensor)).data.cpu().numpy()[0]
                posterior_reward += (self.args.lambda_posterior * posterior_reward_t)

                # Update Rewards
                ep_reward += (disc_reward_t + posterior_reward_t)

                true_goal_state = [int(x) for x in state_expert[-1].tolist()]
                if self.args.flag_true_reward == 'grid_reward':
                    ep_true_reward += self.true_reward.reward_at_location(
                        curr_state_obj.coordinates, goals=[true_goal_state])
                    expert_true_reward += self.true_reward.reward_at_location(
                        state_expert[t], goals=[true_goal_state])
                elif self.args.flag_true_reward == 'action_reward':
                    ep_true_reward += self.true_reward.reward_at_location(
                        np.argmax(action_expert[t]), action)
                    expert_true_reward += self.true_reward.corret_action_reward
                else:
                    raise ValueError("Incorrect true reward type")

                # Update next state
                next_state_obj = self.transition_func(
                    curr_state_obj, Action(action), 0)
                next_state_feat = self.get_state_features(
                    next_state_obj, self.args.use_state_features)
                # next_state = running_state(next_state)

                mask = 0 if t == expert_episode_len - 1 else 1

                # Push to memory
                memory_list.append([
                    curr_state,
                    np.array([oned_to_onehot(action, self.action_size)]),
                    mask,
                    next_state_feat,
                    disc_reward_t + posterior_reward_t,
                    c_sampled,
                    c_sampled])

                if args.render:
                    env.render()

                if not mask:
                    break

                curr_state_obj = next_state_obj
                curr_state_feat = next_state_feat

                if args.history_size > 1:
                    curr_state[:(args.history_size - 1) * curr_state_feat.shape[0]] = \
                        curr_state[curr_state_feat.shape[0]:]
                    curr_state[(args.history_size - 1)
                               * curr_state_feat.shape[0]:] = curr_state_feat
                else:
                    curr_state = curr_state_feat

            assert memory_list[-1][2] == 0, \
                "Mask for final end state is not 0."
            for memory_t in memory_list:
                memory.push(*memory_t)

            self.logger.summary_writer.add_scalars(
                'gen_traj/gen_reward',
                {'discriminator': disc_reward, 'posterior': posterior_reward},
                self.train_step_count)

            num_steps += (t - 1)
            reward_batch.append(ep_reward)
            true_reward_batch.append(ep_true_reward)
            expert_true_reward_batch.append(expert_true_reward)
            results['episode_reward'].append(ep_reward)

            # Append trajectories
            true_traj_curr_episode.append(true_traj)
            gen_traj_curr_episode.append(gen_traj)

        results['average_reward'].append(np.mean(reward_batch))

        # Add to tensorboard
        self.logger.summary_writer.add_scalars(
            'gen_traj/reward',
            {'average': np.mean(reward_batch),
             'max': np.max(reward_batch),
             'min': np.min(reward_batch)},
            self.train_step_count)
        self.logger.summary_writer.add_scalars(
            'gen_traj/true_reward',
            {'average': np.mean(true_reward_batch),
             'max': np.max(true_reward_batch),
             'min': np.min(true_reward_batch),
             'expert_true': np.mean(expert_true_reward_batch)},
            self.train_step_count)

        # Add predicted and generated trajectories to results
        if ep_idx % self.args.save_interval == 0:
            results['true_traj'][ep_idx] = copy.deepcopy(true_traj_curr_episode)
            results['pred_traj'][ep_idx] = copy.deepcopy(gen_traj_curr_episode)

        # Update parameters
        gen_batch = memory.sample()

        # We do not get the context variable from expert trajectories.
        # Hence we need to fill it in later.
        expert_batch = expert.sample(size=args.num_expert_trajs)

        self.update_params(gen_batch, expert_batch, ep_idx,
                           args.optim_epochs, args.optim_batch_size)

        self.train_step_count += 1

        if ep_idx > 0 and ep_idx % args.log_interval == 0:
            print('Episode [{}/{}]  Avg R: {:.2f}  Max R: {:.2f} \t'
                  'True Avg {:.2f}  True Max R: {:.2f}  '
                  'Expert (Avg): {:.2f}'.format(
                      ep_idx, args.num_epochs, np.mean(reward_batch),
                      np.max(reward_batch), np.mean(true_reward_batch),
                      np.max(true_reward_batch),
                      np.mean(expert_true_reward_batch)))

        results_path = os.path.join(args.results_dir, 'results.pkl')
        with open(results_path, 'wb') as results_f:
            pickle.dump((results), results_f, protocol=2)
            # print("Did save results to {}".format(results_path))

        if ep_idx % args.save_interval == 0:
            checkpoint_filepath = self.model_checkpoint_filepath(ep_idx)
            torch.save(self.checkpoint_data_to_save(), checkpoint_filepath)
            print("Did save checkpoint: {}".format(checkpoint_filepath))
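# train_gail above calls a few helpers that are not defined in this file
# (oned_to_onehot, epsilon_greedy_linear_decay). The sketches below are
# plausible reconstructions under the assumption that actions are discrete
# indices; they are illustrative, not the original implementations.
import numpy as np


def oned_to_onehot(action_idx, num_actions):
    # Convert a discrete action index into a one-hot vector.
    onehot = np.zeros((num_actions,), dtype=np.float32)
    onehot[int(action_idx)] = 1.0
    return onehot


def epsilon_greedy_linear_decay(action_vector, decay_epochs, epoch,
                                num_actions, low=0.1, high=0.9):
    # With probability eps (annealed linearly from `high` to `low` over
    # `decay_epochs`), pick a random action; otherwise act greedily on the
    # policy's action vector.
    eps = max(low, high - (high - low) * (epoch / float(decay_epochs)))
    if np.random.rand() < eps:
        return np.random.randint(num_actions)
    return int(np.argmax(action_vector))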
def collect_samples(pid, queue, env, policy, custom_reward, mean_action, render,
                    running_state, min_batch_size):
    torch.randn(pid)
    log = dict()
    memory = Memory()
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0

    while num_steps < min_batch_size:
        state = env.reset()
        if running_state is not None:
            state = running_state(state)
        reward_episode = 0
        repeat = 0
        repeat_len = 0
        last_action = None
        ready_to_push = False
        reward_period = 0
        interval = 0

        for t in range(10000):
            state_var = tensor(state).unsqueeze(0)

            # Learning to Repeat only predicts when repeat is reduced to 0.
            assert (repeat >= 0)
            if repeat <= 0:
                with torch.no_grad():
                    if mean_action:
                        action = policy(state_var)[0][0].numpy()
                    else:
                        # action, repeat = policy.select_action(state_var)[0].numpy()
                        action, repeat = policy.select_action(state_var)
                        action = action[0].numpy()
                        repeat = repeat[0].numpy()
                action = int(action) if policy.is_disc_action else action.astype(np.float64)
                # action = action.tolist()
                last_action = action
                repeat = int(repeat)
                repeat_len = repeat
                ready_to_push = True
                reward_period = 0
                interval = 0

            next_state, reward, done, _ = env.step(last_action)
            reward_episode += reward
            # reward_period += reward
            # `args` is assumed to be a module-level config here; it is not passed in.
            reward_period += reward * (args.gamma**(repeat_len - repeat))
            interval += 1
            if repeat > 0:
                repeat -= 1
            if running_state is not None:
                next_state = running_state(next_state)

            if custom_reward is not None:
                reward = custom_reward(state, action)
                total_c_reward += reward
                min_c_reward = min(min_c_reward, reward)
                max_c_reward = max(max_c_reward, reward)

            mask = 0 if done else 1

            if ready_to_push == True or done:
                repeat = 0 if done else repeat
                memory.push(state, last_action, mask, next_state, reward_period, repeat)
                ready_to_push = False
                interval = 0

            if render:
                env.render()
            if done:
                break
            state = next_state

        # log stats
        num_steps += (t + 1)
        num_episodes += 1
        total_reward += reward_episode
        min_reward = min(min_reward, reward_episode)
        max_reward = max(max_reward, reward_episode)

    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = total_reward / num_episodes
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    if custom_reward is not None:
        log['total_c_reward'] = total_c_reward
        log['avg_c_reward'] = total_c_reward / num_steps
        log['max_c_reward'] = max_c_reward
        log['min_c_reward'] = min_c_reward

    if queue is not None:
        queue.put([pid, memory, log])
    else:
        return memory, log
def collect_samples(pid, queue, env, policy, custom_reward, mean_action, tensor,
                    render, running_state, update_rs, min_batch_size):
    torch.randn(pid)
    log = dict()
    memory = Memory()
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0

    while num_steps < min_batch_size:
        state = env.reset()
        if running_state is not None:
            state = running_state(state, update=update_rs)
        reward_episode = 0

        for t in range(10000):
            state_var = Variable(tensor(state).unsqueeze(0), volatile=True)
            if mean_action:
                action = policy(state_var)[0].data[0].numpy()
            else:
                action = policy.select_action(state_var)[0].numpy()
            action = int(action) if policy.is_disc_action else action.astype(np.float64)
            next_state, reward, done, _ = env.step(action)
            reward_episode += reward
            if running_state is not None:
                next_state = running_state(next_state, update=update_rs)

            if custom_reward is not None:
                reward = custom_reward(state, action)
                total_c_reward += reward
                min_c_reward = min(min_c_reward, reward)
                max_c_reward = max(max_c_reward, reward)

            mask = 0 if done else 1
            memory.push(state, action, mask, next_state, reward)
            if render:
                env.render()
            if done:
                break
            state = next_state

        # log stats
        num_steps += (t + 1)
        num_episodes += 1
        total_reward += reward_episode
        min_reward = min(min_reward, reward_episode)
        max_reward = max(max_reward, reward_episode)

    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = total_reward / num_episodes
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    if custom_reward is not None:
        log['total_c_reward'] = total_c_reward
        log['avg_c_reward'] = total_c_reward / num_steps
        log['max_c_reward'] = max_c_reward
        log['min_c_reward'] = min_c_reward

    if queue is not None:
        queue.put([pid, memory, log])
    else:
        return memory, log
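# The variant above still uses the pre-0.4 `Variable(..., volatile=True)` idiom.
# On current PyTorch the same "no gradient tracking" effect is obtained with
# torch.no_grad(); a minimal sketch of the equivalent action-selection step,
# assuming `policy`/`select_action` behave as in the code above:
import torch


def select_action_no_grad(policy, state, mean_action=False):
    state_var = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
    with torch.no_grad():  # replaces volatile=True
        if mean_action:
            action = policy(state_var)[0][0].numpy()
        else:
            action = policy.select_action(state_var)[0].numpy()
    return action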
def collect_samples(pid, queue, env, policy, custom_reward, mean_action, render, min_batch_size): try: if isinstance(env, str): if render: env = gym.make(env, render_mode='human') else: env = gym.make(env) torch.randn(pid) log = dict() memory = Memory() num_steps = 0 total_reward = 0 min_reward = 1e6 max_reward = -1e6 total_c_reward = 0 min_c_reward = 1e6 max_c_reward = -1e6 num_episodes = 0 while num_steps < min_batch_size: state = env.reset() reward_episode = 0 for t in range(600): state_var = tensor(state).unsqueeze(0) with torch.no_grad(): if mean_action: action = policy(state_var)[0][0].numpy() else: action = policy.select_action(state_var)[0].numpy() action = int(action) if policy.is_disc_action else action.astype(np.float64) next_state, reward, done, _ = env.step(action) reward_episode += reward if custom_reward is not None: reward = custom_reward(state, action) total_c_reward += reward min_c_reward = min(min_c_reward, reward) max_c_reward = max(max_c_reward, reward) mask = 0 if done else 1 memory.push(state, action, mask, next_state, reward) if render: env.render() if done: break state = next_state # log stats num_steps += (t + 1) num_episodes += 1 total_reward += reward_episode min_reward = min(min_reward, reward_episode) max_reward = max(max_reward, reward_episode) log['num_steps'] = num_steps log['num_episodes'] = num_episodes log['total_reward'] = total_reward log['avg_reward'] = total_reward / num_episodes log['max_reward'] = max_reward log['min_reward'] = min_reward if custom_reward is not None: log['total_c_reward'] = total_c_reward log['avg_c_reward'] = total_c_reward / num_steps log['max_c_reward'] = max_c_reward log['min_c_reward'] = min_c_reward if queue is not None: queue.put([pid, memory, log]) else: return memory, log except Exception as e: if queue is not None: queue.put([pid, memory, log]) else: raise e
def collect_samples(pid, queue, env, policy, custom_reward, mean_action, render,
                    running_state, min_batch_size, num_agents=1):
    torch.randn(pid)
    log = dict()
    memory = Memory()
    num_steps = 0
    total_reward = [0] * num_agents
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0
    # render = False

    while num_steps < min_batch_size:
        state = env.reset()
        if running_state is not None:
            state = running_state(state)
        reward_episode = [0] * num_agents

        for t in range(150):
            # TODO: temporarily for a single agent we make the state
            # artificially into a list.
            # state_var = [tensor(state).unsqueeze(0), tensor(state).unsqueeze(0)]
            state_var = [tensor(s).unsqueeze(0) for s in state]
            with torch.no_grad():
                if mean_action:
                    action = policy(state_var)[0][0].numpy()
                else:
                    action = policy.select_action(state_var)
                    # action_var = torch.cat(action, dim=1)[0].numpy()
                    action = [a[0].numpy().tolist() for a in action]
            # TODO: this is added so that the prey is automatically controlled
            # by arbitrary input.
            # action.append(np.array([0.0, 0.0, 0.0, 0.0, 0.0]))
            action_var = action
            # action = int(action) if policy.is_disc_action else action.astype(np.float64)

            # TODO: while we use an environment that doesn't accept multi-agent
            # action lists
            next_state, reward, done, _ = env.step(action_var)
            for r in range(len(reward_episode)):
                reward_episode[r] += reward[r]
            if running_state is not None:
                next_state = running_state(next_state)

            if custom_reward is not None:
                reward = custom_reward(state, action)
                total_c_reward += reward
                min_c_reward = min(min_c_reward, reward)
                max_c_reward = max(max_c_reward, reward)

            # mask = 0 if done else 1
            mask = [float(d) for d in done]
            memory.push(state, action, mask, next_state, reward)
            if render:
                env.render()
                time.sleep(0.1)
            if np.all(done):
                break
            state = next_state

        # log stats
        num_steps += (t + 1)
        num_episodes += 1
        for r in range(len(reward_episode)):
            total_reward[r] += reward_episode[r]
        min_reward = min(min_reward, np.min(reward_episode))
        max_reward = max(max_reward, np.max(reward_episode))

    log['num_steps'] = num_steps
    log['avg_steps'] = num_steps / num_episodes
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = [r / num_episodes for r in total_reward]
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    if custom_reward is not None:
        log['total_c_reward'] = total_c_reward
        log['avg_c_reward'] = total_c_reward / num_steps
        log['max_c_reward'] = max_c_reward
        log['min_c_reward'] = min_c_reward

    if queue is not None:
        queue.put([pid, memory, log])
    else:
        return memory, log
def collect_samples(pid, queue, env, policy, custom_reward, mean_action, render,
                    running_state, min_batch_size, use_reparametrization=False):
    torch.randn(pid)
    log = dict()
    memory = Memory()
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0
    lenght_lists = []

    while num_steps < min_batch_size:
        state = env.reset()
        if running_state is not None:
            state = running_state(state)
        reward_episode = 0

        for t in range(250):
            state_var = tensor(state).unsqueeze(0)
            with torch.no_grad():
                if mean_action:
                    # action = policy(state_var)[0][0].numpy()
                    action = policy.select_action_deterministic(state_var)[0].numpy()
                else:
                    action = policy.select_action_stochastic(state_var)[0].numpy()
            action = int(action) if policy.is_disc_action else action.astype(np.float64)
            next_state, reward, done, _ = env.step(
                np.clip(action * 100, a_min=-100, a_max=100))
            reward_episode += reward
            if running_state is not None:
                next_state = running_state(next_state)

            if custom_reward is not None:
                reward = custom_reward(state, action)
                total_c_reward += reward
                min_c_reward = min(min_c_reward, reward)
                max_c_reward = max(max_c_reward, reward)

            mask = 0 if done else 1
            memory.push(state, action, mask, next_state, reward)
            if render:
                env.render()
            # if (done) or ((t % 249 == 0) and t > 0):
            if (done):
                break
            state = next_state

        lenght_lists.append(t)
        # log stats
        num_steps += (t + 1)
        num_episodes += 1
        total_reward += reward_episode
        min_reward = min(min_reward, reward_episode)
        max_reward = max(max_reward, reward_episode)

    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = total_reward / num_episodes
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    log['lenght_mean'] = np.mean(lenght_lists)
    log['lenght_min'] = np.min(lenght_lists)
    log['lenght_max'] = np.max(lenght_lists)
    log['lenght_std'] = np.std(lenght_lists)
    if custom_reward is not None:
        log['total_c_reward'] = total_c_reward
        log['avg_c_reward'] = total_c_reward / num_steps
        log['avg_c_reward_per_episode'] = total_c_reward / num_episodes
        log['max_c_reward'] = max_c_reward
        log['min_c_reward'] = min_c_reward

    if queue is not None:
        queue.put([pid, memory, log])
    else:
        return memory, log
def collect_samples(pid, queue, env, policy, custom_reward, mean_action, tensor, render, running_state, update_rs, min_batch_size, mode_list, state_type, num_steps_per_mode): torch.randn(pid, ) log = dict() memory = Memory() num_steps = 0 total_reward = 0 min_reward = 1e6 max_reward = -1e6 total_c_reward = 0 min_c_reward = 1e6 max_c_reward = -1e6 num_episodes = 0 max_t = num_steps_per_mode * len(mode_list) - 1 while num_steps < min_batch_size: state = env.reset() if state_type == 'decayed_context': state = np.concatenate((state, np.array([1.0]), activity_map(mode_list[0]), activity_map(mode_list[min(1, len(mode_list)-1)])), axis=0) elif state_type == 'context': state = np.concatenate((state, activity_map(mode_list[0]), activity_map(mode_list[min(1, len(mode_list)-1)])), axis=0) if running_state is not None: state = running_state(state, update=update_rs) reward_episode = 0 for t in range(10000): curr_mode_id = t // num_steps_per_mode if t % num_steps_per_mode == 0: if hasattr(env.env, 'mode'): env.env.mode = mode_list[curr_mode_id] state_var = Variable(tensor(state).unsqueeze(0), volatile=True) if mean_action: action = policy(state_var)[0].data[0].numpy() else: action = policy.select_action(state_var)[0].numpy() action = int(action) if policy.is_disc_action else action.astype(np.float64) next_state, reward, done, _ = env.step(action) reward_episode += reward next_mode_id = min(t+1, max_t) // num_steps_per_mode if state_type == 'decayed_context': next_state = np.concatenate((next_state, np.array([1/((t % num_steps_per_mode) + 1)]), activity_map(mode_list[next_mode_id]), activity_map(mode_list[min(next_mode_id+1, len(mode_list)-1)])), axis=0) elif state_type == 'context': next_state = np.concatenate((next_state, activity_map(mode_list[next_mode_id]), activity_map(mode_list[min(next_mode_id+1, len(mode_list)-1)])), axis=0) if running_state is not None: next_state = running_state(next_state, update=update_rs) if custom_reward is not None: reward = custom_reward(state, action) total_c_reward += reward min_c_reward = min(min_c_reward, reward) max_c_reward = max(max_c_reward, reward) if t == num_steps_per_mode * len(mode_list) - 1: done = True mask = 0 if done else 1 memory.push(state, action, mask, next_state, reward) if render: env.render() if done: break state = next_state # log stats num_steps += (t + 1) num_episodes += 1 total_reward += reward_episode min_reward = min(min_reward, reward_episode) max_reward = max(max_reward, reward_episode) log['num_steps'] = num_steps log['num_episodes'] = num_episodes log['total_reward'] = total_reward log['avg_reward'] = total_reward / num_episodes log['max_reward'] = max_reward log['min_reward'] = min_reward if custom_reward is not None: log['total_c_reward'] = total_c_reward log['avg_c_reward'] = total_c_reward / num_steps log['max_c_reward'] = max_c_reward log['min_c_reward'] = min_c_reward if queue is not None: queue.put([pid, memory, log]) else: return memory, log
def collect_samples(pid, queue, env, policy, custom_reward, mean_action, render, running_state, min_batch_size): torch.randn(pid) log = dict() log['reward_list'] = list() memory = Memory() num_steps = 0 total_reward = 0 min_reward = 1e6 max_reward = -1e6 total_c_reward = 0 min_c_reward = 1e6 max_c_reward = -1e6 num_episodes = 0 # tbd, should match build main dtype dtype = torch.float64 while num_steps < min_batch_size: state = env.reset() if running_state is not None: state = running_state(state) reward_episode = 0 for t in range(10000): #print('t:{:.1f}\tnum_steps:{:.1f}\tmin_batch_size:{:.1f}'.format(t,num_steps,min_batch_size)) # tbd, add .to(dtype) state_var = tensor(state).to(dtype).unsqueeze(0) with torch.no_grad(): if mean_action: action = policy(state_var)[0][0].numpy() else: action = policy.select_action(state_var)[0].numpy() action = int(action) if policy.is_disc_action else action.astype( np.float64) next_state, reward, done, _ = env.step(action) reward_episode += reward if running_state is not None: next_state = running_state(next_state) if custom_reward is not None: reward = custom_reward.expert_reward(state, action) total_c_reward += reward min_c_reward = min(min_c_reward, reward) max_c_reward = max(max_c_reward, reward) mask = 0 if done else 1 memory.push(state, action, mask, next_state, reward) if render: env.render() if done: break #if t > min_batch_size: # break state = next_state # log stats num_steps += (t + 1) num_episodes += 1 total_reward += reward_episode min_reward = min(min_reward, reward_episode) max_reward = max(max_reward, reward_episode) log['reward_list'].append(reward_episode) log['num_steps'] = num_steps log['num_episodes'] = num_episodes log['total_reward'] = total_reward log['avg_reward'] = total_reward / num_episodes # tbd: num_episodes -> num_steps log['max_reward'] = max_reward log['min_reward'] = min_reward if custom_reward is not None: log['total_c_reward'] = total_c_reward log['avg_c_reward'] = total_c_reward / num_steps log['max_c_reward'] = max_c_reward log['min_c_reward'] = min_c_reward if queue is not None: queue.put([pid, memory, log]) else: return memory, log
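# The variant above delegates reward shaping to custom_reward.expert_reward(state, action).
# A minimal sketch of such an object, assuming a GAIL-style discriminator whose
# output is the probability that the pair is generator data and using the common
# surrogate reward -log D(s, a); the class and attribute names are illustrative
# assumptions, not the original implementation.
import math
import numpy as np
import torch


class DiscriminatorReward:
    def __init__(self, discrim_net, dtype=torch.float32):
        self.discrim_net = discrim_net
        self.dtype = dtype

    def expert_reward(self, state, action):
        state_action = torch.as_tensor(np.hstack([state, action]), dtype=self.dtype)
        with torch.no_grad():
            d = float(self.discrim_net(state_action.unsqueeze(0))[0].item())
        # Small epsilon keeps the log finite when the discriminator saturates.
        return -math.log(max(d, 1e-8))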
def collect_samples(pid, queue, env, policy, custom_reward, mean_action, render,
                    running_state, min_batch_size, aux_running_state,
                    intervention_device=None):
    torch.randn(pid)
    log = dict()
    extra_mem_fields = []
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0
    aux_state = None
    next_aux_state = None

    car_racing_env = env.spec.id == 'CarRacing-v0'
    is_img_state = len(env.observation_space.shape) == 3
    if car_racing_env:
        extra_mem_fields.extend(['aux_state', 'aux_next_state'])
    if is_img_state:
        img_t = img_transform(imgnet_means, imgnet_stds)
    if intervention_device is not None:
        intervener = Intervener(intervention_device, env.spec.id)
        extra_mem_fields.append('expert_mask')
    memory = Memory(extra_mem_fields)

    while num_steps < min_batch_size:
        state = env.reset()
        if car_racing_env:
            aux_state = np.array([np.linalg.norm(env.car.hull.linearVelocity)])
        if running_state is not None:
            state = running_state(state)
        if aux_state is not None and aux_running_state is not None:
            aux_state = aux_running_state(aux_state)
        reward_episode = 0

        for t in range(10000):
            if is_img_state:
                state_var = img_t(state).unsqueeze(0)
            else:
                state_var = tensor(state).unsqueeze(0)
            if aux_state is not None:
                aux_state_var = tensor(aux_state).view(1, -1).to(dtype)

            with torch.no_grad():
                if mean_action:
                    if aux_state is not None:
                        action = policy(state_var, aux_state_var)[0][0].numpy()
                    else:
                        action = policy(state_var)[0][0].numpy()
                else:
                    if aux_state is not None:
                        action = policy.select_action(state_var, aux_state_var)[0].numpy()
                    else:
                        action = policy.select_action(state_var)[0].numpy()

            if intervention_device is not None:
                intervene_action = intervener.get_action()
                if np.any(intervene_action):
                    action = intervene_action
                    expert_action = 1
                    time.sleep(intervener.step_delay)
                else:
                    expert_action = 0
                    # time.sleep(intervener.step_delay)

            action = int(action) if policy.is_disc_action else action.astype(np.float64)
            next_state, reward, done, _ = env.step(action)
            if car_racing_env:
                next_aux_state = np.array(
                    [np.linalg.norm(env.car.hull.linearVelocity)])
            reward_episode += reward
            if running_state is not None:
                next_state = running_state(next_state)
            if next_aux_state is not None and aux_running_state is not None:
                next_aux_state = aux_running_state(next_aux_state)

            if custom_reward is not None:
                if is_img_state:
                    reward = custom_reward(state, action, aux_state)
                else:
                    reward = custom_reward(state, action)
                total_c_reward += reward
                min_c_reward = min(min_c_reward, reward)
                max_c_reward = max(max_c_reward, reward)

            # TODO remove this, temporary for faster testing
            if t > 100:
                done = True

            mask = 0 if done else 1

            if is_img_state:
                mem_state = state_var.squeeze().numpy()
                mem_next_state = img_t(next_state).numpy()
            else:
                mem_state = state
                mem_next_state = next_state
            mem_list = [mem_state, action, mask, mem_next_state, reward]
            if aux_state is not None:
                mem_list.extend([aux_state, next_aux_state])
            if intervention_device is not None:
                mem_list.append(expert_action)
            memory.push(*mem_list)

            if render:
                env.render()
            if done:
                break
            state = next_state
            if aux_state is not None:
                aux_state = next_aux_state

        # log stats
        num_steps += (t + 1)
        num_episodes += 1
        total_reward += reward_episode
        min_reward = min(min_reward, reward_episode)
        max_reward = max(max_reward, reward_episode)

    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = total_reward / num_episodes
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    if custom_reward is not None:
        log['total_c_reward'] = total_c_reward
        log['avg_c_reward'] = total_c_reward / num_episodes
        log['max_c_reward'] = max_c_reward
        log['min_c_reward'] = min_c_reward

    if queue is not None:
        queue.put([pid, memory, log])
    else:
        return memory, log
def collect_samples(pid, queue, env, policy_mgr, policy_wrk, custom_reward,
                    mean_action, render, running_state, min_batch_size):
    torch.randn(pid)
    log = dict()
    memory_mgr = Memory()
    memory_wrk = Memory()
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0
    avg_wrk_reward = 0
    avg_mgr_reward = 0
    mgr_steps = 0
    done_count = 0

    state, curr_pos = env.reset()
    while num_steps < min_batch_size:
        if running_state is not None:
            state = running_state(state)
        reward_episode = 0

        # Manager
        state_mgr = tensor(state).unsqueeze(0)
        with torch.no_grad():
            direction = policy_mgr.select_action(state_mgr)[0]
        direction = int(direction.detach().numpy())
        subgoal = get_target(curr_pos, direction)

        # Worker
        state_wrk = tensor(np.concatenate((state, subgoal)))
        for t in range(10000):
            # Sample Action
            with torch.no_grad():
                if mean_action:
                    # Mean (deterministic) action from the worker policy.
                    action = policy_wrk(state_wrk.unsqueeze(0))[0][0].numpy()
                else:
                    action = policy_wrk.select_action(state_wrk.unsqueeze(0))[0].numpy()

            # Take Action
            next_state, reward, done, info = env.step(action)

            # Sparse Rewards
            dist = np.linalg.norm(info['fingertip'] - info['target'])
            reward = -1 if (dist > 0.05) else 0

            next_state_wrk = np.concatenate((next_state, subgoal))
            reward_episode += reward
            if running_state is not None:
                next_state = running_state(next_state)

            if custom_reward is not None:
                reward = custom_reward(state, action)
                total_c_reward += reward
                min_c_reward = min(min_c_reward, reward)
                max_c_reward = max(max_c_reward, reward)

            mask_mgr = 0 if done else 1

            # Intrinsic Reward and Subgoal Reached Definition
            reward_wrk = -np.linalg.norm(subgoal - info['fingertip']) + info['reward_ctrl']
            subgoal_reached = (-reward_wrk < 0.05)
            mask_wrk = 0 if (done or subgoal_reached) else 1

            # Collect Rollout
            memory_wrk.push(state_wrk.detach().numpy(), action, mask_wrk,
                            next_state_wrk, reward_wrk)
            avg_wrk_reward += reward_wrk

            if render:
                env.render()
            if (done or subgoal_reached):
                break
            state_wrk = tensor(next_state_wrk)

        # Manager Rollout
        next_state_mgr = next_state
        reward_mgr = reward_episode / 50.0
        memory_mgr.push(state, direction, mask_mgr, next_state_mgr, reward_mgr)
        state = next_state
        avg_mgr_reward += reward_mgr
        mgr_steps += 1

        # log stats
        num_steps += (t + 1)
        if (done):
            num_episodes += 1
            min_reward = min(min_reward, reward_episode)
            max_reward = max(max_reward, reward_episode)
            state, curr_pos = env.reset()
            total_reward += reward_episode
        else:
            curr_pos = info['fingertip']

    log['num_steps'] = num_steps
    log['mgr_steps'] = mgr_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = total_reward / (num_episodes)
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    log['mgr_reward'] = avg_mgr_reward / mgr_steps
    log['wrk_reward'] = avg_wrk_reward / num_steps
    if custom_reward is not None:
        log['total_c_reward'] = total_c_reward
        log['avg_c_reward'] = total_c_reward / num_steps
        log['max_c_reward'] = max_c_reward
        log['min_c_reward'] = min_c_reward

    if queue is not None:
        # Send both buffers; assumes the consumer unpacks (pid, memory_mgr, memory_wrk, log).
        queue.put([pid, memory_mgr, memory_wrk, log])
    else:
        return memory_mgr, memory_wrk, log
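# Both hierarchical variants (above, and the goal-env one further below) rely on
# get_target(curr_pos, direction) to map the manager's discrete direction to a
# nearby spatial subgoal. A minimal sketch under the assumption of a fixed step
# size along a small set of axis-aligned directions; the real mapping is
# environment-specific and is not shown in this file.
import numpy as np


def get_target(curr_pos, direction, step_size=0.05):
    # Unit offsets for a small discrete set of directions in 3-D space.
    offsets = np.array([
        [1, 0, 0], [-1, 0, 0],
        [0, 1, 0], [0, -1, 0],
        [0, 0, 1], [0, 0, -1],
    ], dtype=np.float64)
    curr_pos = np.asarray(curr_pos, dtype=np.float64)
    return curr_pos + step_size * offsets[int(direction) % len(offsets)]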
def collect_samples(pid, obs_shape_n, act_shape_n, queue, env, policy, custom_reward, mean_action, tensor, render, running_state, update_rs, min_batch_size, g_itr): n_agents = len(policy) torch.randn(pid, ) log = dict() memory = Memory() num_steps = 0 total_reward = 0 min_reward = 1e6 max_reward = -1e6 total_c_reward = 0 min_c_reward = 1e6 max_c_reward = -1e6 num_episodes = 0 # EPS_MAX = 0.9995 # eps_val = EPS_MAX**float(g_itr) # if eps_val < 0.1: # eps_val = 0.1 # while num_steps < min_batch_size: while num_steps < min_batch_size: state = env.reset() # print(state) # if running_state is not None: # state = running_state(state, update=update_rs) reward_episode = 0 for t in range(10000): num_steps += 1 action = [] rewards = [] state_var = Variable(tensor(state).unsqueeze(0), volatile=True) if mean_action: # never arrived action = policy(state_var)[0].data[0].numpy() else: for i in range(n_agents): # action = policy[i].select_ma_action(state_var, n_agents)[0].numpy() action.append(policy[i].select_action( state_var[:, i, :])[0].numpy()[0]) # freeze # action[0] = 0 # action[1] = 0 # action[2] = 0 # action[0] = 0 # eps = np.random.randn(action.size)*eps_val # action = action + eps # np.clip(action, -1., 1.) # print(action) # action = int(action) if policy.is_disc_action else action.astype(np.float64) one_hot_actions = [] for i in range(n_agents): one_hot_actions.append( index_to_one_hot(action[i], act_shape_n[i])) # print(one_hot_actions) next_state, reward, done, _ = env.step(one_hot_actions) # Added for shaped reward by haiyinpiao. # for punishing the bipedwalker from stucking in where it is originally. # if (next_state[2]<0.2): # reward -=2 # ------------------------- # print(reward) reward_episode += np.mean(reward[3]) # if running_state is not None: # next_state = running_state(next_state, update=update_rs) # if custom_reward is not None: # reward = custom_reward(state, action) # total_c_reward += reward # min_c_reward = min(min_c_reward, reward) # max_c_reward = max(max_c_reward, reward) # mask = 0 if done[0] else 1 mask = done memory.push(state, action, mask, next_state, reward) if render: env.render() # time.sleep(0.1) # done[3] indicates if the good agents caught if done[3] or num_steps >= min_batch_size: break # if done[0]: # break state = next_state # log stats # num_steps += (t + 1) num_episodes += 1 total_reward += reward_episode min_reward = min(min_reward, reward_episode) max_reward = max(max_reward, reward_episode) # print(pid,"collected!") log['num_steps'] = num_steps log['num_episodes'] = num_episodes log['total_reward'] = total_reward log['avg_reward'] = total_reward / num_episodes log['max_reward'] = max_reward log['min_reward'] = min_reward if custom_reward is not None: log['total_c_reward'] = total_c_reward log['avg_c_reward'] = total_c_reward / num_steps log['max_c_reward'] = max_c_reward log['min_c_reward'] = min_c_reward if queue is not None: queue.put([pid, memory, log]) else: return memory, log
def collect_samples(pid, queue, env, policy, custom_reward, mean_action, tensor,
                    render, running_state, update_rs, min_batch_size, logger,
                    position_vector, log_flag=False):
    torch.randn(pid)
    log = dict()
    memory = Memory()
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0

    while num_steps < min_batch_size:
        obs = env.reset()
        # TODO: make position_vector a global setting
        CPG_controller = CPG_network(position_vector)
        obs1, obs2, rewards, dones, actions = [], [], [], [], []
        # TODO: conversion function from raw observations to NN input
        state = obs2state(obs)
        obs2.append(state.reshape((1, -1)))  # for storage
        obs1.append(obs.reshape((1, -1)))  # for storage
        if running_state is not None:
            state = running_state(state, update=update_rs)
        reward_episode = 0
        reward_period = 0

        for t in range(10000):
            state_var = Variable(tensor(state).unsqueeze(0), volatile=True)
            if t % 1 == 0:
                if mean_action:
                    action = policy(state_var)[0].data[0].numpy()
                else:
                    action = policy.select_action(state_var)[0].numpy()
                rl_action = int(action) if policy.is_disc_action else action.astype(np.float64)
                # if t % 100 == 0:
                #     print('rl = ', rl_action)
                # rl_action = np.zeros(13)
                # rl_action = np.array([1, 0])
                rl_action = np.clip(rl_action, 0, 1)
            action = CPG_transfer(rl_action, CPG_controller, obs, t)
            next_state, reward, done, _ = env.step(action)
            obs = next_state  # transfer
            obs1.append(next_state.reshape((1, -1)))  # for storage
            next_state = obs2state(next_state)
            obs2.append(next_state.reshape((1, -1)))  # for storage
            actions.append(rl_action.reshape((1, -1)))
            reward_episode += reward
            if running_state is not None:
                next_state = running_state(next_state, update=update_rs)

            if custom_reward is not None:
                reward = custom_reward(state, action)
                total_c_reward += reward
                min_c_reward = min(min_c_reward, reward)
                max_c_reward = max(max_c_reward, reward)

            rewards.append(reward)  # for storage
            dones.append(done)  # for storage
            mask = 0 if done else 1
            memory.push(state, rl_action, mask, next_state, reward)
            if render:
                env.render()
            if done:
                break
            state = next_state

        # log stats
        num_steps += (t + 1)
        num_episodes += 1
        total_reward += reward_episode
        min_reward = min(min_reward, reward_episode)
        max_reward = max(max_reward, reward_episode)

        # log sample data, just for debug
        if log_flag:
            rewards = np.array(rewards, dtype=np.float64)
            dones = np.array(dones, dtype=np.float64)
            tmp = np.vstack((rewards, dones))
            # states_x, states_y,
            tmp1 = np.transpose(tmp)
            actions = np.concatenate(actions)
            obs1 = np.concatenate(obs1[:-1])
            obs2 = np.concatenate(obs2[:-1])
            data = np.concatenate((obs1, obs2, actions, tmp1), axis=1)
            trajectory = {}
            for j in range(data.shape[0]):
                for i in range(data.shape[1]):
                    trajectory[i] = data[j][i]
                logger.log(trajectory)
                logger.write()

    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = total_reward / num_episodes
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    if custom_reward is not None:
        log['total_c_reward'] = total_c_reward
        log['avg_c_reward'] = total_c_reward / num_steps
        log['max_c_reward'] = max_c_reward
        log['min_c_reward'] = min_c_reward

    if queue is not None:
        queue.put([pid, memory, log])
    else:
        return memory, log
def collect_samples(pid, queue, env, p_nets, custom_reward, mean_action, render,
                    running_state, min_batch_size, rsi_mem_prev=None):
    torch.randn(pid)
    log = dict()
    memory = Memory()
    team_reward = 0.0
    if args.dec_agents is True:
        reward_episodes = [0.0] * env.n_agents
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0

    while num_steps < min_batch_size:
        if args.rsi is True and rsi_mem_prev is not None:
            # randomized starting point.
            sp = rsi_mem_prev.rsi_state
            rs = random.sample(sp, 1)
            rs = rs[0]
            state = env.rsi_reset({0: rs[0], 1: rs[1], 2: rs[2], 3: rs[3]})
        else:
            state = env.reset()
        if running_state is not None:
            state = running_state(state)
        team_reward = 0
        if args.dec_agents is True:
            reward_episodes = [0.0] * env.n_agents

        for t in range(10000):
            state_var = tensor(state).unsqueeze(0)
            action = []
            with torch.no_grad():
                if mean_action:
                    action = policy(state_var)[0][0].numpy()
                else:
                    for i in range(env.n_agents):
                        action += p_nets[i].select_action(state_var)
                        if args.dec_agents is False:
                            break
            next_state, reward, done, _ = env.step(action)
            team_reward += sum(reward)
            if args.dec_agents is True:
                reward_episodes = [i + j for i, j in zip(reward_episodes, reward)]
            if running_state is not None:
                next_state = running_state(next_state)

            if custom_reward is not None:
                reward = custom_reward(state, action)
                total_c_reward += reward
                min_c_reward = min(min_c_reward, reward)
                max_c_reward = max(max_c_reward, reward)

            if args.dec_agents is False:
                mask = 0 if all(done) else 1
            else:
                mask = [bool(1 - e) for e in done]

            if args.rsi is True:
                memory.push(state, action, mask, next_state, reward, env.agent_pos)
            else:
                memory.push(state, action, mask, next_state, reward)

            if render:
                env.render()
            if all(done):
                break
            state = next_state

        # log stats
        num_steps += (t + 1)
        num_episodes += 1
        total_reward += team_reward
        min_reward = min(min_reward, team_reward)
        max_reward = max(max_reward, team_reward)

    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = total_reward / num_episodes
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    if custom_reward is not None:
        log['total_c_reward'] = total_c_reward
        log['avg_c_reward'] = total_c_reward / num_steps
        log['max_c_reward'] = max_c_reward
        log['min_c_reward'] = min_c_reward

    if queue is not None:
        queue.put([pid, memory, log])
    else:
        return memory, log
def collect_samples(pid, queue, env, policy, custom_reward, mean_action, tensor,
                    render, running_state, update_rs, min_batch_size, seed,
                    thread_id, early_stopping=False):
    torch.randn(pid)
    log = dict()
    if early_stopping:
        training = Memory()
        validation = Memory()
        memory = Memory()
    else:
        memory = Memory()
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0
    episode_rewards = []

    while num_steps < min_batch_size:
        # env.seed(seed + thread_id)
        state = env.reset()
        if running_state is not None:
            state = running_state(state, update=update_rs)
        reward_episode = 0

        # for t in range(min_batch_size - num_steps):
        for t in range(10000):
            state_var = Variable(tensor(state).unsqueeze(0), volatile=True)
            if mean_action:
                action = policy(state_var)[0].data[0].numpy()
            else:
                action = policy.select_action(state_var)[0].numpy()
            action = int(action) if policy.is_disc_action else action.astype(np.float64)
            # step the env with the action and get reward and next_state
            next_state, reward, done, _ = env.step(action)
            # reward sum of this episode
            reward_episode += reward
            if running_state is not None:
                next_state = running_state(next_state, update=update_rs)

            if custom_reward is not None:
                reward = custom_reward(state, action)
                total_c_reward += reward
                min_c_reward = min(min_c_reward, reward)
                max_c_reward = max(max_c_reward, reward)

            mask = 0 if done else 1
            if early_stopping:
                ra = random.random()
                if ra > 0.8 and len(validation) <= min_batch_size * 0.1:
                    validation.push(state, action, mask, next_state, reward)
                else:
                    training.push(state, action, mask, next_state, reward)
            else:
                memory.push(state, action, mask, next_state, reward)

            if render:
                env.render()
            if done:
                break
            state = next_state

        # log stats
        num_steps += (t + 1)
        # num_episode = num_trajectories
        num_episodes += 1
        total_reward += reward_episode
        episode_rewards.append(reward_episode)
        min_reward = min(min_reward, reward_episode)
        max_reward = max(max_reward, reward_episode)

    log['num_steps'] = min_batch_size
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['episode_rewards'] = episode_rewards
    log['avg_reward'] = total_reward / num_episodes
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    if custom_reward is not None:
        log['total_c_reward'] = total_c_reward
        log['avg_c_reward'] = total_c_reward / num_steps
        log['max_c_reward'] = max_c_reward
        log['min_c_reward'] = min_c_reward

    if queue is not None:
        queue.put([pid, memory, log])
    else:
        if early_stopping:
            return memory, training, validation, log
        else:
            return memory, log
def collect_samples(pid, queue, env, policy, custom_reward, mean_action, render,
                    running_state, min_batch_size):
    # def cat_s_a(s: torch.tensor, a: int):
    #     batch_size = 1
    #     label = torch.LongTensor([[a]])
    #     a = torch.zeros(batch_size, env.action_space.n).scatter_(1, label, 1)
    #     return torch.cat((s, a), 1)

    def cat_s_a_np(s: np.ndarray, a: int):
        batch_size = 1
        # label = np.array([[a]])
        oh = np.zeros((batch_size, env.action_space.n))
        oh[0, a] = 1
        return np.append(s, oh)

    torch.randn(pid)
    log = dict()
    memory = Memory()
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0

    while num_steps < min_batch_size:
        state = env.reset()
        if running_state is not None:
            state = running_state(state)
        reward_episode = 0
        # repeat = 0
        # repeat_len = 0
        stop = True
        # batch_size = 1
        # label = torch.LongTensor(batch_size, 1).random_() % env.action_space.n
        # last_action = torch.zeros(batch_size, env.action_space.n).scatter_(1, label, 1)
        last_action = 0
        # ready_to_push = False
        # reward_period = 0
        state = cat_s_a_np(state, last_action)
        # interval = 0

        for t in range(10000):
            state_var = tensor(state).unsqueeze(0)
            # state_var = torch.cat((state_var, last_action), 1)
            # state_var = cat_s_a(state_var, 1)

            # Learn to stop, else maintain last action
            with torch.no_grad():
                if mean_action:
                    action = policy(state_var)[0][0].numpy()
                else:
                    # action, repeat = policy.select_action(state_var)[0].numpy()
                    action, stop = policy.select_action(state_var)
                    action = action[0].numpy()
                    stop = stop[0].numpy()
            action = int(action) if policy.is_disc_action else action.astype(np.float64)
            stop = int(stop)

            # action only updated when necessary.
            if t == 0 or stop == 1:
                last_action = action
                # ready_to_push = True
                # repeat += 1
            assert (last_action is not None)

            next_state, reward, done, _ = env.step(last_action)
            next_state = cat_s_a_np(next_state, last_action)
            reward_episode += reward
            # reward_period += reward * (args.gamma**(repeat - 1))
            if running_state is not None:
                next_state = running_state(next_state)

            if custom_reward is not None:
                reward = custom_reward(state, action)
                total_c_reward += reward
                min_c_reward = min(min_c_reward, reward)
                max_c_reward = max(max_c_reward, reward)

            mask = 0 if done else 1
            memory.push(state, last_action, mask, next_state, reward, stop)
            # if ready_to_push == True or done:
            #     memory.push(state, last_action, mask, next_state, reward_period, stop, repeat)
            #     ready_to_push = False
            #     repeat = 0
            #     reward_period = 0

            if render:
                env.render()
            if done:
                break
            state = next_state

        # log stats
        num_steps += (t + 1)
        num_episodes += 1
        total_reward += reward_episode
        min_reward = min(min_reward, reward_episode)
        max_reward = max(max_reward, reward_episode)

    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = total_reward / num_episodes
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    if custom_reward is not None:
        log['total_c_reward'] = total_c_reward
        log['avg_c_reward'] = total_c_reward / num_steps
        log['max_c_reward'] = max_c_reward
        log['min_c_reward'] = min_c_reward

    if queue is not None:
        queue.put([pid, memory, log])
    else:
        return memory, log
def collect_samples(pid, queue, env, policy, custom_reward, mean_action, render, running_state, min_batch_size, state_only=False, opponent_policy=None, alpha=None, reward_type=None): torch.randn(pid) log = dict() if opponent_policy is None: memory = Memory() else: memory = TwoPlayerMemory() num_steps = 0 total_reward = 0 min_reward = 1e6 max_reward = -1e6 total_c_reward = 0 min_c_reward = 1e6 max_c_reward = -1e6 num_episodes = 0 while num_steps < min_batch_size: state = env.reset() if running_state is not None: state = running_state(state) reward_episode = 0 for t in range(100000): #range(10000): state_var = tensor(state).unsqueeze(0) with torch.no_grad(): if mean_action: action = policy(state_var)[0][0].numpy() else: if opponent_policy is not None: opponent_plays = np.random.choice(2, p=[alpha, 1 - alpha]) opponent_action = opponent_policy.select_action( state_var)[0].numpy() player_action = policy.select_action( state_var)[0].numpy() if opponent_plays: action = copy.deepcopy(opponent_action) else: action = copy.deepcopy(player_action) player_action = int( player_action ) if policy.is_disc_action else player_action.astype( np.float64) opponent_action = int( opponent_action ) if policy.is_disc_action else opponent_action.astype( np.float64) """if np.isnan(player_action).any(): print("Player Nan") player_action = np.zeros_like(player_action) if np.isnan(opponent_action).any(): print("Opponent Nan") opponent_action = np.zeros_like(opponent_action) action = (1 - alpha)*opponent_action.clip(-1.0, 1.0) + alpha*player_action.clip(-1.0, 1.0)""" else: action = policy.select_action(state_var)[0].numpy() action = int(action) if policy.is_disc_action else action.astype( np.float64) if not policy.is_disc_action: action_to_play = action.clip(-1.0, 1.0) next_state, reward, done, _ = env.step(action_to_play) else: next_state, reward, done, _ = env.step(action) reward_episode += reward if running_state is not None: next_state = running_state(next_state) if custom_reward is not None: if state_only: reward = custom_reward(state, next_state, reward_type) else: reward = custom_reward(state, action, reward_type) total_c_reward += reward min_c_reward = min(min_c_reward, reward) max_c_reward = max(max_c_reward, reward) mask = 0 if done else 1 if opponent_policy is not None: memory.push(state, player_action, opponent_action, action, mask, next_state, reward) else: memory.push(state, action, mask, next_state, reward) if render: env.render() if done: if opponent_policy is not None: memory.push(next_state, player_action, opponent_action, action, mask, next_state, reward) else: memory.push(next_state, action, mask, next_state, reward) break state = next_state # log stats num_steps += (t + 1) num_episodes += 1 total_reward += reward_episode min_reward = min(min_reward, reward_episode) max_reward = max(max_reward, reward_episode) log['num_steps'] = num_steps log['num_episodes'] = num_episodes log['total_reward'] = total_reward log['avg_reward'] = total_reward / num_episodes log['max_reward'] = max_reward log['min_reward'] = min_reward if custom_reward is not None: log['total_c_reward'] = total_c_reward log['avg_c_reward'] = total_c_reward / num_steps log['max_c_reward'] = max_c_reward log['min_c_reward'] = min_c_reward if queue is not None: queue.put([pid, memory, log]) else: return memory, log
def collect_samples(pid, queue, env, policy, custom_reward, mean_action, render,
                    running_state, min_batch_size, horizon):
    torch.randn(pid)
    log = dict()
    memory = Memory()
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0

    while num_steps < min_batch_size:
        state = env.reset()
        if running_state is not None:
            state = running_state(state)
        reward_episode = 0

        for t in range(0, horizon):
            state_var = tensor(state).unsqueeze(0)
            with torch.no_grad():
                if mean_action:
                    action = policy(state_var)[0][0].numpy()
                else:
                    action = policy.select_action(state_var)[0].numpy()
            action = int(action) if policy.is_discrete_action else action.astype(np.float64)
            next_state, reward, done, _ = env.step(action)
            reward_episode += reward
            if running_state is not None:
                next_state = running_state(next_state)

            if custom_reward is not None:
                reward = custom_reward(state, action)
                total_c_reward += reward
                min_c_reward = min(min_c_reward, reward)
                max_c_reward = max(max_c_reward, reward)

            mask = 0 if done else 1
            memory.push(state, action, mask, next_state, reward)
            if render:
                env.render()
            if done:
                break
            state = next_state

        num_steps += (t + 1)
        num_episodes += 1
        total_reward += reward_episode
        min_reward = min(min_reward, reward_episode)
        max_reward = max(max_reward, reward_episode)

    log["num_steps"] = num_steps
    log["num_episodes"] = num_episodes
    log["total_reward"] = total_reward
    log["avg_reward"] = total_reward / num_episodes
    log["max_reward"] = max_reward
    log["min_reward"] = min_reward
    if custom_reward is not None:
        log["total_c_reward"] = total_c_reward
        log["avg_c_reward"] = total_c_reward / num_steps
        log["max_c_reward"] = max_c_reward
        log["min_c_reward"] = min_c_reward

    if queue is not None:
        queue.put([pid, memory, log])
    else:
        return memory, log
def collect_samples(pid, queue, env, policy_mgr, policy_wrk, custom_reward,
                    mean_action, render, running_state, min_batch_size):
    torch.randn(pid)
    log = dict()
    memory_mgr = Memory()
    memory_wrk = Memory()
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0
    avg_wrk_reward = 0
    avg_mgr_reward = 0
    mgr_steps = 0
    done_count = 0

    state = env.reset()
    while num_steps < min_batch_size:
        # state_wrk = tensor(state['observation'])
        # state = np.concatenate((state['observation'], state['desired_goal']))
        if running_state is not None:
            state = running_state(state)
        reward_episode = 0

        state_mgr = tensor(np.concatenate(
            (state['observation'], state['desired_goal']))).unsqueeze(0)
        with torch.no_grad():
            direction = policy_mgr.select_action(state_mgr)[0]
        direction = int(direction.detach().numpy())
        curr_pos = state['achieved_goal']
        subgoal = get_target(curr_pos, direction)

        state_wrk = tensor(np.concatenate(
            (state['observation'], state['desired_goal'], subgoal)))
        for t in range(10000):
            with torch.no_grad():
                if mean_action:
                    # Mean (deterministic) action from the worker policy.
                    action = policy_wrk(state_wrk.unsqueeze(0))[0][0].numpy()
                else:
                    action = policy_wrk.select_action(state_wrk.unsqueeze(0))[0].numpy()

            next_state, reward, done, info = env.step(action)
            # dist = np.linalg.norm(info['fingertip'] - info['target'])
            next_state_wrk = np.concatenate(
                (next_state['observation'], next_state['desired_goal'], subgoal))
            reward_episode += reward
            if running_state is not None:
                next_state = running_state(next_state)

            if custom_reward is not None:
                reward = custom_reward(state, action)
                total_c_reward += reward
                min_c_reward = min(min_c_reward, reward)
                max_c_reward = max(max_c_reward, reward)

            mask_mgr = 0 if done else 1

            reward_wrk = -np.linalg.norm(subgoal - next_state['achieved_goal'])
            # reward_wrk = reward
            subgoal_reached = (-reward_wrk < 0.05)
            mask_wrk = 0 if (done or subgoal_reached) else 1
            # mask_wrk = 0 if (done) else 1

            memory_wrk.push(state_wrk.detach().numpy(), action, mask_wrk,
                            next_state_wrk, reward_wrk)
            avg_wrk_reward += reward_wrk
            if render:
                env.render()
            if (done or subgoal_reached):
                # if (done):
                break
            state_wrk = tensor(next_state_wrk)

        # next_state_mgr = np.concatenate((next_state['observation'], next_state['desired_goal']))
        next_state_mgr = next_state['observation']
        # reward_mgr = reward_episode - 10 * np.linalg.norm(next_state['achieved_goal'] - next_state['desired_goal'])
        # reward_mgr = reward_episode / 50.0 - np.linalg.norm(subgoal - info['target'])
        reward_mgr = reward_episode / 50.0
        memory_mgr.push(np.concatenate((state['observation'], state['desired_goal'])),
                        direction, mask_mgr, next_state_mgr, reward_mgr)
        state = next_state
        avg_mgr_reward += reward_mgr
        mgr_steps += 1

        # log stats
        num_steps += (t + 1)
        if (done):
            num_episodes += 1
            min_reward = min(min_reward, reward_episode)
            max_reward = max(max_reward, reward_episode)
            state = env.reset()
            curr_pos = state['achieved_goal']
            total_reward += reward_episode
        else:
            curr_pos = state['achieved_goal']

    log['num_steps'] = num_steps
    log['mgr_steps'] = mgr_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = total_reward / (num_episodes)
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    log['mgr_reward'] = avg_mgr_reward / mgr_steps
    log['wrk_reward'] = avg_wrk_reward / num_steps
    if custom_reward is not None:
        log['total_c_reward'] = total_c_reward
        log['avg_c_reward'] = total_c_reward / num_steps
        log['max_c_reward'] = max_c_reward
        log['min_c_reward'] = min_c_reward

    if queue is not None:
        # Send both buffers; assumes the consumer unpacks (pid, memory_mgr, memory_wrk, log).
        queue.put([pid, memory_mgr, memory_wrk, log])
    else:
        return memory_mgr, memory_wrk, log
def collect_samples(pid, queue, env, policy, custom_reward, mean_action, render, running_state, min_batch_size, randomise=False): torch.randn(pid) log = dict() memory = Memory() num_steps = 0 total_reward = 0 min_reward = 1e6 max_reward = -1e6 total_c_reward = 0 min_c_reward = 1e6 max_c_reward = -1e6 num_episodes = 0 best_index = 0 worst_index = 0 episodic_rewards = [] while num_steps < min_batch_size: state = env.reset() if running_state is not None: state = running_state(state) reward_episode = 0 for t in range(2048): state_var = tensor(state).unsqueeze(0) with torch.no_grad(): if mean_action: action = policy(state_var)[0][0].numpy() else: action = policy.select_action(state_var)[0].numpy() action = int(action) if policy.is_disc_action else action.astype( np.float64) if randomise: action = env.action_space.sample() next_state, reward, done, _ = env.step(action) reward_episode += reward if running_state is not None: next_state = running_state(next_state) if custom_reward is not None: reward = custom_reward(state, action) total_c_reward += reward min_c_reward = min(min_c_reward, reward) max_c_reward = max(max_c_reward, reward) if done: mask = 0 else: mask = 1 memory.push(state, action, mask, next_state, reward) if render: env.render() if done: break state = next_state # log stats num_steps += (t + 1) num_episodes += 1 total_reward += reward_episode episodic_rewards.append(reward_episode) min_reward = min(min_reward, reward_episode) max_reward = max(max_reward, reward_episode) log['num_steps'] = num_steps log['num_episodes'] = num_episodes log['total_reward'] = total_reward log['avg_reward'] = total_reward / num_episodes log['max_reward'] = max_reward log['min_reward'] = min_reward log['episodic_rewards'] = episodic_rewards if custom_reward is not None: log['total_c_reward'] = total_c_reward log['avg_c_reward'] = total_c_reward / num_steps log['max_c_reward'] = max_c_reward log['min_c_reward'] = min_c_reward if queue is not None: queue.put([pid, memory, log]) else: return memory, log
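# All of the samplers above follow the same contract: run standalone and return
# (memory, log), or run as a worker and put [pid, memory, log] on a queue.
# Below is a minimal driver sketch for the multi-process case. It assumes the
# nine-argument collect_samples signature used by several variants above, the
# Memory sketch near the top of this file (with its append() merge helper), and
# a user-supplied env_factory; all of these are assumptions, not the original
# driver utilities.
import math
import torch.multiprocessing as multiprocessing


def collect_samples_parallel(env_factory, policy, min_batch_size, num_threads=4):
    queue = multiprocessing.Queue()
    thread_batch_size = int(math.floor(min_batch_size / num_threads))

    workers = []
    for i in range(num_threads - 1):
        worker_args = (i + 1, queue, env_factory(), policy, None, False, False,
                       None, thread_batch_size)
        workers.append(multiprocessing.Process(target=collect_samples, args=worker_args))
    for worker in workers:
        worker.start()

    # Process 0 collects its own share in the main process.
    memory, log = collect_samples(0, None, env_factory(), policy, None, False,
                                  False, None, thread_batch_size)

    # Gather worker results and merge their buffers into the main memory.
    worker_memories = [None] * len(workers)
    worker_logs = [None] * len(workers)
    for _ in workers:
        pid, worker_memory, worker_log = queue.get()
        worker_memories[pid - 1] = worker_memory
        worker_logs[pid - 1] = worker_log
    for worker_memory in worker_memories:
        memory.append(worker_memory)
    for worker in workers:
        worker.join()
    return memory, [log] + worker_logs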