def train_td3():
    env = SubprocVecEnv([lambda: EnvHandler(make_env()) for _ in range(1)])
    learn(env, total_timesteps=1e6, nb_epochs=None, nb_rollout_steps=100,
          max_ep_len=250, reward_scale=1.0, render=False, render_eval=False,
          noise_type='adaptive-param_0.2', normalize_returns=False,
          normalize_observations=True, actor_lr=1e-4, critic_lr=1e-3,
          popart=False, gamma=0.99, clip_norm=None, start_steps=10000,
          nb_train_steps=50, nb_eval_steps=100, nb_log_steps=100,
          nb_save_steps=None, batch_size=64, polyak=0.01,
          action_range=(-250.0, 250.0), observation_range=(-5.0, 5.0),
          target_noise=0.2, noise_clip=0.5, policy_delay=2,
          load_path=None, save_dir=None)
def train_ddpg():
    env = SubprocVecEnv([lambda: EnvHandler(make_env()) for _ in range(1)])
    # env = SubprocVecEnv([lambda: EnvHandler(make_env(env_no=0)),
    #                      lambda: EnvHandler(make_env(env_no=1))])
    learn(env=env, seed=None, total_timesteps=1e5, nb_epochs=None,
          nb_epoch_cycles=10, nb_rollout_steps=100, reward_scale=1.0,
          render=False, render_eval=False, noise_type='ou-param_0.2',
          normalize_returns=False, normalize_observations=False,
          critic_l2_reg=1e-2, actor_lr=1e-4, critic_lr=1e-3, popart=False,
          gamma=0.99, clip_norm=None, nb_train_steps=50, nb_eval_steps=100,
          batch_size=64, tau=0.01, eval_env=None,
          param_noise_adaption_interval=50, nb_save_epochs=1, save_dir=".",
          load_path=None)
def reset_task(self, tasks, batch_id, reset_type='learning'):
    # regenerate new envs to avoid the engine stuck bug!
    dic_traffic_env_conf_list = []
    dic_path_list = []
    for task in tasks:
        dic_agent_conf = copy.deepcopy(self.dic_agent_conf)
        dic_agent_conf['TRAFFIC_FILE'] = task

        dic_traffic_env_conf = copy.deepcopy(self.task_traffic_env_map[task])
        dic_traffic_env_conf['TRAFFIC_FILE'] = task
        dic_traffic_env_conf_list.append(dic_traffic_env_conf)

        dic_path = copy.deepcopy(self.task_path_map[task])
        if reset_type == 'test':
            dic_path["PATH_TO_LOG"] = os.path.join(
                dic_path['PATH_TO_WORK_DIRECTORY'], reset_type + '_round',
                task, 'tasks_round_' + str(batch_id))
        else:
            dic_path["PATH_TO_LOG"] = os.path.join(
                dic_path['PATH_TO_WORK_DIRECTORY'], reset_type + '_round',
                'tasks_round_' + str(batch_id), task)
        dic_path_list.append(dic_path)
        if not os.path.exists(dic_path['PATH_TO_LOG']):
            os.makedirs(dic_path['PATH_TO_LOG'])

    self.envs = SubprocVecEnv(dic_path_list, dic_traffic_env_conf_list,
                              len(tasks), queue=self.queue)
def get_env(env_name, results_save_dir, seed, num_envs):
    """
    Initialize the OpenAI Gym environment.

    :param env_name: The name of the gym environment to use (e.g. 'Pong-v0').
    :param results_save_dir: Output directory for results.
    :param seed: The random seed.
    :param num_envs: The number of environments to run in parallel.
    :return: The initialized gym environment.
    """

    # Create the sub-environment factories to parallelize
    def make_sub_env_creator(env_num):
        """Returns a function that creates an environment."""

        def sub_env_creator():
            sub_env = make_atari(env_name)
            sub_env.seed(seed + env_num)

            if env_num == 0 and num_envs > 1:
                # Wrap the first env in the default gym monitor for video output.
                # Results will be transformed into baselines monitor style at the end of the run.
                sub_env = gym.wrappers.Monitor(sub_env, results_save_dir)
            else:
                # Wrap every other env in the baselines monitor for equivalent plotting.
                sub_env = Monitor(sub_env, join(results_save_dir, str(env_num)))

            sub_env = wrap_deepmind(sub_env, frame_stack=True, scale=True)
            return sub_env

        return sub_env_creator

    envs = [make_sub_env_creator(i) for i in range(num_envs)]
    return SubprocVecEnv(envs)
def make_vec_envs(env_name, seed, num_processes, gamma, log_dir, add_timestep,
                  device, allow_early_resets, num_frame_stack=None, args=None):
    envs = [make_env(env_name, seed, i, log_dir, add_timestep,
                     allow_early_resets, map_width=args.map_width,
                     render_gui=args.render, print_map=args.print_map,
                     noreward=args.no_reward, max_step=args.max_step,
                     simple_reward=args.simple_reward, args=args)
            for i in range(num_processes)]

    if len(envs) > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        if gamma is None:
            envs = VecNormalize(envs, ret=False)
        else:
            envs = VecNormalize(envs, gamma=gamma)

    envs = VecPyTorch(envs, device)

    if num_frame_stack is not None:
        print('stacking {} frames'.format(num_frame_stack))
        envs = VecPyTorchFrameStack(envs, num_frame_stack, device)
    elif len(envs.observation_space.shape) == 3:
        envs = VecPyTorchFrameStack(envs, 1, device)

    return envs
def make_eval_envs(config, how_train, seed, agents, training_agent_ids,
                   acting_agent_ids, num_stack, num_processes,
                   state_directory=None, state_directory_distribution=None):
    envs = [
        _make_eval_env(
            config=config,
            how_train=how_train,
            seed=seed,
            rank=rank,
            agents=agents,
            training_agent_ids=training_agent_ids,
            acting_agent_ids=acting_agent_ids,
            num_stack=num_stack,
            state_directory=state_directory,
            state_directory_distribution=state_directory_distribution)
        for rank in range(num_processes)
    ]
    return SubprocVecEnv(envs)
def train(env_id, num_timesteps, num_cpu):
    def make_env(rank):
        def _thunk():
            env = make_atari(env_id)
            env.seed(SEED + rank)
            gym.logger.setLevel(logging.WARN)
            env = wrap_deepmind(env)
            # wrap the env one more time for getting total reward
            env = Monitor(env, rank)
            return env
        return _thunk

    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    learn(CNN, env, SEED, total_timesteps=int(num_timesteps * 1.1))
    env.close()
def main(env_id, num_timesteps, seed, policy, nstack, nsteps, lrschedule,
         optimizer, num_cpu, model_file, use_static_wrapper,
         use_encoded_imagination, use_decoded_imagination):
    num_timesteps //= 4

    assert not (use_encoded_imagination and use_decoded_imagination)

    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            env.seed(seed + rank)
            if use_static_wrapper:
                env = StaticWrapper(env)
            if policy == 'cnn' or use_encoded_imagination:
                env = RenderWrapper(env, 400, 600)
                env = DownsampleWrapper(env, 4)
            if use_encoded_imagination or use_decoded_imagination:
                env = FrameStack(env, 3)
            if use_encoded_imagination:
                env = EncodedImaginationWrapper(env, model_file, num_cpu)
            if use_decoded_imagination:
                env = DecodedImaginationWrapper(env, model_file, num_cpu)
            gym.logger.setLevel(logging.WARN)
            return env
        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])

    if policy == 'fc':
        policy_fn = FcPolicy
    if policy == 'cnn':
        policy_fn = CnnPolicy
    learn(policy_fn, env, seed, nsteps=nsteps, nstack=nstack,
          total_timesteps=num_timesteps, lrschedule=lrschedule,
          optimizer=optimizer, max_episode_length=195)
    env.close()
def dynamics_data_gen(env_name='Reacher-v2', start_seed=0, timesteps=10,
                      n_parallel_envs=1, width=300, height=240):
    import gym  # import locally so that the caller can patch gym

    def make_env(seed):
        def _():
            env = gym.make(env_name)
            env.seed(seed)
            return env
        return _

    # Uncomment this to show the bug
    # from requests_futures.sessions import FuturesSession
    # session = FuturesSession()
    # session.get('http://www.google.com')

    from subproc_vec_env import SubprocVecEnv
    # from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv

    env = SubprocVecEnv(
        [make_env(s) for s in range(start_seed, start_seed + n_parallel_envs)])
    policy = RandPolicy(env.observation_space, env.action_space, env.num_envs)

    rollouts = []
    obs = env.reset()
    for i in range(timesteps):
        # fs = env.render("rgb", width=width, height=height)
        fs = env.render("rgb_array", width=width, height=height)
        acs = policy.act(obs)
        rollouts.append(dict(obs=obs, acs=acs, views=fs))
        obs, rewards, dones, infos = env.step(acs)

    import pandas as pd
    return {k: np.stack(v) for k, v in pd.DataFrame(rollouts).items()}
class BatchSampler(object):
    def __init__(self, env_name, batch_size, num_workers=mp.cpu_count() - 1):
        self.env_name = env_name
        self.batch_size = batch_size
        self.num_workers = num_workers

        self.queue = mp.Queue()
        self.envs = SubprocVecEnv(
            [make_env(env_name) for _ in range(num_workers)], queue=self.queue)
        self._env = gym.make(env_name)

    def sample(self, policy, params=None, gamma=0.95):
        episodes = BatchEpisodes(batch_size=self.batch_size, gamma=gamma)
        # Enqueue one id per episode, then one None sentinel per worker.
        for i in range(self.batch_size):
            self.queue.put(i)
        for _ in range(self.num_workers):
            self.queue.put(None)

        observations, batch_ids = self.envs.reset()
        dones = [False]
        while (not all(dones)) or (not self.queue.empty()):
            observations_tensor = observations
            actions_tensor = policy(observations_tensor, params=params).sample()
            with tf.device('/CPU:0'):
                actions = actions_tensor.numpy()
            new_observations, rewards, dones, new_batch_ids, _ = self.envs.step(actions)
            episodes.append(observations, actions, rewards, batch_ids)
            observations, batch_ids = new_observations, new_batch_ids
        return episodes

    def reset_task(self, task):
        tasks = [task for _ in range(self.num_workers)]
        reset = self.envs.reset_task(tasks)
        return all(reset)

    def sample_tasks(self, num_tasks):
        tasks = self._env.unwrapped.sample_tasks(num_tasks)
        return tasks
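# A possible usage sketch for the sampler above. This is illustrative only:
# make_env, BatchEpisodes, and the policy object are defined elsewhere in that
# codebase, and the env name, batch size, and worker count here are made up.
sampler = BatchSampler('HalfCheetahDir-v1', batch_size=20, num_workers=4)
tasks = sampler.sample_tasks(num_tasks=10)         # tasks come from the underlying env
for task in tasks:
    sampler.reset_task(task)                       # propagate the task to every worker
    episodes = sampler.sample(policy, gamma=0.95)  # one BatchEpisodes of 20 rollouts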
def make_atari_env(env_id, num_env, seed, wrapper_kwargs=None, start_index=0):
    """
    Create a wrapped, monitored SubprocVecEnv for Atari.
    """
    if wrapper_kwargs is None:
        wrapper_kwargs = {}

    def make_env(rank):  # pylint: disable=C0111
        def _thunk():
            env = make_atari(env_id)
            env.seed(seed + rank)
            env = Monitor(env, logger.get_dir() and
                          os.path.join(logger.get_dir(), str(rank)))
            return wrap_deepmind(env, **wrapper_kwargs)
        return _thunk

    set_global_seeds(seed)
    return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)])
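# A minimal usage sketch for the helper above, assuming baselines and an Atari
# ROM are available; the env id, number of envs, and step count are illustrative.
if __name__ == '__main__':
    venv = make_atari_env('BreakoutNoFrameskip-v4', num_env=4, seed=0)
    obs = venv.reset()  # batched observations, one per worker process
    for _ in range(10):
        # SubprocVecEnv.step takes one action per sub-environment
        actions = [venv.action_space.sample() for _ in range(venv.num_envs)]
        obs, rewards, dones, infos = venv.step(actions)
    venv.close()  # shut down the worker processes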
def train(env_id, num_timesteps, seed, num_env, gamma=0.99, ent_coef=0.01,
          nepochs=4, lr=2.5e-4, next_n=10, seq_len=10, nslupdates=10, K=1):
    ncpu = multiprocessing.cpu_count()
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True
    tf.Session(config=config).__enter__()

    def make_env(rank):
        def _thunk():
            import maze
            env = maze.MazeEnv(config=open('config/' + env_id + '.xml'))
            return env
        return _thunk

    env = SubprocVecEnv([make_env(i) for i in range(num_env)])

    from ppo_diverse import learn
    policy = LocPolicy
    learn(policy=policy, env=env, nsteps=128, nminibatches=4, lam=0.95,
          gamma=gamma, noptepochs=nepochs, ent_coef=ent_coef, lr=lr,
          cliprange=0.1, total_timesteps=int(num_timesteps * 1.1),
          next_n=next_n, seq_len=seq_len, nslupdates=nslupdates, K=K,
          seed=seed)
def train_ppo():
    env = SubprocVecEnv([lambda: EnvHandler(make_env())])
    learn(env=env, eval_env=None, total_timesteps=3e7, nsteps=128,
          nminibatches=1, cliprange=0.2, ent_coef=0.01, vf_coef=0.5, lam=0.95,
          gamma=0.99, noptepochs=4, lr=2.5e-4, save_interval=100,
          save_dir=".", load_path=None, normalize_observations=False,
          normalize_returns=False)
def make_atari_env(env_id, num_env, seed, wrapper_kwargs=None, start_index=0):
    """
    Create a wrapped, monitored SubprocVecEnv for Atari.
    """
    if wrapper_kwargs is None:
        wrapper_kwargs = {}

    def make_env(rank):  # pylint: disable=C0111
        def _thunk():
            env = retro.make(
                env_id, use_restricted_actions=retro.ACTIONS_MULTI_DISCRETE)
            env.seed(seed + rank)
            return Monitor(env, logger.get_dir() and
                           os.path.join(logger.get_dir(), str(rank)))
        return _thunk

    set_global_seeds(seed)
    return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)])
def make_train_envs(config, how_train, seed, game_state_file, training_agents,
                    num_stack, num_processes, do_filter_team=True,
                    state_directory=None, state_directory_distribution=None,
                    step_loss=None, bomb_reward=None, item_reward=None,
                    use_second_place=False, use_both_places=False,
                    frozen_agent=None, mix_frozen_complex=False,
                    florensa_starts_dir=None):
    envs = [
        _make_train_env(
            config=config,
            how_train=how_train,
            seed=seed,
            rank=rank,
            game_state_file=game_state_file,
            training_agents=training_agents,
            num_stack=num_stack,
            do_filter_team=do_filter_team,
            state_directory=state_directory,
            state_directory_distribution=state_directory_distribution,
            step_loss=step_loss,
            bomb_reward=bomb_reward,
            item_reward=item_reward,
            use_second_place=use_second_place,
            use_both_places=use_both_places,
            frozen_agent=frozen_agent,
            mix_frozen_complex=mix_frozen_complex)
        for rank in range(num_processes)
    ]
    return SubprocVecEnv(envs)
def make_vec_envs(env_name, seed, num_processes, gamma, log_dir, add_timestep,
                  device, allow_early_resets):
    envs = [
        make_env(env_name, seed, i, log_dir, add_timestep, allow_early_resets)
        for i in range(num_processes)
    ]

    if len(envs) > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    envs = VecPyTorch(envs, device)
    '''
    if len(envs.observation_space.shape) == 3:
        print('Creating frame stacking wrapper')
        envs = VecPyTorchFrameStack(envs, 4, device)
        #print(envs.observation_space)
    '''
    return envs
def build_env(args):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin':
        ncpu //= 2
    nenv = args.num_env or ncpu
    print('num of env: ' + str(nenv))
    seed = args.seed
    env_id = args.env

    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=1,
                            inter_op_parallelism_threads=1)
    config.gpu_options.allow_growth = True
    get_session(config=config)

    # Bind i as a default argument so each worker gets its own seed; a bare
    # lambda here would capture the loop variable by reference and every
    # worker would end up with the last value of i.
    env = SubprocVecEnv([
        lambda i=i: make_env_from_id(
            env_id, seed + i if seed is not None else None, "")
        for i in range(nenv)
    ])
    return env
def reset_task(self, tasks, batch_id, reset_type='learning'):
    # regenerate new envs to avoid the engine stuck bug!
    # for i in range(self.num_workers):
    dic_agent_conf_list = []
    dic_traffic_env_conf_list = []
    dic_path_list = []
    for task in tasks:
        task_id = self.dic_traffic_env_conf['TRAFFIC_IN_TASKS'].index(task)

        dic_agent_conf = copy.deepcopy(self.dic_agent_conf)
        dic_agent_conf['TRAFFIC_FILE'] = task
        dic_agent_conf_list.append(dic_agent_conf)

        dic_traffic_env_conf = copy.deepcopy(self.dic_traffic_env_conf)
        dic_traffic_env_conf['TRAFFIC_FILE'] = task
        dic_traffic_env_conf_list.append(dic_traffic_env_conf)

        dic_path = copy.deepcopy(self.dic_path)
        if reset_type == 'test':
            dic_path["PATH_TO_LOG"] = os.path.join(
                dic_path['PATH_TO_WORK_DIRECTORY'], reset_type + '_round',
                'task_%d_%s' % (task_id, task), 'tasks_round_' + str(batch_id))
        else:
            dic_path["PATH_TO_LOG"] = os.path.join(
                dic_path['PATH_TO_WORK_DIRECTORY'], reset_type + '_round',
                'tasks_round_' + str(batch_id), 'task_%d_%s' % (task_id, task))
        dic_path['PATH_TO_SUMO_CONF'] = os.path.join(
            dic_path['PATH_TO_WORK_DIRECTORY'], "sumo_conf", task)
        dic_path_list.append(dic_path)
        if not os.path.exists(dic_path['PATH_TO_LOG']):
            os.makedirs(dic_path['PATH_TO_LOG'])

    self.envs = SubprocVecEnv(dic_path_list, dic_traffic_env_conf_list,
                              len(tasks), queue=self.queue)
def train(model_dict):

    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(state).float()  # (processes, channels, height, width)
        # slide the stacked frames forward and insert the new frame at the end
        current_state[:, :-channels] = current_state[:, channels:]
        current_state[:, -channels:] = state
        return current_state

    def update_rewards(reward, done, final_rewards, episode_rewards, current_state):
        # reward, done: [P], [P]; final_rewards, episode_rewards: [P,1]; current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()  # [P,1]
        episode_rewards += reward  # running return of the current episode
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])  # [P,1]
        final_rewards *= masks  # erase the ones that are done
        final_rewards += (1 - masks) * episode_rewards  # set to the cumulative episode reward
        episode_rewards *= masks  # erase the done ones
        masks = masks.type(dtype)  # cuda
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  # [P,1,1,1]
        else:
            current_state *= masks  # restart the done ones by zeroing the state
        return reward, masks, final_rewards, episode_rewards, current_state

    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']
    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
    model_dict['dtype'] = dtype

    # Create environments
    print(num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print('Made dir', monitor_rewards_dir)
    envs = SubprocVecEnv([make_env(env_name, seed, i, monitor_rewards_dir)
                          for i in range(num_processes)])
    if vid_:
        print('env for video')
        envs_video = make_env_monitor(env_name, save_dir)
    if gif_:
        print('env for gif')
        envs_gif = make_env_basic(env_name)
    if ls_:
        print('env for ls')
        envs_ls = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  # channels
    model_dict['obs_shape'] = obs_shape
    model_dict['shape_dim0'] = shape_dim0

    # Expert agent: a pretrained A2C policy whose action distribution is distilled below
    print('Init expert agent')
    expert_agent = a2c(envs, model_dict)
    param_file = home + '/Documents/tmp/breakout_2frames_leakyrelu2/BreakoutNoFrameskip-v4/A2C/seed0/model_params3/model_params9999360.pt'
    param_dict = torch.load(param_file)
    expert_agent.actor_critic.load_state_dict(param_dict)
    print('loaded params', param_file)
    expert_agent.actor_critic.cuda()

    # Imitator agent: trained to match the expert's action distribution
    print('Init imitator agent')
    imitator_agent = a2c(envs, model_dict)
    imitator_agent.actor_critic.cuda()

    agent = expert_agent
    expert_policy = expert_agent.actor_critic
    imitator_policy = imitator_agent.actor_critic

    optimizer = optim.Adam(imitator_policy.parameters(), lr=.0005, weight_decay=.00001)

    total_steps = 0
    display_step = 50

    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(current_state, state, shape_dim0).type(dtype)  # add the new frame, drop the oldest
    agent.insert_first_state(current_state)  # storage has states: (num_steps + 1, num_processes, *obs_shape), set first step

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1])  # current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes
    save_interval_num_updates = int(save_interval / num_processes / num_steps)

    # Begin training
    start = time.time()
    start2 = time.time()
    for j in range(num_updates):
        for step in range(num_steps):
            # Act, [P,1], [P], [P,1], [P]
            state__ = Variable(agent.rollouts.states[step]) / 255.
            value, action, action_log_probs, dist_entropy = agent.act(state__)

            # Distill the expert into the imitator on the current batch of states
            batch = state__
            optimizer.zero_grad()
            log_dist_expert = expert_policy.action_logdist(batch)
            log_dist_imitator = imitator_policy.action_logdist(batch)
            action_dist_kl = torch.sum(
                (log_dist_expert - log_dist_imitator) * torch.exp(log_dist_expert), dim=1)  # [B]
            loss = torch.mean(action_dist_kl)
            loss.backward()
            optimizer.step()

            cpu_actions = action.data.squeeze(1).cpu().numpy()  # [P]

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions)

            # Record rewards and update state
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(
                reward, done, final_rewards, episode_rewards, current_state)
            current_state = update_current_state(current_state, state, shape_dim0)

            # Agent record step
            agent.insert_data(step, current_state, action.data, value, reward,
                              masks, action_log_probs, dist_entropy)

        # Optimize agent (the expert is frozen; only the imitator is updated above)
        agent.no_update()
        agent.insert_first_state(agent.rollouts.states[-1])

        total_num_steps = (j + 1) * num_processes * num_steps

        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:
            save_to = home + '/Documents/tmp/breakout_2frames_leakyrelu2/imitator_params_env.ckpt'
            torch.save(imitator_policy.state_dict(), save_to)
            print('saved imitator_policy', save_to)

            # make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype,
                       agent, model_dict, total_num_steps)
            # make gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state,
                        update_rewards, total_num_steps)

        # Print updates
        if j % log_interval == 0:
            end = time.time()
            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.1f}, {:.4f}".format(
                j, total_num_steps,
                final_rewards.min(), final_rewards.median(),
                final_rewards.mean(), final_rewards.max(),
                int(total_num_steps / (end - start)), end - start,
                end - start2, loss.data[0])
            print(to_print_info_string)
            start2 = time.time()

            to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, Time"
            if j % (log_interval * 30) == 0:
                if ls_:
                    do_ls(envs_ls, agent, model_dict, total_num_steps,
                          update_current_state, update_rewards)
                # update plots
                try:
                    if ls_:
                        update_ls_plot(model_dict)
                    make_plots(model_dict)
                    print(to_print_legend_string + " Plot updated")
                except:
                    raise  # pass
                    print(to_print_legend_string)

    try:
        make_plots(model_dict)
    except:
        print()
def main():
    cumulative_avg_rewards = []
    for seed_ in [10, 50, 100, 200, 500]:
        seed(seed_)
        set_random_seed(seed_)
        print("Seed: ", seed_)
        episode = 0

        # initialize environment
        env_id = get_args().env
        # env = make_atari(env_id)
        # env = wrap_deepmind(env, frame_stack=True, clip_rewards=False, episode_life=False)
        # env = Monitor(env)
        env = SubprocVecEnv([make_env(seed_, i) for i in range(6)])  # 24
        print("CHECK_ENV", env.reset().__array__().shape)
        state_size = env.observation_space.shape[0]
        action_size = env.action_space.n

        agent = get_agent(env)
        save_path = os.path.join('models_entropy_coeff1',
                                 "Space_inv_A2C_LSTM_nstep8_MAX_rew_546")
        agent.load(save_path)
        lstm_state = np.zeros((6, 256), dtype=np.float32)  # 24

        # run for 100 episodes
        # for i in range(100):
        counter = 0
        episodic_reward_lis = []
        for i in range(wandb.config.episodes):
            # Set reward received in this episode = 0 at the start of the episode
            episodic_reward = np.zeros((6))  # 24
            episodic_reward_m = np.zeros((6))  # 24
            reset = False
            # env = gym.wrappers.Monitor(env, 'test/' + str(i), force=True)
            obs = env.reset()
            renders = []
            count = 0
            action_count = 0
            done = False
            done1 = np.zeros(6)  # 24
            done2 = np.zeros(6)  # 24

            while not done:
                a, v, lstm_state = agent.step(obs, S_=lstm_state, M_=done1)
                obs, reward, done1, info = env.step(a, done1, cond="eval")
                done = done2.all()
                if done:
                    episodic_reward_m1 = episodic_reward_m.max()
                    break
                if done1.any():
                    episodic_reward_m[np.logical_and(done2 <= 0, done1)] = \
                        episodic_reward[np.logical_and(done2 <= 0, done1)]
                    for j in np.nonzero(done1)[0]:
                        episodic_reward[j] = 0
                episodic_reward += reward
                done2 = np.logical_or(done1, done2)

            if i == 0:
                reset = True
            cumulative_avg_reward = evaluate(episodic_reward_m1, reset)

        tf.reset_default_graph()
        env.close()

        # your models will be evaluated on 100-episode average reward
        # therefore, we stop logging after 100 episodes
        print("*************************************************************")
        print("CUMULATIVE_AVG_REWARD", cumulative_avg_reward)
        print("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
        cumulative_avg_rewards.append(cumulative_avg_reward)

    print("Final score: ", np.mean(cumulative_avg_rewards))
def train(model_dict):

    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(state).float()  # (processes, channels, height, width)
        # slide the stacked frames forward and insert the new frame at the end
        current_state[:, :-channels] = current_state[:, channels:]
        current_state[:, -channels:] = state
        return current_state

    def update_rewards(reward, done, final_rewards, episode_rewards, current_state):
        # reward, done: [P], [P]; final_rewards, episode_rewards: [P,1]; current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()  # [P,1]
        episode_rewards += reward  # running return of the current episode
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])  # [P,1]
        final_rewards *= masks  # erase the ones that are done
        final_rewards += (1 - masks) * episode_rewards  # set to the cumulative episode reward
        episode_rewards *= masks  # erase the done ones
        masks = masks.type(dtype)  # cuda
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  # [P,1,1,1]
        else:
            current_state *= masks  # restart the done ones by zeroing the state
        return reward, masks, final_rewards, episode_rewards, current_state

    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']
    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
    model_dict['dtype'] = dtype

    # Create environments
    print(num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print('Made dir', monitor_rewards_dir)
    envs = SubprocVecEnv([make_env(env_name, seed, i, monitor_rewards_dir)
                          for i in range(num_processes)])
    if vid_:
        print('env for video')
        envs_video = make_env_monitor(env_name, save_dir)
    if gif_:
        print('env for gif')
        envs_gif = make_env_basic(env_name)
    if ls_:
        print('env for ls')
        envs_ls = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  # channels
    model_dict['obs_shape'] = obs_shape
    model_dict['shape_dim0'] = shape_dim0
    action_size = envs.action_space.n
    model_dict['action_size'] = action_size

    # Create agent
    if algo == 'a2c':
        agent = a2c(model_dict)
        print('init a2c agent')

    # Load model
    if model_dict['load_params']:
        if model_dict['load_number'] == 3:
            load_params_v2(home + '/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/',
                           agent, 3000160, model_dict)
        elif model_dict['load_number'] == 6:
            load_params_v2(home + '/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/',
                           agent, 6000160, model_dict)
        elif model_dict['load_number'] == 9:
            load_params_v2(home + '/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/',
                           agent, 9000160, model_dict)
        else:
            PROBLEM  # intentional NameError: no matching checkpoint number

    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(current_state, state, shape_dim0).type(dtype)  # add the new frame, drop the oldest
    agent.insert_first_state(current_state)  # storage has states: (num_steps + 1, num_processes, *obs_shape), set first step

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1])  # current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes
    save_interval_num_updates = int(save_interval / num_processes / num_steps)

    # Begin training
    start = time.time()
    start2 = time.time()
    for j in range(num_updates):
        for step in range(num_steps):
            # Act, [P,1], [P], [P,1], [P]
            value, action, action_log_probs, dist_entropy = agent.act(
                Variable(agent.rollouts.states[step] / 255.))
            cpu_actions = action.data.squeeze(1).cpu().numpy()  # [P]

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions)

            # Record rewards and update state
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(
                reward, done, final_rewards, episode_rewards, current_state)
            current_state = update_current_state(current_state, state, shape_dim0)

            # Agent record step
            agent.insert_data(step, current_state, action.data, value, reward,
                              masks, action_log_probs, dist_entropy)

        # Optimize agent
        agent.update()
        agent.insert_first_state(agent.rollouts.states[-1])

        total_num_steps = (j + 1) * num_processes * num_steps

        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:
            # Save model
            if save_params:
                save_params_v3(save_dir, agent, total_num_steps, model_dict)
            # make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype,
                       agent, model_dict, total_num_steps)
            # make gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state,
                        update_rewards, total_num_steps)

        # Print updates
        if j % log_interval == 0:
            end = time.time()
            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.1f}".format(
                j, total_num_steps,
                final_rewards.min(), final_rewards.median(),
                final_rewards.mean(), final_rewards.max(),
                int(total_num_steps / (end - start)), end - start, end - start2)
            print(to_print_info_string)
            start2 = time.time()

            to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, Time"
            if j % (log_interval * 30) == 0:
                if ls_:
                    do_ls(envs_ls, agent, model_dict, total_num_steps,
                          update_current_state, update_rewards)
                # update plots
                try:
                    if ls_:
                        update_ls_plot(model_dict)
                    make_plots(model_dict)
                    print(to_print_legend_string + " Plot updated")
                except:
                    raise  # pass
                    print(to_print_legend_string)

    try:
        make_plots(model_dict)
    except:
        print()
parser.add_argument('--plot-points', type=int, default=20,
                    help='number of plot points (groups with mean, std)')
parser.add_argument('--plot-path', type=str, default='ep_reward.png',
                    help='path to save reward plot to')
parser.add_argument('--seed', type=int, default=0, help='random seed')
args = parser.parse_args()

set_seed(args.seed)
cuda = torch.cuda.is_available() and not args.no_cuda

env_fns = []
for rank in range(args.num_workers):
    # Bind rank as a default argument; a bare lambda would capture the loop
    # variable by reference and every worker would get the last rank.
    env_fns.append(lambda rank=rank: make_env(args.env_id, rank, args.seed + rank))
if args.render:
    venv = RenderSubprocVecEnv(env_fns, args.render_interval)
else:
    venv = SubprocVecEnv(env_fns)
venv = VecFrameStack(venv, 4)
test_env = make_env(args.env_id, 0, args.seed)
test_env = FrameStack(test_env, 4)

policy = {'cnn': AtariCNN}[args.arch](venv.action_space.n)
policy = cuda_if(policy, cuda)

optimizer = optim.Adam(policy.parameters())
if args.lr_func == 'linear':
    lr_func = lambda a: args.lr * (1. - a)
elif args.lr_func == 'constant':
    lr_func = lambda a: args.lr
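# The default-argument binding used above matters whenever env thunks are built
# in a loop for SubprocVecEnv. A small self-contained sketch of the pitfall
# (names and values here are purely illustrative):
wrong = [lambda: i for i in range(4)]            # every closure sees the final i
right = [lambda i=i: i for i in range(4)]        # i is frozen per closure via default arg

print([f() for f in wrong])   # [3, 3, 3, 3]
print([f() for f in right])   # [0, 1, 2, 3]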
def viz(model_dict):

    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(state).float()  # (processes, channels, height, width)
        # slide the stacked frames forward and insert the new frame at the end
        current_state[:, :-channels] = current_state[:, channels:]
        current_state[:, -channels:] = state
        return current_state

    def update_rewards(reward, done, final_rewards, episode_rewards, current_state):
        # reward, done: [P], [P]; final_rewards, episode_rewards: [P,1]; current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()  # [P,1]
        episode_rewards += reward  # running return of the current episode
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])  # [P,1]
        final_rewards *= masks  # erase the ones that are done
        final_rewards += (1 - masks) * episode_rewards  # set to the cumulative episode reward
        episode_rewards *= masks  # erase the done ones
        masks = masks.type(dtype)  # cuda
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  # [P,1,1,1]
        else:
            current_state *= masks  # restart the done ones by zeroing the state
        return reward, masks, final_rewards, episode_rewards, current_state

    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    num_processes = 1
    model_dict['num_processes'] = 1
    model_dict['num_steps'] = max_frames
    num_steps = max_frames

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor

    # Create environments
    print(num_processes, 'processes')
    monitor_rewards_dir = ''
    envs = SubprocVecEnv([make_env(env_name, seed, i, monitor_rewards_dir)
                          for i in range(num_processes)])

    vid_ = 0
    see_frames = 1
    if vid_:
        print('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  # channels
    model_dict['obs_shape'] = obs_shape

    # Create agent
    if algo == 'a2c':
        agent = a2c(envs, model_dict)
        print('init a2c agent')
    elif algo == 'ppo':
        agent = ppo(envs, model_dict)
        print('init ppo agent')
    elif algo == 'a2c_minibatch':
        agent = a2c_minibatch(envs, model_dict)
        print('init a2c_minibatch agent')

    # Load model
    model_params_file = save_dir + '/model_params/model_params' + str(int(epoch_level)) + '.pt'
    agent.actor_critic = torch.load(model_params_file).cuda()
    print('loaded ', model_params_file)

    frame_path = save_dir + '/frames/'
    if not os.path.exists(frame_path):
        os.makedirs(frame_path)
        print('Made dir', frame_path)

    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(current_state, state, shape_dim0).type(dtype)  # add the new frame, drop the oldest
    agent.insert_first_state(current_state)  # storage has states: (num_steps + 1, num_processes, *obs_shape), set first step

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1])  # current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes

    # Begin rollout
    count = 0
    start = time.time()
    for j in range(num_updates):
        for step in range(num_steps):

            # if see_frames:
            #     save_frame(state, count)

            count += 1
            if count % 10 == 0:
                print(count)
            if count > 2:
                if reward.cpu().numpy() > 0:
                    print(done[0], masks.cpu().numpy(), reward.cpu().numpy(), 'reward!!', step)
                    print(np.squeeze(agent.rollouts.rewards.cpu().numpy()))
                else:
                    print(done[0], masks.cpu().numpy(), reward.cpu().numpy())

            # if done[0] or count > max_frames:
            if count > max_frames:
                next_value = agent.actor_critic(
                    Variable(agent.rollouts.states[-1], volatile=True))[0].data
                agent.rollouts.compute_returns(next_value, agent.use_gae, agent.gamma, agent.tau)
                rollouts_ = np.squeeze(agent.rollouts.returns.cpu().numpy())
                rewards_ = np.squeeze(agent.rollouts.rewards.cpu().numpy())
                for jj in range(len(rollouts_)):
                    print(jj, rollouts_[jj], rewards_[jj])
                ffsdfa  # intentional NameError: stop after dumping the returns

            # Act, [P,1], [P,1]
            action, value = agent.act(Variable(agent.rollouts.states[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()  # [P]

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions)

            # Record rewards
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(
                reward, done, final_rewards, episode_rewards, current_state)

            # Update state
            current_state = update_current_state(current_state, state, shape_dim0)

            # Agent record step
            agent.insert_data(step, current_state, action.data, value.data, reward, masks)

        total_num_steps = (j + 1) * num_processes * num_steps
def train(model_dict):

    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(state).float()  # (processes, channels, height, width)
        # slide the stacked frames forward and insert the new frame at the end
        current_state[:, :-channels] = current_state[:, channels:]
        current_state[:, -channels:] = state
        return current_state

    def update_rewards(reward, done, final_rewards, episode_rewards, current_state):
        # reward, done: [P], [P]; final_rewards, episode_rewards: [P,1]; current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()  # [P,1]
        episode_rewards += reward  # running return of the current episode
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])  # [P,1]
        final_rewards *= masks  # erase the ones that are done
        final_rewards += (1 - masks) * episode_rewards  # set to the cumulative episode reward
        episode_rewards *= masks  # erase the done ones
        masks = masks.type(dtype)  # cuda
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  # [P,1,1,1]
        else:
            current_state *= masks  # restart the done ones by zeroing the state
        return reward, masks, final_rewards, episode_rewards, current_state

    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']
    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']
    vae_ = model_dict['vae_']
    grad_var_ = model_dict['grad_var_']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
    model_dict['dtype'] = dtype

    # Create environments
    print(num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print('Made dir', monitor_rewards_dir)
    envs = SubprocVecEnv([make_env(env_name, seed, i, monitor_rewards_dir)
                          for i in range(num_processes)])
    if vid_:
        print('env for video')
        envs_video = make_env_monitor(env_name, save_dir)
    if gif_:
        print('env for gif')
        envs_gif = make_env_basic(env_name)
    if ls_:
        print('env for ls')
        envs_ls = make_env_basic(env_name)
    if vae_:
        print('env for vae')
        envs_vae = make_env_basic(env_name)
    if grad_var_:
        print('env for grad_var_')
        envs_grad_var = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  # channels
    model_dict['obs_shape'] = obs_shape
    model_dict['shape_dim0'] = shape_dim0
    model_dict['action_size'] = envs.action_space.n
    print(envs.action_space.n, 'actions')

    # Create agent
    if algo == 'a2c':
        agent = a2c(envs, model_dict)
        print('init a2c agent')
    discriminator = CNN_Discriminator(model_dict).cuda()
    print('init discriminator')

    buffer_ = 1
    if buffer_:
        buffer_states = deque(maxlen=200)
        buffer_actions = deque(maxlen=200)

    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(current_state, state, shape_dim0).type(dtype)  # add the new frame, remove the oldest, since it's a stack
    agent.insert_first_state(current_state)  # storage has states: (num_steps + 1, num_processes, *obs_shape), set first step

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1])  # current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes
    save_interval_num_updates = int(save_interval / num_processes / num_steps)

    # Begin training
    start = time.time()
    start2 = time.time()
    for j in range(num_updates):
        for step in range(num_steps):
            # Act, [P,1], [P,1], [P,1], [P]
            state_pytorch = Variable(agent.rollouts.states[step])
            value, action, action_log_probs, dist_entropy = agent.act(state_pytorch)

            # Apply to environment, S:[P,C,H,W], R:[P], D:[P]
            cpu_actions = action.data.squeeze(1).cpu().numpy()  # [P]
            frame, reward, done, info = envs.step(cpu_actions)

            # Record rewards and update state
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(
                reward, done, final_rewards, episode_rewards, current_state)
            current_state = update_current_state(current_state, frame, shape_dim0)
            agent.insert_data(step, current_state, action.data, value, reward,
                              masks, action_log_probs, dist_entropy, 0)

        buffer_steps = 500
        if buffer_:
            # Insert the rollout into the buffer
            buffer_states.append(agent.rollouts.states)
            buffer_actions.append(agent.rollouts.actions)

            # If the buffer is full enough, sample, predict, optimize
            if len(buffer_actions) == 100:
                # Number of optimization steps
                for i in range(buffer_steps):
                    # #Sample batch
                    # states_batch = []
                    # actions_batch = []
                    # for bb in range(num_processes):
ind = np.random.randint(len(buffer_actions)) # print (buffer_states[ind].size()) # fadas # states_batch.append(buffer_states[ind]) # actions_batch.append(buffer_actions[ind]) # states_batch = torch.stack(states_batch, dim=1) # actions_batch = torch.stack(actions_batch, dim=1) ind = np.random.randint(len(buffer_actions)) states_batch = buffer_states[ind] actions_batch = buffer_actions[ind] #Optimize action-predictor discrim_errors = discrim_predictions( model_dict, states_batch, actions_batch, discriminator) discriminator.optimize(discrim_errors) if i % 20 == 0: print(i) # print (len(buffer_actions), torch.mean(discrim_errors).data.cpu().numpy()[0]) #Optimize agent discrim_errors = discrim_predictions(model_dict, agent.rollouts.states, agent.rollouts.actions, discriminator) discrim_errors_reverse = discrim_predictions( model_dict, agent.rollouts.states, agent.rollouts.actions, discriminator, reverse=True) if len(buffer_actions) > 100: discriminator.optimize(discrim_errors) agent.update2(discrim_errors, discrim_errors_reverse) #agent.update(j,num_updates) # agent.update2(discrim_errors) #agent.update(j,num_updates) else: discrim_errors = discrim_predictions(model_dict, agent.rollouts.states, agent.rollouts.actions, discriminator) discrim_errors_reverse = discrim_predictions( model_dict, agent.rollouts.states, agent.rollouts.actions, discriminator, reverse=True) #Optimize discriminator discriminator.optimize(discrim_errors) #Optimize agent agent.update2(discrim_errors, discrim_errors_reverse) #agent.update(j,num_updates) # agent.update2(discrim_errors) #agent.update(j,num_updates) agent.insert_first_state(agent.rollouts.states[-1]) # print ('save_interval_num_updates', save_interval_num_updates) # print ('num_updates', num_updates) # print ('j', j) total_num_steps = (j + 1) * num_processes * num_steps # if total_num_steps % save_interval == 0 and save_dir != "": if j % save_interval_num_updates == 0 and save_dir != "" and j != 0: #Save model if save_params: do_params(save_dir, agent, total_num_steps, model_dict) #make video if vid_: do_vid(envs_video, update_current_state, shape_dim0, dtype, agent, model_dict, total_num_steps) #make gif if gif_: do_gifs(envs_gif, agent, model_dict, update_current_state, update_rewards, total_num_steps) #make vae prob gif if vae_: do_prob_state(envs_vae, agent, model_dict, vae, update_current_state, total_num_steps) # #make vae prob gif # if grad_var_: # do_grad_var(envs_grad_var, agent, model_dict, update_current_state, total_num_steps) #Print updates if j % log_interval == 0: # and j!=0: end = time.time() to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.2f}, {:.3f}".format( j, total_num_steps, final_rewards.min(), final_rewards.median(), final_rewards.mean(), final_rewards.max(), int(total_num_steps / (end - start)), end - start, end - start2, torch.mean(discrim_errors).data.cpu().numpy()[0]) print(to_print_info_string) # if vae_: # elbo = "{:.2f}".format(elbo.data.cpu().numpy()[0]) # if next_state_pred_: # state_pred_error_print = "{:.2f}".format(agent.state_pred_error.data.cpu().numpy()[0]) # print(to_print_info_string+' '+state_pred_error_print+' '+elbo) # to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, total_T, step_T, pred_error, elbo" # else: # if vae_: # print(to_print_info_string+' '+elbo) # else: # print(to_print_info_string) to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, total_T, step_T, discrim_E" #, elbo" start2 = time.time() if j % (log_interval * 30) == 0: if ls_: 
do_ls(envs_ls, agent, model_dict, total_num_steps, update_current_state, update_rewards) # if grad_var_ and j % (log_interval*300) == 0: if grad_var_ and j % (log_interval * 30) == 0: #writes to file do_grad_var(envs_grad_var, agent, model_dict, total_num_steps, update_current_state, update_rewards) # print("Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated, LS updated") # print(to_print_info_string + ' LS recorded')#, agent.current_lr) # else: #update plots try: if ls_: update_ls_plot(model_dict) # if grad_var_ and j % (log_interval*300) == 0: if grad_var_ and j % (log_interval * 30) == 0: update_grad_plot(model_dict) to_print_legend_string += ' grad_var_plot updated ' make_plots(model_dict) print(to_print_legend_string + " Plot updated") except: raise #pass print(to_print_legend_string + " problem with plot") try: make_plots(model_dict) except: print()
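# The discriminator variant above stores whole rollouts in bounded deques and, once
# roughly 100 rollouts have accumulated, repeatedly samples one at random to train
# the action predictor before its errors are fed to agent.update2. A minimal sketch
# of that buffer bookkeeping; the discriminator_step callable is a stand-in for the
# script's CNN_Discriminator optimize call:
from collections import deque
import random

buffer_states = deque(maxlen=200)
buffer_actions = deque(maxlen=200)

def store_rollout(states, actions):
    buffer_states.append(states)
    buffer_actions.append(actions)

def sample_rollout():
    idx = random.randrange(len(buffer_actions))  # assumes the buffer is non-empty
    return buffer_states[idx], buffer_actions[idx]

def pretrain(discriminator_step, num_steps=500):
    # discriminator_step(states, actions) is assumed to run one optimization step
    for _ in range(num_steps):
        states, actions = sample_rollout()
        discriminator_step(states, actions)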
def train(model_dict): def update_current_state(current_state, state, channels): # current_state: [processes, channels*stack, height, width] state = torch.from_numpy(state).float() # (processes, channels, height, width) # if num_stack > 1: #first stack*channel-channel frames = last stack*channel-channel , so slide them forward current_state[:, :-channels] = current_state[:, channels:] current_state[:, -channels:] = state #last frame is now the new one return current_state def update_rewards(reward, done, final_rewards, episode_rewards, current_state): # Reward, Done: [P], [P] # final_rewards, episode_rewards: [P,1]. [P,1] # current_state: [P,C*S,H,W] reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() #[P,1] episode_rewards += reward #keeps track of current episode cumulative reward masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #[P,1] final_rewards *= masks #erase the ones that are done final_rewards += (1 - masks) * episode_rewards #set it to the cumulative episode reward episode_rewards *= masks #erase the done ones masks = masks.type(dtype) #cuda if current_state.dim() == 4: # if state is a frame/image current_state *= masks.unsqueeze(2).unsqueeze(2) #[P,1,1,1] else: current_state *= masks #restart the done ones, by setting the state to zero return reward, masks, final_rewards, episode_rewards, current_state num_frames = model_dict['num_frames'] cuda = model_dict['cuda'] which_gpu = model_dict['which_gpu'] num_steps = model_dict['num_steps'] num_processes = model_dict['num_processes'] seed = model_dict['seed'] env_name = model_dict['env'] save_dir = model_dict['save_to'] num_stack = model_dict['num_stack'] algo = model_dict['algo'] save_interval = model_dict['save_interval'] log_interval = model_dict['log_interval'] save_params = model_dict['save_params'] vid_ = model_dict['vid_'] gif_ = model_dict['gif_'] ls_ = model_dict['ls_'] os.environ['OMP_NUM_THREADS'] = '1' os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu) if cuda: torch.cuda.manual_seed(seed) dtype = torch.cuda.FloatTensor model_dict['dtype']=dtype else: torch.manual_seed(seed) dtype = torch.FloatTensor model_dict['dtype']=dtype # Create environments print (num_processes, 'processes') monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards') if not os.path.exists(monitor_rewards_dir): os.makedirs(monitor_rewards_dir) print ('Made dir', monitor_rewards_dir) envs = SubprocVecEnv([make_env(env_name, seed, i, monitor_rewards_dir) for i in range(num_processes)]) if vid_: print ('env for video') envs_video = make_env_monitor(env_name, save_dir) if gif_: print ('env for gif') envs_gif = make_env_basic(env_name) if ls_: print ('env for ls') envs_ls = make_env_basic(env_name) obs_shape = envs.observation_space.shape # (channels, height, width) obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:]) # (channels*stack, height, width) shape_dim0 = envs.observation_space.shape[0] #channels model_dict['obs_shape']=obs_shape model_dict['shape_dim0']=shape_dim0 # Create agent if algo == 'a2c': agent = a2c(envs, model_dict) print ('init a2c agent') # elif algo == 'ppo': # agent = ppo(envs, model_dict) # print ('init ppo agent') # elif algo == 'a2c_minibatch': # agent = a2c_minibatch(envs, model_dict) # print ('init a2c_minibatch agent') # elif algo == 'a2c_list_rollout': # agent = a2c_list_rollout(envs, model_dict) # print ('init a2c_list_rollout agent') # elif algo == 'a2c_with_var': # agent = a2c_with_var(envs, model_dict) # print ('init a2c_with_var agent') # elif algo == 'a2c_bin_mask': # 
agent = a2c_with_var(envs, model_dict) # print ('init a2c_with_var agent') # agent = model_dict['agent'](envs, model_dict) # #Load model # if model_dict['load_params']: # # agent.actor_critic = torch.load(os.path.join(args.load_path)) # # agent.actor_critic = torch.load(args.load_path).cuda() # # print ('loaded ', args.load_path) # if model_dict['load_number'] == 3: # load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 3000160, model_dict) # elif model_dict['load_number'] == 6: # load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 6000160, model_dict) # elif model_dict['load_number'] == 9: # load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 9000160, model_dict) # # else: # # load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 8000160, model_dict) # else: # PROBLEM #load model # if model_dict['load_params']: # load_params(thigns) # param_file = home+'/Documents/tmp/breakout_2frames/BreakoutNoFrameskip-v4/A2C/seed0/model_params/model_params9999360.pt' param_file = home+'/Documents/tmp/RoadRunner/RoadRunnerNoFrameskip-v4/A2C/seed1/model_params3/model_params9999360.pt' # pretrained_dict = torch.load(param_file) # object # print (pretrained_dict) # agent_dict = agent.actor_critic.state_dict() #dict # print (agent_dict.keys()) # agent_dict.update(pretrained_dict) # # agent_dict.update(agent.actor_critic) # agent.actor_critic.load_state_dict(agent_dict) param_dict = torch.load(param_file) agent.actor_critic.load_state_dict(param_dict) # agent.actor_critic = torch.load(param_file) agent.actor_critic.cuda() print ('loaded', param_file) # afdsa # Init state state = envs.reset() # (processes, channels, height, width) current_state = torch.zeros(num_processes, *obs_shape) # (processes, channels*stack, height, width) current_state = update_current_state(current_state, state, shape_dim0).type(dtype) #add the new frame, remove oldest agent.insert_first_state(current_state) #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step # These are used to compute average rewards for all processes. episode_rewards = torch.zeros([num_processes, 1]) #keeps track of current episode cumulative reward final_rewards = torch.zeros([num_processes, 1]) num_updates = int(num_frames) // num_steps // num_processes save_interval_num_updates = int(save_interval /num_processes/num_steps) # list of lists, where lists are trajectories. 
trajectories have actinos and states dataset = [] tmp_trajs = [[] for x in range(num_processes)] dataset_count = 0 done = [0]*num_processes #Begin training # count =0 start = time.time() start2 = time.time() for j in range(num_updates): for step in range(num_steps): # Act, [P,1], [P], [P,1], [P] # value, action = agent.act(Variable(agent.rollouts.states[step], volatile=True)) value, action, action_log_probs, dist_entropy = agent.act(Variable(agent.rollouts.states[step]))#, volatile=True)) # print (action_log_probs.size()) # print (dist_entropy.size()) cpu_actions = action.data.squeeze(1).cpu().numpy() #[P] # cpu_actions = action.data.cpu().numpy() #[P] # print (actions.size()) # y = torch.LongTensor(batch_size,1).random_() % nb_digits # # One hot encoding buffer that you create out of the loop and just keep reusing # y_onehot = torch.FloatTensor(batch_size, nb_digits) # # In your for loop # y_onehot.zero_() # y_onehot.scatter_(1, y, 1) states_ = agent.rollouts.states[step].cpu().numpy() #[P,S,84,84] # print (state_t.shape) actions_ = action.data.cpu().numpy() #[P,1] # print (action) # fdsaf #store step for proc in range(num_processes): #add states state_t = states_[proc] action_t = actions_[proc] tmp_trajs[proc].append([action_t, state_t]) if done[proc]: dataset.append(tmp_trajs[proc]) dataset_count += len(tmp_trajs[proc]) tmp_trajs[proc] = [] for ii in range(len(dataset)): print (len(dataset[ii])) if dataset_count > 10000: # pickle.dump( dataset, open(home+'/Documents/tmp/breakout_2frames/breakout_trajectories_10000.pkl', "wb" ) ) pickle.dump( dataset, open(home+'/Documents/tmp/RoadRunner/trajectories_10000.pkl', "wb" ) ) print('saved') # pickle.save(dataset) STOP # Step, S:[P,C,H,W], R:[P], D:[P] state, reward, done, info = envs.step(cpu_actions) # Record rewards and update state reward, masks, final_rewards, episode_rewards, current_state = update_rewards(reward, done, final_rewards, episode_rewards, current_state) current_state = update_current_state(current_state, state, shape_dim0) # Agent record step # agent.insert_data(step, current_state, action.data, value.data, reward, masks, action_log_probs.data, dist_entropy.data) agent.insert_data(step, current_state, action.data, value, reward, masks, action_log_probs, dist_entropy) #, done) # print (len(dataset)) # print () #Optimize agent # agent.update() #agent.update(j,num_updates) agent.insert_first_state(agent.rollouts.states[-1]) # print ('save_interval_num_updates', save_interval_num_updates) # print ('num_updates', num_updates) # print ('j', j) total_num_steps = (j + 1) * num_processes * num_steps # if total_num_steps % save_interval == 0 and save_dir != "": if j % save_interval_num_updates == 0 and save_dir != "" and j != 0: #Save model if save_params: do_params(save_dir, agent, total_num_steps, model_dict) # save_params_v2(save_dir, agent, total_num_steps, model_dict) #make video if vid_: do_vid(envs_video, update_current_state, shape_dim0, dtype, agent, model_dict, total_num_steps) #make gif if gif_: do_gifs(envs_gif, agent, model_dict, update_current_state, update_rewards, total_num_steps) #Print updates if j % log_interval == 0:# and j!=0: end = time.time() to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.1f}".format(j, total_num_steps, final_rewards.min(), final_rewards.median(), final_rewards.mean(), final_rewards.max(), int(total_num_steps / (end - start)), end - start, end - start2) print(to_print_info_string) start2 = time.time() to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, 
FPS, Time" if j % (log_interval*30) == 0: if ls_: do_ls(envs_ls, agent, model_dict, total_num_steps, update_current_state, update_rewards) # print("Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated, LS updated") # print(to_print_info_string + ' LS recorded')#, agent.current_lr) # else: #update plots try: if ls_: update_ls_plot(model_dict) make_plots(model_dict) print(to_print_legend_string + " Plot updated") except: raise #pass print(to_print_legend_string) try: make_plots(model_dict) except: print ()
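# The trajectory-collection variant above keeps one open trajectory per process,
# flushes it into a dataset whenever that process finishes an episode, and pickles
# the dataset once enough steps have been gathered. A compact sketch of that
# bookkeeping; the output path and step budget are placeholders:
import pickle

def collect_step(open_trajs, dataset, actions, states, done):
    """open_trajs: list of per-process lists; returns the number of steps flushed."""
    flushed = 0
    for p in range(len(open_trajs)):
        open_trajs[p].append((actions[p], states[p]))
        if done[p]:
            dataset.append(open_trajs[p])
            flushed += len(open_trajs[p])
            open_trajs[p] = []
    return flushed

def maybe_dump(dataset, total_steps, path='trajectories.pkl', budget=10000):
    if total_steps > budget:
        with open(path, 'wb') as f:
            pickle.dump(dataset, f)
        return True
    return False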
def getEnvs():
    # SubprocVecEnv expects one callable per worker process, so build one entry per training game.
    env_fns = [make_env for _ in range(len(getListOfGames("train")))]
    print(len(env_fns), 'env fns before SubprocVecEnv')
    envs = SubprocVecEnv(env_fns)
    print(envs, 'envs after SubprocVecEnv')
    return envs
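# getEnvs above hands SubprocVecEnv a list whose entries must each be a zero-argument
# callable that constructs a fresh environment inside its worker subprocess. A minimal
# sketch of that thunk pattern, assuming a classic gym id and the OpenAI baselines
# import path used elsewhere in these scripts:
import gym
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv

def make_thunk(env_id, seed, rank):
    def _thunk():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env
    return _thunk

def build_vec_env(env_id, seed, num_envs):
    return SubprocVecEnv([make_thunk(env_id, seed, i) for i in range(num_envs)])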
def train(model_dict): def update_current_state(current_state, state, channels): # current_state: [processes, channels*stack, height, width] state = torch.from_numpy( state).float() # (processes, channels, height, width) # if num_stack > 1: #first stack*channel-channel frames = last stack*channel-channel , so slide them forward current_state[:, :-channels] = current_state[:, channels:] current_state[:, -channels:] = state #last frame is now the new one return current_state def update_rewards(reward, done, final_rewards, episode_rewards, current_state): # Reward, Done: [P], [P] # final_rewards, episode_rewards: [P,1]. [P,1] # current_state: [P,C*S,H,W] reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() #[P,1] episode_rewards += reward #keeps track of current episode cumulative reward masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #[P,1] final_rewards *= masks #erase the ones that are done final_rewards += ( 1 - masks) * episode_rewards #set it to the cumulative episode reward episode_rewards *= masks #erase the done ones masks = masks.type(dtype) #cuda if current_state.dim() == 4: # if state is a frame/image current_state *= masks.unsqueeze(2).unsqueeze(2) #[P,1,1,1] else: current_state *= masks #restart the done ones, by setting the state to zero return reward, masks, final_rewards, episode_rewards, current_state num_frames = model_dict['num_frames'] cuda = model_dict['cuda'] which_gpu = model_dict['which_gpu'] num_steps = model_dict['num_steps'] num_processes = model_dict['num_processes'] seed = model_dict['seed'] env_name = model_dict['env'] save_dir = model_dict['save_to'] num_stack = model_dict['num_stack'] algo = model_dict['algo'] save_interval = model_dict['save_interval'] log_interval = model_dict['log_interval'] save_params = model_dict['save_params'] vid_ = model_dict['vid_'] gif_ = model_dict['gif_'] ls_ = model_dict['ls_'] os.environ['OMP_NUM_THREADS'] = '1' os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu) if cuda: torch.cuda.manual_seed(seed) dtype = torch.cuda.FloatTensor model_dict['dtype'] = dtype else: torch.manual_seed(seed) dtype = torch.FloatTensor model_dict['dtype'] = dtype # Create environments print(num_processes, 'processes') monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards') if not os.path.exists(monitor_rewards_dir): os.makedirs(monitor_rewards_dir) print('Made dir', monitor_rewards_dir) envs = SubprocVecEnv([ make_env(env_name, seed, i, monitor_rewards_dir) for i in range(num_processes) ]) if vid_: print('env for video') envs_video = make_env_monitor(env_name, save_dir) if gif_: print('env for gif') envs_gif = make_env_basic(env_name) if ls_: print('env for ls') envs_ls = make_env_basic(env_name) obs_shape = envs.observation_space.shape # (channels, height, width) obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:] ) # (channels*stack, height, width) shape_dim0 = envs.observation_space.shape[0] #channels model_dict['obs_shape'] = obs_shape model_dict['shape_dim0'] = shape_dim0 action_size = envs.action_space.n # Create agent if algo == 'a2c': agent = a2c(envs, model_dict) print('init a2c agent') elif algo == 'ppo': agent = ppo(envs, model_dict) print('init ppo agent') elif algo == 'a2c_minibatch': agent = a2c_minibatch(envs, model_dict) print('init a2c_minibatch agent') elif algo == 'a2c_list_rollout': agent = a2c_list_rollout(envs, model_dict) print('init a2c_list_rollout agent') elif algo == 'a2c_with_var': agent = a2c_with_var(envs, model_dict) print('init a2c_with_var agent') # elif algo == 
'a2c_bin_mask': # agent = a2c_with_var(envs, model_dict) # print ('init a2c_with_var agent') # agent = model_dict['agent'](envs, model_dict) #Load model if model_dict['load_params']: # agent.actor_critic = torch.load(os.path.join(args.load_path)) # agent.actor_critic = torch.load(args.load_path).cuda() # print ('loaded ', args.load_path) if model_dict['load_number'] == 3: load_params_v2( home + '/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 3000160, model_dict) elif model_dict['load_number'] == 6: load_params_v2( home + '/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 6000160, model_dict) elif model_dict['load_number'] == 9: load_params_v2( home + '/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 9000160, model_dict) # else: # load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 8000160, model_dict) else: PROBLEM ls_path = save_dir + '/V_and_Q_errors/' ls_file = ls_path + 'error_monitor.csv' if not os.path.exists(ls_path): os.makedirs(ls_path) # if print_: print('Made dir', ls_path) # Init state state = envs.reset() # (processes, channels, height, width) current_state = torch.zeros( num_processes, *obs_shape) # (processes, channels*stack, height, width) current_state = update_current_state( current_state, state, shape_dim0).type(dtype) #add the new frame, remove oldest agent.insert_first_state( current_state ) #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step # These are used to compute average rewards for all processes. episode_rewards = torch.zeros( [num_processes, 1]) #keeps track of current episode cumulative reward final_rewards = torch.zeros([num_processes, 1]) num_updates = int(num_frames) // num_steps // num_processes save_interval_num_updates = int(save_interval / num_processes / num_steps) #Begin training # count =0 start = time.time() start2 = time.time() for j in range(num_updates): Vs = [] Qs = [] for step in range(num_steps): # Act, [P,1], [P], [P,1], [P] # value, action = agent.act(Variable(agent.rollouts.states[step], volatile=True)) value, action, action_log_probs, dist_entropy = agent.act( Variable(agent.rollouts.states[step])) #, volatile=True)) # print (action_log_probs.size()) # print (dist_entropy.size()) one_hot_action = torch.FloatTensor(num_processes, action_size) one_hot_action.zero_() one_hot_action.scatter_(1, action.data.cpu(), 1) # print (action) # print (one_hot_action) # fdsfa V, Q = agent.actor_critic.get_V_and_Q( Variable(agent.rollouts.states[step]), one_hot_action) Vs.append(V) Qs.append(Q) cpu_actions = action.data.squeeze(1).cpu().numpy() #[P] # cpu_actions = action.data.cpu().numpy() #[P] # print (actions.size()) # Step, S:[P,C,H,W], R:[P], D:[P] state, reward, done, info = envs.step(cpu_actions) # Record rewards and update state reward, masks, final_rewards, episode_rewards, current_state = update_rewards( reward, done, final_rewards, episode_rewards, current_state) current_state = update_current_state(current_state, state, shape_dim0) # Agent record step # agent.insert_data(step, current_state, action.data, value.data, reward, masks, action_log_probs.data, dist_entropy.data) agent.insert_data(step, current_state, action.data, value, reward, masks, action_log_probs, dist_entropy) #, done) #Optimize agent # agent.update() #agent.update(j,num_updates) V_loss, Q_loss = agent.update2(Vs, Qs) #agent.update(j,num_updates) V_loss = V_loss.data.cpu().numpy()[0] Q_loss = 
Q_loss.data.cpu().numpy()[0] # print (V_loss) # fasd agent.insert_first_state(agent.rollouts.states[-1]) # print ('save_interval_num_updates', save_interval_num_updates) # print ('num_updates', num_updates) # print ('j', j) total_num_steps = (j + 1) * num_processes * num_steps # if total_num_steps % save_interval == 0 and save_dir != "": if j % save_interval_num_updates == 0 and save_dir != "" and j != 0: #Save model if save_params: do_params(save_dir, agent, total_num_steps, model_dict) save_params_v2(save_dir, agent, total_num_steps, model_dict) #make video if vid_: do_vid(envs_video, update_current_state, shape_dim0, dtype, agent, model_dict, total_num_steps) #make gif if gif_: do_gifs(envs_gif, agent, model_dict, update_current_state, update_rewards, total_num_steps) #Print updates if j % log_interval == 0: # and j!=0: end = time.time() to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.1f}".format( j, total_num_steps, final_rewards.min(), final_rewards.median(), final_rewards.mean(), final_rewards.max(), int(total_num_steps / (end - start)), end - start, end - start2) print(to_print_info_string) start2 = time.time() to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, Time" if j % (log_interval * 30) == 0: if total_num_steps > 5000: with open(ls_file, 'a') as f: writer = csv.writer(f) writer.writerow([total_num_steps, V_loss, Q_loss]) if ls_: do_ls(envs_ls, agent, model_dict, total_num_steps, update_current_state, update_rewards) # print("Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated, LS updated") # print(to_print_info_string + ' LS recorded')#, agent.current_lr) # else: #update plots try: if ls_: update_ls_plot(model_dict) make_plots(model_dict) if total_num_steps > 5000: update_error_plot(model_dict) print(to_print_legend_string + " Plot updated") except: raise #pass print(to_print_legend_string) try: make_plots(model_dict) except: print()
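# The V/Q variant above one-hot encodes each sampled action with scatter_ before
# querying the critic for Q(s, a), and appends the per-step V and Q estimates for
# agent.update2. A standalone sketch of the encoding step; get_V_and_Q belongs to
# the script's own actor-critic and is not reproduced here:
import torch

def one_hot_actions(action_indices, num_actions):
    """action_indices: LongTensor [P, 1] -> FloatTensor [P, num_actions]."""
    one_hot = torch.zeros(action_indices.size(0), num_actions)
    one_hot.scatter_(1, action_indices, 1.0)
    return one_hot

if __name__ == '__main__':
    acts = torch.LongTensor([[0], [2], [1]])
    print(one_hot_actions(acts, 4))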
def train(model_dict): def update_current_state(current_state, state, channels): # current_state: [processes, channels*stack, height, width] state = torch.from_numpy( state).float() # (processes, channels, height, width) # if num_stack > 1: #first stack*channel-channel frames = last stack*channel-channel , so slide them forward current_state[:, :-channels] = current_state[:, channels:] current_state[:, -channels:] = state #last frame is now the new one return current_state def update_rewards(reward, done, final_rewards, episode_rewards, current_state): # Reward, Done: [P], [P] # final_rewards, episode_rewards: [P,1]. [P,1] # current_state: [P,C*S,H,W] reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() #[P,1] episode_rewards += reward #keeps track of current episode cumulative reward masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #[P,1] final_rewards *= masks #erase the ones that are done final_rewards += ( 1 - masks) * episode_rewards #set it to the cumulative episode reward episode_rewards *= masks #erase the done ones masks = masks.type(dtype) #cuda if current_state.dim() == 4: # if state is a frame/image current_state *= masks.unsqueeze(2).unsqueeze(2) #[P,1,1,1] else: current_state *= masks #restart the done ones, by setting the state to zero return reward, masks, final_rewards, episode_rewards, current_state num_frames = model_dict['num_frames'] cuda = model_dict['cuda'] which_gpu = model_dict['which_gpu'] num_steps = model_dict['num_steps'] num_processes = model_dict['num_processes'] seed = model_dict['seed'] env_name = model_dict['env'] save_dir = model_dict['save_to'] num_stack = model_dict['num_stack'] algo = model_dict['algo'] save_interval = model_dict['save_interval'] log_interval = model_dict['log_interval'] save_params = model_dict['save_params'] vid_ = model_dict['vid_'] gif_ = model_dict['gif_'] ls_ = model_dict['ls_'] vae_ = model_dict['vae_'] os.environ['OMP_NUM_THREADS'] = '1' os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu) if cuda: torch.cuda.manual_seed(seed) dtype = torch.cuda.FloatTensor model_dict['dtype'] = dtype else: torch.manual_seed(seed) dtype = torch.FloatTensor model_dict['dtype'] = dtype # Create environments print(num_processes, 'processes') monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards') if not os.path.exists(monitor_rewards_dir): os.makedirs(monitor_rewards_dir) print('Made dir', monitor_rewards_dir) envs = SubprocVecEnv([ make_env(env_name, seed, i, monitor_rewards_dir) for i in range(num_processes) ]) if vid_: print('env for video') envs_video = make_env_monitor(env_name, save_dir) if gif_: print('env for gif') envs_gif = make_env_basic(env_name) if ls_: print('env for ls') envs_ls = make_env_basic(env_name) if vae_: print('env for vae') envs_vae = make_env_basic(env_name) obs_shape = envs.observation_space.shape # (channels, height, width) obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:] ) # (channels*stack, height, width) shape_dim0 = envs.observation_space.shape[0] #channels model_dict['obs_shape'] = obs_shape model_dict['shape_dim0'] = shape_dim0 next_state_pred_ = 0 model_dict['next_state_pred_'] = next_state_pred_ # Create agent if algo == 'a2c': agent = a2c(envs, model_dict) print('init a2c agent') elif algo == 'ppo': agent = ppo(envs, model_dict) print('init ppo agent') elif algo == 'a2c_minibatch': agent = a2c_minibatch(envs, model_dict) print('init a2c_minibatch agent') elif algo == 'a2c_list_rollout': agent = a2c_list_rollout(envs, model_dict) print('init a2c_list_rollout 
agent') elif algo == 'a2c_with_var': agent = a2c_with_var(envs, model_dict) print('init a2c_with_var agent') # elif algo == 'a2c_bin_mask': # agent = a2c_with_var(envs, model_dict) # print ('init a2c_with_var agent') # agent = model_dict['agent'](envs, model_dict) # #Load model # if args.load_path != '': # # agent.actor_critic = torch.load(os.path.join(args.load_path)) # agent.actor_critic = torch.load(args.load_path).cuda() # print ('loaded ', args.load_path) # see_reward_episode = 0 # if 'Montez' in env_name and see_reward_episode: # states_list = [[] for i in range(num_processes)] # view_reward_episode(model_dict=model_dict, frames=[]) # dfasddsf if vae_: vae = VAE() vae.cuda() # Init state state = envs.reset() # (processes, channels, height, width) current_state = torch.zeros( num_processes, *obs_shape) # (processes, channels*stack, height, width) current_state = update_current_state( current_state, state, shape_dim0).type(dtype) #add the new frame, remove oldest agent.insert_first_state( current_state ) #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step # These are used to compute average rewards for all processes. episode_rewards = torch.zeros( [num_processes, 1]) #keeps track of current episode cumulative reward final_rewards = torch.zeros([num_processes, 1]) num_updates = int(num_frames) // num_steps // num_processes save_interval_num_updates = int(save_interval / num_processes / num_steps) # prev_action = Variable(torch.zeros([num_processes, 1]).type(torch.LongTensor)).cuda() #Begin training # count =0 start = time.time() start2 = time.time() for j in range(num_updates): for step in range(num_steps): # Act, [P,1], [P], [P,1], [P] # value, action = agent.act(Variable(agent.rollouts.states[step], volatile=True)) state_pytorch = Variable(agent.rollouts.states[step]) value, action, action_log_probs, dist_entropy = agent.act( state_pytorch) #, volatile=True)) # if next_state_pred_: # next_state_prediction = agent.actor_critic.predict_next_state2(state_pytorch, prev_action) # next_state_prediction = 0 # print (action_log_probs.size()) # print (dist_entropy.size()) # prev_action = action # print (next_state_prediction.size()) # [P,1,84,84] # fasd cpu_actions = action.data.squeeze(1).cpu().numpy() #[P] # cpu_actions = action.data.cpu().numpy() #[P] # print (actions.size()) # Step, S:[P,C,H,W], R:[P], D:[P] state, reward, done, info = envs.step(cpu_actions) reward_numpy = reward # Record rewards and update state reward, masks, final_rewards, episode_rewards, current_state = update_rewards( reward, done, final_rewards, episode_rewards, current_state) current_state = update_current_state(current_state, state, shape_dim0) # Agent record step # agent.insert_data(step, current_state, action.data, value.data, reward, masks, action_log_probs.data, dist_entropy.data) if next_state_pred_: agent.insert_data(step, current_state, action.data, value, reward, masks, action_log_probs, dist_entropy, next_state_prediction) #, done) agent.rollouts.insert_state_pred(next_state_prediction) else: agent.insert_data(step, current_state, action.data, value, reward, masks, action_log_probs, dist_entropy, 0) #, done) # if 'Montez' in env_name and see_reward_episode: # for state_i in range(len(state)): # if done[state_i]: # states_list[state_i] = [] # else: # states_list[state_i].append(np.squeeze(state[state_i])) # # print (state[state_i].shape) # # fasdf # # print (reward) # if reward_numpy[state_i] >0: # #plot the states of state_i # print (len(states_list[state_i])) # # 
view_reward_episode(model_dict=model_dict, frames=states_list[state_i][len(states_list[state_i])-100:]) # # view_reward_episode(model_dict=model_dict, frames=states_list[state_i][len(states_list[state_i])-100:]) # view_reward_episode(model_dict=model_dict, frames=states_list[state_i]) # fadsa # # and np.sum(agent.rollouts.rewards.cpu().numpy()) > 0 # # print (np.sum(agent.rollouts.rewards.cpu().numpy())) # # print (j) #Optimize agent agent.update() #agent.update(j,num_updates) batch = agent.rollouts.states # print (batch.size()) # [Steps+1,Processes,Stack,84,84] # remove first state since its repeated, its the last state of last episode # take the first state of the stack for each step #reshape to [P*S,84,84] batch = batch[1:] # [Steps,Processes,Stack,84,84] batch = batch[:, :, 0] # [Steps,Processes,84,84] batch = batch.contiguous().view(-1, 84, 84) # [Steps*Processes,84,84] # print (batch.size()) # fadsa # print (vae) elbo = vae.update(batch) agent.insert_first_state(agent.rollouts.states[-1]) # print (agent.state_pred_error.data.cpu().numpy()) # print ('save_interval_num_updates', save_interval_num_updates) # print ('num_updates', num_updates) # print ('j', j) total_num_steps = (j + 1) * num_processes * num_steps # if total_num_steps % save_interval == 0 and save_dir != "": if j % save_interval_num_updates == 0 and save_dir != "" and j != 0: #Save model if save_params: do_params(save_dir, agent, total_num_steps, model_dict) #make video if vid_: do_vid(envs_video, update_current_state, shape_dim0, dtype, agent, model_dict, total_num_steps) #make gif if gif_: do_gifs(envs_gif, agent, model_dict, update_current_state, update_rewards, total_num_steps) #make vae prob gif if vae_: do_prob_state(envs_vae, agent, model_dict, vae, update_current_state, total_num_steps) #Print updates if j % log_interval == 0: # and j!=0: end = time.time() to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.2f}".format( j, total_num_steps, final_rewards.min(), final_rewards.median(), final_rewards.mean(), final_rewards.max(), int(total_num_steps / (end - start)), end - start, end - start2) elbo = "{:.2f}".format(elbo.data.cpu().numpy()[0]) if next_state_pred_: state_pred_error_print = "{:.2f}".format( agent.state_pred_error.data.cpu().numpy()[0]) print(to_print_info_string + ' ' + state_pred_error_print + ' ' + elbo) to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, total_T, step_T, pred_error, elbo" else: print(to_print_info_string + ' ' + elbo) to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, total_T, step_T, elbo" start2 = time.time() if j % (log_interval * 30) == 0: if ls_: do_ls(envs_ls, agent, model_dict, total_num_steps, update_current_state, update_rewards) # print("Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated, LS updated") # print(to_print_info_string + ' LS recorded')#, agent.current_lr) # else: #update plots try: if ls_: update_ls_plot(model_dict) make_plots(model_dict) print(to_print_legend_string + " Plot updated") except: raise #pass print(to_print_legend_string) try: make_plots(model_dict) except: print()
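# Before each VAE update, the variant above turns the rollout states
# [steps+1, processes, stack, 84, 84] into a flat image batch: drop the first state
# (it is carried over from the previous rollout), keep a single frame of each stack,
# and merge the time and process dimensions. A minimal sketch of that reshape:
import torch

def rollout_to_batch(states):
    """states: [T+1, P, S, H, W] -> [T*P, H, W]."""
    batch = states[1:]       # drop the repeated first state
    batch = batch[:, :, 0]   # one frame of the stack per step
    return batch.contiguous().view(-1, batch.size(-2), batch.size(-1))

if __name__ == '__main__':
    x = torch.zeros(6, 4, 2, 84, 84)
    print(rollout_to_batch(x).shape)  # torch.Size([20, 84, 84])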
def train(model_dict): def update_current_state(current_state, state, channels): # current_state: [processes, channels*stack, height, width] state = torch.from_numpy(state).float() # (processes, channels, height, width) # if num_stack > 1: #first stack*channel-channel frames = last stack*channel-channel , so slide them forward current_state[:, :-channels] = current_state[:, channels:] current_state[:, -channels:] = state #last frame is now the new one return current_state def update_rewards(reward, done, final_rewards, episode_rewards, current_state): # Reward, Done: [P], [P] # final_rewards, episode_rewards: [P,1]. [P,1] # current_state: [P,C*S,H,W] reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() #[P,1] episode_rewards += reward #keeps track of current episode cumulative reward masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #[P,1] final_rewards *= masks #erase the ones that are done final_rewards += (1 - masks) * episode_rewards #set it to the cumulative episode reward episode_rewards *= masks #erase the done ones masks = masks.type(dtype) #cuda if current_state.dim() == 4: # if state is a frame/image current_state *= masks.unsqueeze(2).unsqueeze(2) #[P,1,1,1] else: current_state *= masks #restart the done ones, by setting the state to zero return reward, masks, final_rewards, episode_rewards, current_state num_frames = model_dict['num_frames'] cuda = model_dict['cuda'] which_gpu = model_dict['which_gpu'] num_steps = model_dict['num_steps'] num_processes = model_dict['num_processes'] seed = model_dict['seed'] env_name = model_dict['env'] save_dir = model_dict['save_to'] num_stack = model_dict['num_stack'] algo = model_dict['algo'] save_interval = model_dict['save_interval'] log_interval = model_dict['log_interval'] save_params = model_dict['save_params'] vid_ = model_dict['vid_'] gif_ = model_dict['gif_'] os.environ['OMP_NUM_THREADS'] = '1' os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu) if cuda: torch.cuda.manual_seed(seed) dtype = torch.cuda.FloatTensor model_dict['dtype']=dtype else: torch.manual_seed(seed) dtype = torch.FloatTensor model_dict['dtype']=dtype # Create environments print (num_processes, 'processes') monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards') if not os.path.exists(monitor_rewards_dir): os.makedirs(monitor_rewards_dir) print ('Made dir', monitor_rewards_dir) envs = SubprocVecEnv([make_env(env_name, seed, i, monitor_rewards_dir) for i in range(num_processes)]) if vid_: print ('env for video') envs_video = make_env_monitor(env_name, save_dir) if gif_: print ('env for gif') envs_gif = make_env_basic(env_name) obs_shape = envs.observation_space.shape # (channels, height, width) obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:]) # (channels*stack, height, width) shape_dim0 = envs.observation_space.shape[0] #channels model_dict['obs_shape']=obs_shape model_dict['shape_dim0']=shape_dim0 # Create agent if algo == 'a2c': agent = a2c(envs, model_dict) print ('init a2c agent') elif algo == 'ppo': agent = ppo(envs, model_dict) print ('init ppo agent') elif algo == 'a2c_minibatch': agent = a2c_minibatch(envs, model_dict) print ('init a2c_minibatch agent') elif algo == 'a2c_list_rollout': agent = a2c_list_rollout(envs, model_dict) print ('init a2c_list_rollout agent') elif algo == 'a2c_with_var': agent = a2c_with_var(envs, model_dict) print ('init a2c_with_var agent') # elif algo == 'a2c_bin_mask': # agent = a2c_with_var(envs, model_dict) # print ('init a2c_with_var agent') # agent = model_dict['agent'](envs, 
model_dict) # #Load model # if args.load_path != '': # # agent.actor_critic = torch.load(os.path.join(args.load_path)) # agent.actor_critic = torch.load(args.load_path).cuda() # print ('loaded ', args.load_path) # Init state state = envs.reset() # (processes, channels, height, width) current_state = torch.zeros(num_processes, *obs_shape) # (processes, channels*stack, height, width) current_state = update_current_state(current_state, state, shape_dim0).type(dtype) #add the new frame, remove oldest agent.insert_first_state(current_state) #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step # These are used to compute average rewards for all processes. episode_rewards = torch.zeros([num_processes, 1]) #keeps track of current episode cumulative reward final_rewards = torch.zeros([num_processes, 1]) num_updates = int(num_frames) // num_steps // num_processes #Begin training # count =0 start = time.time() for j in range(num_updates): for step in range(num_steps): # Act, [P,1], [P], [P,1], [P] # value, action = agent.act(Variable(agent.rollouts.states[step], volatile=True)) value, action, action_log_probs, dist_entropy = agent.act(Variable(agent.rollouts.states[step]))#, volatile=True)) # print (action_log_probs.size()) # print (dist_entropy.size()) cpu_actions = action.data.squeeze(1).cpu().numpy() #[P] # cpu_actions = action.data.cpu().numpy() #[P] # print (actions.size()) # Step, S:[P,C,H,W], R:[P], D:[P] state, reward, done, info = envs.step(cpu_actions) # Record rewards and update state reward, masks, final_rewards, episode_rewards, current_state = update_rewards(reward, done, final_rewards, episode_rewards, current_state) current_state = update_current_state(current_state, state, shape_dim0) # Agent record step # agent.insert_data(step, current_state, action.data, value.data, reward, masks, action_log_probs.data, dist_entropy.data) agent.insert_data(step, current_state, action.data, value, reward, masks, action_log_probs, dist_entropy, done) #Optimize agent agent.update() #agent.update(j,num_updates) agent.insert_first_state(agent.rollouts.states[-1]) total_num_steps = (j + 1) * num_processes * num_steps if total_num_steps % save_interval == 0 and save_dir != "": #Save model if save_params: do_params(save_dir, agent, total_num_steps, model_dict) #make video if vid_: do_vid(envs_video, update_current_state, shape_dim0, dtype, agent, model_dict, total_num_steps) #make gif if gif_: do_gifs(envs_gif, agent, model_dict, update_current_state, update_rewards, total_num_steps) #Print updates if j % log_interval == 0: end = time.time() if j % (log_interval*30) == 0: #update plots try: make_plots(model_dict) print("Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated") except: # raise print("Upts, n_timesteps, min/med/mean/max, FPS, Time") print("{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}". format(j, total_num_steps, final_rewards.min(), final_rewards.median(), final_rewards.mean(), final_rewards.max(), int(total_num_steps / (end - start)), end - start))#, agent.current_lr) try: make_plots(model_dict) except: print ()
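# update_rewards in the variant above does the per-process episode bookkeeping shared
# by every train() here: accumulate step rewards into episode_rewards and, whenever a
# process reports done, copy its cumulative return into final_rewards and reset it via
# a 0/1 mask. A minimal sketch of that logic:
import numpy as np
import torch

def episode_bookkeeping(reward, done, episode_rewards, final_rewards):
    """reward/done: length-P sequences; episode_rewards/final_rewards: [P, 1] tensors."""
    reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
    masks = torch.FloatTensor([[0.0] if d else [1.0] for d in done])
    episode_rewards += reward
    final_rewards = final_rewards * masks + (1 - masks) * episode_rewards
    episode_rewards *= masks
    return reward, masks, final_rewards, episode_rewards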
def train(model_dict): def update_current_state(current_state, state, channels): # current_state: [processes, channels*stack, height, width] state = torch.from_numpy( state).float() # (processes, channels, height, width) # if num_stack > 1: #first stack*channel-channel frames = last stack*channel-channel , so slide them forward current_state[:, :-channels] = current_state[:, channels:] current_state[:, -channels:] = state #last frame is now the new one # if see_frames: # #Grayscale # save_frame(state, count) # count+=1 # if done[0]: # ffsdfa # #RGB # state = envs.render() # print(state.shape) # fdsafa return current_state def update_rewards(reward, done, final_rewards, episode_rewards, current_state): # Reward, Done: [P], [P] # final_rewards, episode_rewards: [P,1]. [P,1] # current_state: [P,C*S,H,W] reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() #[P,1] episode_rewards += reward #keeps track of current episode cumulative reward masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #[P,1] final_rewards *= masks #erase the ones that are done final_rewards += ( 1 - masks) * episode_rewards #set it to the cumulative episode reward episode_rewards *= masks #erase the done ones masks = masks.type(dtype) #cuda if current_state.dim() == 4: # if state is a frame/image current_state *= masks.unsqueeze(2).unsqueeze(2) #[P,1,1,1] else: current_state *= masks #restart the done ones, by setting the state to zero return reward, masks, final_rewards, episode_rewards, current_state def do_vid(): n_vids = 3 for i in range(n_vids): done = False state = envs_video.reset() # state = torch.from_numpy(state).float().type(dtype) current_state = torch.zeros(1, *obs_shape) current_state = update_current_state(current_state, state, shape_dim0).type(dtype) # print ('Recording') # count=0 while not done: # print (count) # count +=1 # Act state_var = Variable(current_state, volatile=True) # print (state_var.size()) action, value = agent.act(state_var) cpu_actions = action.data.squeeze(1).cpu().numpy() # Observe reward and next state state, reward, done, info = envs_video.step( cpu_actions) # state:[nProcesss, ndims, height, width] # state = torch.from_numpy(state).float().type(dtype) # current_state = torch.zeros(1, *obs_shape) current_state = update_current_state(current_state, state, shape_dim0).type(dtype) state = envs_video.reset() vid_path = save_dir + '/videos/' count = 0 for aaa in os.listdir(vid_path): if 'openaigym' in aaa and '.mp4' in aaa: #os.rename(vid_path+aaa, vid_path+'vid_t'+str(total_num_steps)+'.mp4') subprocess.call("(cd " + vid_path + " && mv " + vid_path + aaa + " " + vid_path + env_name + '_' + algo + '_vid_t' + str(total_num_steps) + '_' + str(count) + ".mp4)", shell=True) count += 1 if '.json' in aaa: os.remove(vid_path + aaa) def save_frame(state, count): frame_path = save_dir + '/frames/' if not os.path.exists(frame_path): os.makedirs(frame_path) print('Made dir', frame_path) state1 = np.squeeze(state[0]) # print (state1.shape) fig = plt.figure(figsize=(4, 4), facecolor='white') plt.imshow(state1, cmap='gray') plt.savefig(frame_path + 'frame' + str(count) + '.png') print('saved', frame_path + 'frame' + str(count) + '.png') plt.close(fig) num_frames = model_dict['num_frames'] cuda = model_dict['cuda'] which_gpu = model_dict['which_gpu'] num_steps = model_dict['num_steps'] num_processes = model_dict['num_processes'] seed = model_dict['seed'] env_name = model_dict['env'] save_dir = model_dict['save_to'] num_stack = model_dict['num_stack'] algo = 
model_dict['algo'] save_interval = model_dict['save_interval'] log_interval = model_dict['log_interval'] os.environ['OMP_NUM_THREADS'] = '1' os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu) if cuda: torch.cuda.manual_seed(seed) dtype = torch.cuda.FloatTensor else: torch.manual_seed(seed) dtype = torch.FloatTensor # Create environments print(num_processes, 'processes') monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards') if not os.path.exists(monitor_rewards_dir): os.makedirs(monitor_rewards_dir) print('Made dir', monitor_rewards_dir) envs = SubprocVecEnv([ make_env(env_name, seed, i, monitor_rewards_dir) for i in range(num_processes) ]) vid_ = 1 see_frames = 0 if vid_: print('env for video') envs_video = make_env_monitor(env_name, save_dir) obs_shape = envs.observation_space.shape # (channels, height, width) obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:] ) # (channels*stack, height, width) shape_dim0 = envs.observation_space.shape[0] #channels model_dict['obs_shape'] = obs_shape # Create agent if algo == 'a2c': agent = a2c(envs, model_dict) print('init a2c agent') elif algo == 'ppo': agent = ppo(envs, model_dict) print('init ppo agent') elif algo == 'a2c_minibatch': agent = a2c_minibatch(envs, model_dict) print('init a2c_minibatch agent') # agent = model_dict['agent'](envs, model_dict) # #Load model # if args.load_path != '': # # agent.actor_critic = torch.load(os.path.join(args.load_path)) # agent.actor_critic = torch.load(args.load_path).cuda() # print ('loaded ', args.load_path) # Init state state = envs.reset() # (processes, channels, height, width) current_state = torch.zeros( num_processes, *obs_shape) # (processes, channels*stack, height, width) current_state = update_current_state( current_state, state, shape_dim0).type(dtype) #add the new frame, remove oldest agent.insert_first_state( current_state ) #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step # These are used to compute average rewards for all processes. 
episode_rewards = torch.zeros( [num_processes, 1]) #keeps track of current episode cumulative reward final_rewards = torch.zeros([num_processes, 1]) num_updates = int(num_frames) // num_steps // num_processes #Begin training # count =0 start = time.time() for j in range(num_updates): for step in range(num_steps): # Act, [P,1], [P,1] action, value = agent.act( Variable(agent.rollouts.states[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy() #[P] # Step, S:[P,C,H,W], R:[P], D:[P] state, reward, done, info = envs.step(cpu_actions) # Record rewards reward, masks, final_rewards, episode_rewards, current_state = update_rewards( reward, done, final_rewards, episode_rewards, current_state) # Update state current_state = update_current_state(current_state, state, shape_dim0) # Agent record step agent.insert_data(step, current_state, action.data, value.data, reward, masks) #Optimize agent agent.update() #agent.update(j,num_updates) agent.insert_first_state(agent.rollouts.states[-1]) total_num_steps = (j + 1) * num_processes * num_steps #Save model if total_num_steps % save_interval == 0 and save_dir != "": save_path = os.path.join(save_dir, 'model_params') try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = agent.actor_critic if cuda: save_model = copy.deepcopy(agent.actor_critic).cpu() # torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) # steps_sci_nota = '{e}'.format(total_num_steps) save_to = os.path.join( save_path, "model_params" + str(total_num_steps) + ".pt") # save_to=os.path.join(save_path, "model_params" + steps_sci_nota+".pt") torch.save(save_model, save_to) print('saved', save_to) #make video if vid_: do_vid() #Print updates if j % log_interval == 0: end = time.time() if j % (log_interval * 30) == 0: #update plots try: make_plots(model_dict) print( "Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated" ) except: raise print("Upts, n_timesteps, min/med/mean/max, FPS, Time") print("{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}".format( j, total_num_steps, final_rewards.min(), final_rewards.median(), final_rewards.mean(), final_rewards.max(), int(total_num_steps / (end - start)), end - start)) #, agent.current_lr) try: make_plots(model_dict) except: print()
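# The checkpoint branch above deep-copies the actor-critic to CPU when CUDA is in use,
# so the saved file can be loaded on a machine without a GPU, and names the file after
# the step count. A minimal sketch of that pattern; model and save_dir are placeholders:
import copy
import os
import torch

def save_cpu_checkpoint(model, save_dir, step, use_cuda):
    os.makedirs(save_dir, exist_ok=True)
    to_save = copy.deepcopy(model).cpu() if use_cuda else model
    path = os.path.join(save_dir, 'model_params{}.pt'.format(step))
    torch.save(to_save, path)  # saves the whole module object, as the script does
    return path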
def train(model_dict): def update_current_state(current_state, state, channels): # current_state: [processes, channels*stack, height, width] state = torch.from_numpy(state).float() # (processes, channels, height, width) # if num_stack > 1: #first stack*channel-channel frames = last stack*channel-channel , so slide them forward current_state[:, :-channels] = current_state[:, channels:] current_state[:, -channels:] = state #last frame is now the new one return current_state def update_rewards(reward, done, final_rewards, episode_rewards, current_state): # Reward, Done: [P], [P] # final_rewards, episode_rewards: [P,1]. [P,1] # current_state: [P,C*S,H,W] reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() #[P,1] episode_rewards += reward #keeps track of current episode cumulative reward masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #[P,1] final_rewards *= masks #erase the ones that are done final_rewards += (1 - masks) * episode_rewards #set it to the cumulative episode reward episode_rewards *= masks #erase the done ones masks = masks.type(dtype) #cuda if current_state.dim() == 4: # if state is a frame/image current_state *= masks.unsqueeze(2).unsqueeze(2) #[P,1,1,1] else: current_state *= masks #restart the done ones, by setting the state to zero return reward, masks, final_rewards, episode_rewards, current_state num_frames = model_dict['num_frames'] cuda = model_dict['cuda'] which_gpu = model_dict['which_gpu'] num_steps = model_dict['num_steps'] num_processes = model_dict['num_processes'] seed = model_dict['seed'] env_name = model_dict['env'] save_dir = model_dict['save_to'] num_stack = model_dict['num_stack'] algo = model_dict['algo'] save_interval = model_dict['save_interval'] log_interval = model_dict['log_interval'] save_params = model_dict['save_params'] vid_ = model_dict['vid_'] gif_ = model_dict['gif_'] ls_ = model_dict['ls_'] vae_ = model_dict['vae_'] grad_var_ = model_dict['grad_var_'] os.environ['OMP_NUM_THREADS'] = '1' os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu) if cuda: torch.cuda.manual_seed(seed) dtype = torch.cuda.FloatTensor model_dict['dtype']=dtype else: torch.manual_seed(seed) dtype = torch.FloatTensor model_dict['dtype']=dtype # Create environments print (num_processes, 'processes') monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards') if not os.path.exists(monitor_rewards_dir): os.makedirs(monitor_rewards_dir) print ('Made dir', monitor_rewards_dir) envs = SubprocVecEnv([make_env(env_name, seed, i, monitor_rewards_dir) for i in range(num_processes)]) if vid_: print ('env for video') envs_video = make_env_monitor(env_name, save_dir) if gif_: print ('env for gif') envs_gif = make_env_basic(env_name) # if ls_: # print ('env for ls') # envs_ls = make_env_basic(env_name) # if vae_: # print ('env for vae') # envs_vae = make_env_basic(env_name) # if grad_var_: # print ('env for grad_var_') # envs_grad_var = make_env_basic(env_name) obs_shape = envs.observation_space.shape # (channels, height, width) obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:]) # (channels*stack, height, width) shape_dim0 = envs.observation_space.shape[0] #channels model_dict['obs_shape']=obs_shape model_dict['shape_dim0']=shape_dim0 model_dict['action_size'] = envs.action_space.n print (envs.action_space.n, 'actions') # Create agent if algo == 'a2c': agent = a2c(envs, model_dict) print ('init a2c agent') elif algo == 'dqn': agent = DQN(envs, model_dict) print ('init DQN agent') print (agent.q_net) # Init state state = envs.reset() # 
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(current_state, state, shape_dim0).type(dtype)  # add the new frame, remove oldest, since its a stack
    # agent.insert_first_state(current_state)  # storage has states: (num_steps + 1, num_processes, *obs_shape), set first step

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1])  # keeps track of current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes
    save_interval_num_updates = int(save_interval / num_processes / num_steps)

    # dqn_epsilon = .1  # lower means less likely to do random  # .9  # .1
    epsilon_start = 1.0
    epsilon_final = 0.01
    epsilon_decay = 50000
    epsilon_by_frame = lambda frame_idx: epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

    # Begin training
    # count = 0
    start = time.time()
    start2 = time.time()
    for j in range(num_updates):

        dqn_epsilon = epsilon_by_frame(j)

        # Num steps till agent update
        # for step in range(num_steps):

        # Act, [P,1], [P,1], [P,1], [P]
        # state_pytorch = Variable(agent.rollouts.states[step])
        state_pytorch = Variable(current_state)
        # value, action, action_log_probs, dist_entropy = agent.act(state_pytorch, epsilon=dqn_epsilon)  # , volatile=True)
        action = agent.act(state_pytorch, epsilon=dqn_epsilon)  # , volatile=True)

        # Apply to Environment, S:[P,C,H,W], R:[P], D:[P]
        # cpu_actions = action.data.squeeze(1).cpu().numpy()  # [P]
        frame, reward, done, info = envs.step(action)

        # Record rewards and update state
        reward, masks, final_rewards, episode_rewards, current_state = update_rewards(
            reward, done, final_rewards, episode_rewards, current_state)
        new_current_state = update_current_state(current_state, frame, shape_dim0)

        agent.replay_buffer.push(current_state, action, reward, new_current_state, done.astype(int))

        current_state = new_current_state

        if len(agent.replay_buffer) > 100:
            agent.update()
            # agent.update()
            # agent.update()
            # agent.update()

        # print('save_interval_num_updates', save_interval_num_updates)
        # print('num_updates', num_updates)
        # print('j', j)
        total_num_steps = (j + 1) * num_processes * num_steps

        # if total_num_steps % save_interval == 0 and save_dir != "":
        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:

            # Save model
            if save_params:
                do_params(save_dir, agent, total_num_steps, model_dict)
            # Make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype, agent, model_dict, total_num_steps)
            # Make gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state, update_rewards, total_num_steps)
            # Make vae prob gif
            if vae_:
                do_prob_state(envs_vae, agent, model_dict, vae, update_current_state, total_num_steps)
            # # Make vae prob gif
            # if grad_var_:
            #     do_grad_var(envs_grad_var, agent, model_dict, update_current_state, total_num_steps)

        # Print updates
        if j % log_interval == 0:  # and j != 0:
            end = time.time()

            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.2f}, {:.2f}, {:.5f}".format(
                j, total_num_steps,
                final_rewards.min(), final_rewards.median(),
                final_rewards.mean(), final_rewards.max(),
                int(total_num_steps / (end - start)),
                end - start, end - start2,
                dqn_epsilon,
                agent.loss.data.cpu().numpy()[0])
                # torch.mean(discrim_errors).data.cpu().numpy()[0])

            print(to_print_info_string)

            # if vae_:
            #     elbo = "{:.2f}".format(elbo.data.cpu().numpy()[0])
            # if next_state_pred_:
            #     state_pred_error_print = "{:.2f}".format(agent.state_pred_error.data.cpu().numpy()[0])
            #     print(to_print_info_string + ' ' + state_pred_error_print + ' ' + elbo)
            #     to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, total_T, step_T, pred_error, elbo"
            # else:
            #     if vae_:
            #         print(to_print_info_string + ' ' + elbo)
            #     else:
            #         print(to_print_info_string)

            to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, total_T, step_T, discrim_E"  # , elbo"

            start2 = time.time()

            if j % (log_interval * 30) == 0:

                if ls_:
                    do_ls(envs_ls, agent, model_dict, total_num_steps, update_current_state, update_rewards)

                # if grad_var_ and j % (log_interval*300) == 0:
                if grad_var_ and j % (log_interval * 30) == 0:
                    # writes to file
                    do_grad_var(envs_grad_var, agent, model_dict, total_num_steps, update_current_state, update_rewards)

                # print("Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated, LS updated")
                # print(to_print_info_string + ' LS recorded')  # , agent.current_lr)
                # else:

                # Update plots
                try:
                    if ls_:
                        update_ls_plot(model_dict)
                    # if grad_var_ and j % (log_interval*300) == 0:
                    if grad_var_ and j % (log_interval * 30) == 0:
                        update_grad_plot(model_dict)
                        to_print_legend_string += ' grad_var_plot updated '
                    make_plots(model_dict)
                    print(to_print_legend_string + " Plot updated")
                    # print(len(agent.replay_buffer))
                except:
                    raise  # pass
                    # unreachable fallback kept from the original:
                    # print(to_print_legend_string + " problem with plot")

    try:
        make_plots(model_dict)
    except:
        print()
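# Illustrative sketch (not part of the training code): the exponential
# epsilon-decay schedule used by the DQN branch of train() above. The
# constants match the ones in train(); the printed frame indices are just
# example inputs.
def _example_epsilon_schedule():
    import math
    epsilon_start = 1.0
    epsilon_final = 0.01
    epsilon_decay = 50000
    epsilon_by_frame = lambda frame_idx: epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)
    for frame_idx in [0, 10000, 50000, 200000]:
        # 0 -> 1.0, 10000 -> ~0.82, 50000 -> ~0.37, 200000 -> ~0.03
        print(frame_idx, round(epsilon_by_frame(frame_idx), 3))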
def viz(model_dict):

    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(state).float()  # (processes, channels, height, width)
        # if num_stack > 1:
        # first stack*channel-channel frames = last stack*channel-channel, so slide them forward
        current_state[:, :-channels] = current_state[:, channels:]
        current_state[:, -channels:] = state  # last frame is now the new one
        return current_state

    def update_rewards(reward, done, final_rewards, episode_rewards, current_state):
        # Reward, Done: [P], [P]
        # final_rewards, episode_rewards: [P,1], [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()  # [P,1]
        episode_rewards += reward  # keeps track of current episode cumulative reward
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])  # [P,1]
        final_rewards *= masks  # erase the ones that are done
        final_rewards += (1 - masks) * episode_rewards  # set it to the cumulative episode reward
        episode_rewards *= masks  # erase the done ones
        masks = masks.type(dtype)  # cuda
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  # [P,1,1,1]
        else:
            current_state *= masks  # restart the done ones, by setting the state to zero
        return reward, masks, final_rewards, episode_rewards, current_state

    def do_vid():
        n_vids = 3
        for i in range(n_vids):
            done = False
            state = envs_video.reset()
            # state = torch.from_numpy(state).float().type(dtype)
            current_state = torch.zeros(1, *obs_shape)
            current_state = update_current_state(current_state, state, shape_dim0).type(dtype)
            # print('Recording')
            # count = 0
            while not done:
                # print(count)
                # count += 1
                # Act
                state_var = Variable(current_state, volatile=True)
                # print(state_var.size())
                action, value = agent.act(state_var)
                cpu_actions = action.data.squeeze(1).cpu().numpy()
                # Observe reward and next state
                state, reward, done, info = envs_video.step(cpu_actions)  # state: [nProcesss, ndims, height, width]
                # state = torch.from_numpy(state).float().type(dtype)
                # current_state = torch.zeros(1, *obs_shape)
                current_state = update_current_state(current_state, state, shape_dim0).type(dtype)
        state = envs_video.reset()

        vid_path = save_dir + '/videos/'
        count = 0
        for aaa in os.listdir(vid_path):
            if 'openaigym' in aaa and '.mp4' in aaa:
                # os.rename(vid_path+aaa, vid_path+'vid_t'+str(total_num_steps)+'.mp4')
                subprocess.call("(cd " + vid_path + " && mv " + vid_path + aaa + " " + vid_path + env_name + '_' + algo + '_vid_t' + str(total_num_steps) + '_' + str(count) + ".mp4)", shell=True)
                count += 1
            if '.json' in aaa:
                os.remove(vid_path + aaa)

    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    num_processes = 1
    model_dict['num_processes'] = 1

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor

    # Create environments
    print(num_processes, 'processes')
    # monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    # if not os.path.exists(monitor_rewards_dir):
    #     os.makedirs(monitor_rewards_dir)
    #     print('Made dir', monitor_rewards_dir)
    monitor_rewards_dir = ''
    envs = SubprocVecEnv([make_env(env_name, seed, i, monitor_rewards_dir) for i in range(num_processes)])

    vid_ = 0
    see_frames = 1

    if vid_:
        print('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  # channels

    model_dict['obs_shape'] = obs_shape

    # Create agent
    if algo == 'a2c':
        agent = a2c(envs, model_dict)
        print('init a2c agent')
    elif algo == 'ppo':
        agent = ppo(envs, model_dict)
        print('init ppo agent')
    elif algo == 'a2c_minibatch':
        agent = a2c_minibatch(envs, model_dict)
        print('init a2c_minibatch agent')
    # agent = model_dict['agent'](envs, model_dict)

    # Load model
    # if args.load_path != '':
    #     agent.actor_critic = torch.load(os.path.join(args.load_path))
    epoch_level = 1e6
    model_params_file = save_dir + '/model_params/model_params' + str(int(epoch_level)) + '.pt'
    agent.actor_critic = torch.load(model_params_file).cuda()
    print('loaded ', model_params_file)

    frame_path = save_dir + '/frames/'
    if not os.path.exists(frame_path):
        os.makedirs(frame_path)
        print('Made dir', frame_path)

    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(current_state, state, shape_dim0).type(dtype)  # add the new frame, remove oldest
    agent.insert_first_state(current_state)  # storage has states: (num_steps + 1, num_processes, *obs_shape), set first step

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1])  # keeps track of current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes

    # Begin training
    count = 0
    start = time.time()
    for j in range(num_updates):
        for step in range(num_steps):

            # if see_frames:
            #     # Grayscale
            #     save_frame(state, count)
            #     # RGB
            #     state = envs.render()
            #     print(state.shape)

            values = []
            actions = []
            for ii in range(100):
                # Act, [P,1], [P,1]
                action, value = agent.act(Variable(agent.rollouts.states[step], volatile=True))
                val = value.data.cpu().numpy()[0][0]
                act_ = action.data.cpu().numpy()[0][0]
                # print('value', val)
                # print('action', act_)
                values.append(val)
                actions.append(act_)
            # print('values', values)
            # print('actions', actions)

            rows = 1
            cols = 2
            fig = plt.figure(figsize=(8, 4), facecolor='white')

            # Plot frame
            ax = plt.subplot2grid((rows, cols), (0, 0), frameon=False)
            state1 = np.squeeze(state[0])
            ax.imshow(state1, cmap='gray')
            ax.set_xticks([])
            ax.set_yticks([])
            # ax.savefig(frame_path + 'frame' + str(count) + '.png')
            # print('saved', frame_path + 'frame' + str(count) + '.png')
            # plt.close(fig)

            # Plot values histogram
            ax = plt.subplot2grid((rows, cols), (0, 1), frameon=False)
            weights = np.ones_like(values) / float(len(values))
            ax.hist(values, 50, range=[0.0, 4.], weights=weights)
            # ax.set_ylim(top=1.)
            ax.set_ylim([0., 1.])

            plt_path = frame_path + 'plt'
            plt.savefig(plt_path + str(count) + '.png')
            print('saved', plt_path + str(count) + '.png')
            plt.close(fig)

            count += 1
            if count > 2:
                if done[0] or count > max_frames:  # max_frames is assumed to be defined elsewhere in this module
                    ffsdfa  # undefined name: intentionally raises NameError to halt after enough frames

            # action, value = agent.act(Variable(agent.rollouts.states[step], volatile=True))
            # print('value', value)
            # print('action', action)
            # action, value = agent.act(Variable(agent.rollouts.states[step], volatile=True))
            # print('value', value)
            # print('action', action)

            cpu_actions = action.data.squeeze(1).cpu().numpy()  # [P]

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions)

            # Record rewards
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(
                reward, done, final_rewards, episode_rewards, current_state)

            # Update state
            current_state = update_current_state(current_state, state, shape_dim0)

            # Agent record step
            agent.insert_data(step, current_state, action.data, value.data, reward, masks)

        # # Optimize agent
        # agent.update()  # agent.update(j, num_updates)
        # agent.insert_first_state(agent.rollouts.states[-1])

        total_num_steps = (j + 1) * num_processes * num_steps
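# Illustrative sketch (not part of the repo's entry points): a minimal
# model_dict containing only the keys viz() reads above. The values are
# placeholders; the environment name and save_to path are assumptions for
# the example, and save_to must contain model_params/model_params1000000.pt
# since viz() hard-codes epoch_level = 1e6.
def _example_viz_call():
    model_dict = {
        'num_frames': int(1e6),
        'cuda': False,
        'which_gpu': 0,
        'num_steps': 5,
        'num_processes': 1,               # viz() forces this to 1 anyway
        'seed': 1,
        'env': 'PongNoFrameskip-v4',      # placeholder env name
        'save_to': './results/pong_a2c',  # placeholder results directory
        'num_stack': 4,
        'algo': 'a2c',
        'save_interval': int(1e5),
        'log_interval': 10,
    }
    viz(model_dict)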