def __init__(self, args, achieved_trajectory_pool):
    self.args = args
    self.env = make_env(args)
    self.env_test = make_env(args)
    self.dim = np.prod(self.env.reset()['achieved_goal'].shape)
    self.delta = self.env.distance_threshold
    self.length = args.episodes
    init_goal = self.env.reset()['achieved_goal'].copy()
    self.pool = np.tile(init_goal[np.newaxis, :], [self.length, 1]) \
        + np.random.normal(0, self.delta, size=(self.length, self.dim))
    self.init_state = self.env.reset()['observation'].copy()
    self.match_lib = gcc_load_lib('learner/cost_flow.c')
    self.achieved_trajectory_pool = achieved_trajectory_pool
    if self.args.graph:
        self.graph = args.graph
        # estimate the graph diameter by sampling resets
        self.max_dis = 0
        for i in range(1000):
            obs = self.env.reset()
            dis = self.get_graph_goal_distance(obs['achieved_goal'], obs['desired_goal'])
            if dis > self.max_dis:
                self.max_dis = dis
def experiment_setup(args):
    if args.vae_dist_help:
        load_vaes(args)
    # Some env extensions use the distance estimator, so this load pairs with the
    # interval wrapper. TODO: use another approach?
    load_field_parameters(args)
    if args.dist_estimator_type is not None:
        temp_env = make_temp_env(args)
        load_dist_estimator(args, temp_env)
        del temp_env
    env = make_env(args)
    env_test = make_env(args)
    if args.goal_based:
        args.obs_dims = list(goal_based_process(env.reset()).shape)
        args.acts_dims = [env.action_space.shape[0]]
        args.compute_reward = env.compute_reward
        args.compute_distance = env.compute_distance
    if args.imaginary_obstacle_transitions:
        # keep this buffer relatively small so it always holds the most recent collisions
        args.imaginary_buffer = ReplayBuffer_Imaginary(args, buffer_size=args.im_buffer_size)
    args.buffer = buffer = ReplayBuffer_Episodic(args)
    args.learner = learner = create_learner(args)
    args.agent = agent = create_agent(args)
    args.logger.info('*** network initialization complete ***')
    args.tester = tester = Tester(args)
    args.logger.info('*** tester initialization complete ***')
    args.timesteps = env.env.env.spec.max_episode_steps
    return env, env_test, agent, buffer, learner, tester
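# --- Sketch only: a minimal driver loop for the setup function above. get_args(),
# --- args.epochs and args.cycles are assumptions about the surrounding codebase, and
# --- the learner/tester method names mirror conventions seen elsewhere in these
# --- snippets rather than a confirmed API.
if __name__ == '__main__':
    args = get_args()  # assumed CLI/config helper
    env, env_test, agent, buffer, learner, tester = experiment_setup(args)
    for epoch in range(args.epochs):
        for cycle in range(args.cycles):
            learner.learn(args, env, env_test, agent, buffer)
            tester.cycle_summary()
        tester.epoch_summary()
    tester.final_summary()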
def __init__(self, args, direct_playpath=None):
    # initialize environment
    self.args = args
    self.env = make_env(args)
    self.args.timesteps = self.env.env.env.spec.max_episode_steps
    self.env_test = make_env(args)
    self.info = []
    self.test_rollouts = 100

    # restore the current policy from disk (TF session + graph)
    if direct_playpath is not None:
        self.play_dir = direct_playpath
    else:
        self.play_dir = args.play_path
    self.play_epoch = args.play_epoch
    self.meta_path = "{}saved_policy-{}.meta".format(self.play_dir, self.play_epoch)
    self.sess = tf.Session()
    self.saver = tf.train.import_meta_graph(self.meta_path)
    self.saver.restore(self.sess, tf.train.latest_checkpoint(self.play_dir))
    graph = tf.get_default_graph()
    self.raw_obs_ph = graph.get_tensor_by_name("raw_obs_ph:0")
    self.pi = graph.get_tensor_by_name("main/policy/net/pi/Tanh:0")
    # tensors for Q values
    self.acts_ph = graph.get_tensor_by_name("acts_ph:0")
    self.q_pi = graph.get_tensor_by_name("main/value/net/q/BiasAdd:0")
def __init__(self, args):
    self.args = args
    self.env = make_env(args)
    self.env_test = make_env(args)
    self.env_List = []
    for i in range(args.episodes):
        self.env_List.append(make_env(args))
    self.achieved_trajectory_pool = TrajectoryPool(args, args.hgg_pool_size)
    self.sampler = MatchSampler(args, self.achieved_trajectory_pool)
    self.dynamic_buffer = dynamic_buffer(args.buffer_size * 100)  # needs fine-tuning here
    self.model_loss = []
    try:
        with open("./random_trajs/trajs_0.pkl", 'rb') as f:
            self.hist = pickle.load(f)
    except IOError:
        print("Historical trajectory file does not exist.")
    self.hist_trajs = {"eps": [], "obs": [], "goal": [], "action": []}
    self.current_trajs = {"eps": [], "obs": [], "goal": [], "action": []}
def __init__(self, args: HGGLearnerInput):
    self._args = args
    self.env = make_env(args)
    self.env_List = [make_env(args) for _ in range(args.episodes)]
    self.achieved_trajectory_pool = TrajectoryPool(args, args.hgg_pool_size)
    self.sampler = MatchSampler(args, self.achieved_trajectory_pool)
def __init__(self, args):
    self.args = args
    self.env = make_env(args)
    self.env_test = make_env(args)
    self.env_List = []
    for i in range(args.episodes):
        self.env_List.append(make_env(args))
    self.achieved_trajectory_pool = TrajectoryPool(args, args.hgg_pool_size)
    self.sampler = MatchSampler(args, self.achieved_trajectory_pool)
def __init__(self, args):
    self.args = args
    self.env = make_env(args)
    self.env_test = make_env(args)
    self.env_List = []
    for i in range(args.episodes):
        self.env_List.append(make_env(args))
    self.agent = create_agent(args)
    self.achieved_trajectory_pool = TrajectoryPool(args, args.hgg_pool_size)
    self.stop_hgg_threshold = self.args.stop_hgg_threshold
    self.stop = False
    self.learn_calls = 0
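# --- Sketch only: the stop_hgg_threshold / stop / learn_calls fields suggest the
# --- learner disables hindsight goal generation once the agent is reliable enough.
# --- The method below is an assumption about how that gate might look; the
# --- avg_success_rate input is not taken from the snippet.
def update_stop_flag(self, avg_success_rate):
    self.learn_calls += 1
    if not self.stop and avg_success_rate > self.stop_hgg_threshold:
        # from here on, sample goals from the environment instead of the HGG sampler
        self.stop = True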
def main(): print("#######") print( "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards" ) print("#######") os.environ['OMP_NUM_THREADS'] = '1' if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None envs = [ make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes) ] if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs) agent = VecEnvAgent(envs, args) agent.train_maml(num_updates)
def experiment_setup_test(args):
    if args.vae_dist_help:
        load_vaes(args)
        if args.vae_type == 'bbox':
            file_index_object = 'data/' + args.env + '/' + args.vae_type + '_obj_i.npy'
            file_indices_obstacle = 'data/' + args.env + '/' + args.vae_type + '_obstacles_indices.npy'
            args.obj_index = np.load(file_index_object)
            args.obstacles_indices = np.load(file_indices_obstacle)
    load_field_parameters(args)
    # if args.dist_estimator_type is not None:
    #     temp_env = make_temp_env(args)
    #     load_dist_estimator(args, temp_env)
    #     del temp_env
    env = make_env(args)
    if args.goal_based:
        args.obs_dims = list(goal_based_process(env.reset()).shape)
        args.acts_dims = [env.action_space.shape[0]]
        args.compute_reward = env.compute_reward
        args.compute_distance = env.compute_distance
    from play import Player
    args.agent = agent = Player(args)
    # fix the number of test rollouts for the after-training evaluation
    args.tester = tester = Tester(args, test_rollouts=100, after_train_test=True)
    args.timesteps = env.env.env.spec.max_episode_steps
    return env, agent, tester
def evaluate(config_filepath: str, model_filepath: str, render: bool):
    conf = load_toml_config(config_filepath)
    env = make_env(conf.environment, render=render)
    agent = getattr(agents, conf.agent_type)(env.observation_space.shape[0],
                                             env.action_space.shape[0], **conf.agent)
    ckpt = torch.load(model_filepath, map_location='cpu')
    agent.load_state_dict(ckpt['agent'])
    o = env.reset()
    if render:
        env.render()
    done = False
    episode_reward = 0
    t = 0
    while not done:
        h = agent.select_action(o, eval=True)
        a = agent.post_process_action(o, h)
        o_next, r, done, _ = env.step(a)
        episode_reward += r
        o = o_next
        t += 1
    print("STEPS: {}, REWARD: {}".format(t, episode_reward))
    input("OK? >")
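# --- Sketch only: a minimal invocation of the evaluation entry point above.
# --- The config and checkpoint paths are placeholders, not files from the snippet.
if __name__ == "__main__":
    evaluate(config_filepath="configs/example.toml",
             model_filepath="checkpoints/agent.pt",
             render=False)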
def main():
    if not args.resume and os.path.isdir(args.save_path):
        print("the save path already exists!")
        exit(0)
    setup_dirs(args)
    script_path = os.path.join(args.save_path, 'scripts')
    if not os.path.isdir(script_path):
        shutil.copytree('scripts', script_path)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)
    env = None  # placeholder
    if 'carla9' in args.env:
        # select CARLA v0.9.x as the platform
        env = create_carla9_env(args)
    elif 'carla8' in args.env:
        # select CARLA v0.8.x as the platform
        from envs.CARLA.carla_lib.carla.client import make_carla_client
        from envs.CARLA.carla_env import CarlaEnv
        client = make_carla_client('localhost', args.port, CARLA8_TIMEOUT)
        env = CarlaEnv(client, args)
    else:
        # select PyTorcs or GTA V as the platform,
        # basically inherited from SPC and not fully supported in IPC
        env = make_env(args)
    if args.eval:
        from evaluate import evaluate_policy
        evaluate_policy(args, env)
    else:
        from train import train_policy
        train_policy(args, env)
def __init__(self, args):
    self.args = args
    self.device = torch.device('cuda') if args.cuda else torch.device('cpu')
    dummy_env = gym.make(self.args.env_name)
    self.actor = ACNet(dummy_env.action_space.n, args.feedforward)
    del dummy_env
    if args.load_dir is not None:
        actorState = torch.load(args.load_dir, map_location=lambda storage, loc: storage)
        if args.continue_training:
            self.actor.load_state_dict(actorState)
            print("Loaded pretrained model successfully")
        if args.transfer:
            self.actor.load_autoturn_model(actorState)
    if args.cuda:
        self.actor.cuda()
    self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=self.args.lr)
    self.env_list = [make_env(self.args.env_name, self.args.seed, i)
                     for i in range(self.args.num_processes)]
    if self.args.num_processes > 1:
        self.envs = gym_vecenv.SubprocVecEnv(self.env_list)
    else:
        self.envs = gym_vecenv.DummyVecEnv(self.env_list)
    if len(self.envs.observation_space.shape) == 1:
        self.envs = gym_vecenv.VecNormalize(self.envs)
    self.obs_shape = self.envs.observation_space.shape
    self.obs_shape = (self.obs_shape[0] * args.num_stack, *self.obs_shape[1:])
    self.state_shape = 1 if args.feedforward else 256
    self.rollouts = RolloutStorage(self.args.num_fwd_steps, self.args.num_processes,
                                   self.obs_shape, self.envs.action_space, self.state_shape)
    self.num_updates = int(args.num_frames) // args.num_fwd_steps // args.num_processes
    self.current_obs = torch.zeros(self.args.num_processes, *self.obs_shape)
    self.writer = SummaryWriter(log_dir=self.args.save_dir)
    self.fortress_threshold = 650
    self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        self.actor_optimizer, mode='max', factor=0.2, patience=15,
        verbose=True, threshold=1e-3, threshold_mode='rel')
def __init__(self):
    # Create an instance of the network itself, as well as the memory.
    # This is also a good place to set environment parameters and
    # training parameters: number of episodes / iterations, etc.
    args.log_dir = args.log_dir + args.env_name + '_' + args.algo
    try:
        os.makedirs(args.log_dir)
    except OSError:
        files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv'))
        for f in files:
            os.remove(f)
    envs = [make_env(args.env_name, args.seed, i, args.log_dir)
            for i in range(args.num_processes)]
    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)
    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)
    self.environment_name = args.env_name
    self.agent = VecEnvAgent(envs, args)
def eval_proc(file_name):
    print(file_name)
    f = open(os.path.join('./log_more', file_name), 'w+')
    types = ['RANDOM', 'RHCP', 'CDQN', 'MCT']
    # for role_id in [2, 3, 1]:
    #     for ta in types:
    #         agent = make_agent(ta, role_id)
    #         for i in range(1):
    #             env = make_env('MCT')
    #             st = StatCounter()
    #             for j in tqdm(range(100)):
    #                 winning_rate = eval_episode(env, agent)
    #                 st.feed(winning_rate)
    #             f.write('%s with role id %d against %s, winning rate: %f\n'
    #                     % (ta, role_id, 'MCT', st.average))
    for role_id in [2, 3, 1]:
        agent = make_agent('MCT', role_id)
        for i in range(1):
            for te in types:
                env = make_env(te)
                st = StatCounter()
                for j in tqdm(range(100)):
                    winning_rate = eval_episode(env, agent)
                    st.feed(winning_rate)
                f.write('%s with role id %d against %s, winning rate: %f\n'
                        % ('MCT', role_id, te, st.average))
    f.close()
def experiment_setup(args):
    env = make_env(args)
    env_test = make_env(args)
    if args.goal_based:
        args.obs_dims = list(goal_based_process(env.reset()).shape)
        args.acts_dims = [env.action_space.shape[0]]
        args.compute_reward = env.compute_reward
        args.compute_distance = env.compute_distance
    args.buffer = buffer = ReplayBuffer_Episodic(args)
    args.learner = learner = create_learner(args)
    args.agent = agent = create_agent(args)
    args.logger.info('*** network initialization complete ***')
    args.tester = tester = Tester(args)
    args.logger.info('*** tester initialization complete ***')
    return env, env_test, agent, buffer, learner, tester
def onlytest():
    # env = make_env('RHCP')
    # env = make_env('RANDOM')
    # env = make_env('MCT')
    env = make_env('CDQN')
    agent = make_agent('RANDOM', 1)
    eval_episode(env, agent)
def __init__(self, args):
    # initialize environment
    self.args = args
    self.env = make_env(args)
    self.args.timesteps = self.env.max_episode_steps
    self.env_test = make_env(args)
    self.info = []
    self.test_rollouts = 100

    # restore the current policy from disk (TF session + graph)
    self.play_dir = args.play_path
    self.play_epoch = args.play_epoch
    self.meta_path = os.path.join(self.play_dir, "saved_policy-{}.meta".format(self.play_epoch))
    self.sess = tf.Session()
    self.saver = tf.train.import_meta_graph(self.meta_path)
    self.saver.restore(self.sess, tf.train.latest_checkpoint(self.play_dir))
    graph = tf.get_default_graph()
    self.raw_obs_ph = graph.get_tensor_by_name("raw_obs_ph:0")
    self.pi = graph.get_tensor_by_name("main/policy/net/pi/Tanh:0")
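# --- Sketch only: once restored, the policy tensor can be evaluated directly through
# --- the session. `player` is assumed to be an instance of the class above, and
# --- goal_based_process is the assumed observation preprocessing used at training time.
obs = player.env.reset()
for _ in range(player.args.timesteps):
    feed = {player.raw_obs_ph: [goal_based_process(obs)]}
    action = player.sess.run(player.pi, feed_dict=feed)[0]
    obs, reward, done, info = player.env.step(action)
    if done:
        break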
def enjoy():
    env = make_env(args.env_name, args.seed, 0, True)
    env = DummyVecEnv([env])
    actor_critic, ob_rms = torch.load(os.path.join(save_path, args.env_name + ".pt"))
    render_func = env.envs[0].render
    obs_shape = env.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])
    current_obs = torch.zeros(1, *obs_shape)
    states = torch.zeros(1, actor_critic.state_size)
    masks = torch.zeros(1, 1)

    def update_current_obs(obs):
        shape_dim0 = env.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    render_func('human')
    obs = env.reset()
    update_current_obs(obs)
    while True:
        value, action, _, states = actor_critic.act(Variable(current_obs, volatile=True),
                                                    Variable(states, volatile=True),
                                                    Variable(masks, volatile=True),
                                                    deterministic=True)
        states = states.data
        cpu_actions = action.data.squeeze(1).cpu().numpy()
        # Observation, reward and next obs
        obs, reward, done, _ = env.step(cpu_actions)
        time.sleep(0.05)
        masks.fill_(0.0 if done else 1.0)
        if current_obs.dim() == 4:
            current_obs *= masks.unsqueeze(2).unsqueeze(2)
        else:
            current_obs *= masks
        update_current_obs(obs)
        renderer = render_func('human')
        if not renderer.window:
            sys.exit(0)
def __init__(self, args):
    self.args = args
    self.env = make_env(args)
    self.env_test = make_env(args)
    self.info = []
    if args.save_acc:
        make_dir('log/accs', clear=False)
        self.test_rollouts = 100
        self.env_List = []
        self.env_test_List = []
        for _ in range(self.test_rollouts):
            self.env_List.append(make_env(args))
            self.env_test_List.append(make_env(args))
        self.acc_record = {}
        self.acc_record[self.args.goal] = []
        for key in self.acc_record.keys():
            self.info.append('Success/' + key + '@blue')
def __init__(self, individuals):
    self.render = False
    self.envs = [make_env("PommeFFAPartialFast-v0", 1, i, './tmp/gym/', False, False)
                 for i in range(individuals)]
    # self.envs = make_vec_envs("PommeFFAPartialFast-v0", 1, individuals, 0.99, False, 1,
    #                           './tmp/gym/', False, torch.device("cpu"), allow_early_resets=False)
    # self.env = cloudpickle.dumps(make_env("PommeFFAPartialFast-v0"))
    # if individuals == 1:
    #     self.env = make_env("PommeFFAPartialFast-v0")
    # else:
    #     self.env = [make_env("PommeFFAPartialFast-v0") for _ in range(individuals)]
    self.train = True
def experiment_setup(args):
    env = make_env(args)
    args.acts_dims = env.acts_dims
    args.obs_dims = env.obs_dims
    args.buffer = buffer = create_buffer(args)
    args.agent = agent = create_agent(args)
    args.agent_graph = agent.graph
    args.learner = learner = create_learner(args)
    args.logger.info('*** network initialization complete ***')
    args.tester = tester = Tester(args)
    args.logger.info('*** tester initialization complete ***')
    return env, agent, buffer, learner, tester
def __init__(self, args, test_rollouts=100, after_train_test=False):
    self.args = args
    self.env = make_env(args)
    self.env_test = make_env(args)
    self.info = []
    self.calls = 0
    self.after_train_test = after_train_test
    if args.save_acc:
        make_dir('log/accs', clear=False)
        self.test_rollouts = test_rollouts
        self.env_List = []
        for _ in range(self.test_rollouts):
            self.env_List.append(make_env(args))
        self.acc_record = {}
        self.acc_record[self.args.goal] = []
        for key in self.acc_record.keys():
            self.info.append('Success' + '@blue')
            self.info.append('MaxDistance')
            self.info.append('MinDistance')
    self.coll_tol = 0  # this attribute is only used for tests after training
def make_envs(progress, ob_rms):
    envs = [make_env(changefun(env, progress), seed, i, log_dir, add_timestep)
            for i in range(num_processes)]
    if num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)
    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs, gamma=gamma)
        if ob_rms is not None:
            envs.ob_rms = ob_rms
    return envs
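# --- Sketch only: because make_envs accepts the previous ob_rms, the running
# --- observation-normalization statistics can survive a curriculum change. This
# --- assumes the earlier envs object is a VecNormalize wrapper exposing ob_rms;
# --- the progress values are placeholders.
old_envs = make_envs(progress=0.0, ob_rms=None)
ob_rms = getattr(old_envs, 'ob_rms', None)  # present only when VecNormalize wrapped the envs
old_envs.close()
new_envs = make_envs(progress=0.5, ob_rms=ob_rms)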
def register_and_create_envs(id_tmp_dir, seed, environment, algorithm):
    """
    Args:
        id_tmp_dir (str): Working directory.
    All other args are automatically provided by sacred.
    """
    if environment['entry_point']:
        try:
            register(id=environment['name'],
                     entry_point=environment['entry_point'],
                     kwargs=environment['config'],
                     max_episode_steps=environment['max_episode_steps'])
        except Exception:
            pass
    num_envs = algorithm['num_processes']
    num_expert_envs = algorithm['num_expert_processes']
    envs = [make_env(environment['name'], seed, i, id_tmp_dir,
                     occlusion=list(environment['occlusion']),
                     sensor_noise=float(environment['sensor_noise']))
            for i in range(num_envs - num_expert_envs)]
    # Create "expert environments" which only replay trajectories from an expert database,
    # rather than interacting with Gym. See expert_envs.py for the exposed API.
    # These environments are appended to the list of Gym environments.
    if num_expert_envs:
        e_envs = [make_expert_envs(environment['name'], seed, i, num_expert_envs,
                                   environment['expert_db'])
                  for i in range(num_expert_envs)]
        envs.extend(e_envs)
    # Vectorise envs
    if algorithm['num_processes'] > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)
    return envs
def evaluate_model_clipped(env, model, max_eval=20000, env_seed=2018, render=False, cuda=False):
    env = make_env(env, env_seed, 0, None)()
    if isinstance(model, CompressedModel):
        model = uncompress_model(model)
    if cuda:
        model.cuda()
    obs_shape = env.observation_space.shape
    obs_shape = (obs_shape[0] * 4, *obs_shape[1:])
    current_obs = torch.zeros(1, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = env.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    if render:
        env.render()
    obs = env.reset()
    update_current_obs(obs)
    total_frames = 0
    model.eval()
    total_reward = 0.0
    for _ in range(max_eval):
        total_frames += 4
        cur_state_var = Variable(current_obs)
        if cuda:
            cur_state_var = cur_state_var.cuda()
        values = model(cur_state_var)[0]
        if cuda:
            values = values.cpu()
        action = np.argmax(values.data.numpy()[:env.action_space.n])
        new_state, reward, is_done, _ = step(env, action)
        total_reward += reward
        if is_done:
            break
        update_current_obs(new_state)
        if render:
            env.render()
    return total_reward, total_frames
def __init__(self, args):
    self.args = args
    self.env = make_env(args)
    self.info = []
    if args.save_rews:
        make_dir('log/rews', clear=False)
        self.rews_record = {}
        self.rews_record[args.env] = []
    if args.save_Q:
        make_dir('log/Q_std', clear=False)
        make_dir('log/Q_net', clear=False)
        make_dir('log/Q_ground', clear=False)
        self.Q_std_record, self.Q_net_record, self.Q_ground_record = {}, {}, {}
        self.Q_std_record[args.env], self.Q_net_record[args.env], self.Q_ground_record[args.env] = [], [], []
        self.info += ['Q_error/mean', 'Q_error/std']
def eval_proc(file_name):
    print(file_name)
    f = open(os.path.join('./log', file_name), 'w+')
    for te in types:
        for ta in types:
            for role_id in [2, 3, 1]:
                agent = make_agent(ta, role_id)
                for i in range(1):
                    env = make_env(te)
                    st = StatCounter()
                    with get_tqdm(total=100) as pbar:
                        for j in range(100):
                            winning_rate = eval_episode(env, agent)
                            st.feed(winning_rate)
                            pbar.update()
                    f.write('%s with role id %d against %s, winning rate: %f\n'
                            % (ta, role_id, te, st.average))
    f.close()
def sample(runs, iternum, root=ROOT, number=None):
    dirname = os.path.join(root, f"iter{iternum}/")
    os.makedirs(os.path.dirname(dirname), exist_ok=True)
    rollout = RolloutCollector(dirname)
    env = make_env()
    state_size = env.observation_space.shape
    action_size = [env.action_space.n] if hasattr(env.action_space, 'n') else env.action_space.shape
    agent = RandomAgent(state_size, action_size) if iternum <= 0 else \
        ControlAgent(state_size, action_size, gpu=False, load=f"{env_name}/iter{iternum-1}")
    for ep in range(runs):
        state = env.reset()
        total_reward = 0
        done = False
        while not done:
            env_action = agent.get_env_action(env, state)[0]
            state, reward, done, _ = env.step(env_action)
            rollout.step(env_action, state, reward, done, number)
            agent.train(state, env_action, state, reward, done)
            total_reward += reward
        print(f"Ep: {ep}, Reward: {total_reward}")
    env.close()
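# --- Sketch only: a driver for the collector above. Iteration 0 gathers random-policy
# --- rollouts; later iterations load the ControlAgent trained on the previous dataset.
# --- The run counts are placeholders.
if __name__ == "__main__":
    sample(runs=20, iternum=0)   # random-policy data for the first dataset
    sample(runs=20, iternum=1)   # rollouts from the agent trained on iteration 0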
def main():
    summary_dir = '/tmp/log'
    summary_dir = None
    render = '--visualise' in sys.argv[1:]
    prepare_process(summary_dir=summary_dir)
    if summary_dir is not None:
        summary_writer = tf.summary.FileWriter('{}/learn.summary'.format(summary_dir))
    else:
        summary_writer = None
    learn(env=envs.make_env(),
          render=render,
          save_path='/tmp/hra.sword',
          # restore_path='/tmp/hra.sword-10000',
          summary_writer=summary_writer)
def create_environment(environment, add_timestep, tasks, seeds, seed):
    num_tasks = len(tasks)
    print("Creating environments...")
    print("Environment name: {}".format(environment))
    # env = make_env('TwoRooms-v2', seed=0, rank=0, add_timestep=False)
    envs = [make_env(environment, seed, i, add_timestep)
            for i in range(num_processes_per_task * num_tasks)]
    envs = MTSubprocVecEnv(envs)
    # TODO: Replace with info dict!
    constraint = []
    start_constraint = []
    for task in tasks:
        constraint += [task] * num_processes_per_task
        start_constraint += [False] * num_processes_per_task
    seeds = envs.draw_and_set_task(constraint=constraint, seed=seeds)
    return envs, constraint
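# --- Sketch only: a call into the helper above. num_processes_per_task is a
# --- module-level constant in the original snippet; the env id, task ids and seed
# --- below are placeholders.
envs, constraint = create_environment('TwoRooms-v2', add_timestep=False,
                                      tasks=[0, 1], seeds=None, seed=123)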
def train(model_dict): def update_current_state(current_state, state, channels): # current_state: [processes, channels*stack, height, width] state = torch.from_numpy(state).float() # (processes, channels, height, width) # if num_stack > 1: #first stack*channel-channel frames = last stack*channel-channel , so slide them forward current_state[:, :-channels] = current_state[:, channels:] current_state[:, -channels:] = state #last frame is now the new one return current_state def update_rewards(reward, done, final_rewards, episode_rewards, current_state): # Reward, Done: [P], [P] # final_rewards, episode_rewards: [P,1]. [P,1] # current_state: [P,C*S,H,W] reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() #[P,1] episode_rewards += reward #keeps track of current episode cumulative reward masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #[P,1] final_rewards *= masks #erase the ones that are done final_rewards += (1 - masks) * episode_rewards #set it to the cumulative episode reward episode_rewards *= masks #erase the done ones masks = masks.type(dtype) #cuda if current_state.dim() == 4: # if state is a frame/image current_state *= masks.unsqueeze(2).unsqueeze(2) #[P,1,1,1] else: current_state *= masks #restart the done ones, by setting the state to zero return reward, masks, final_rewards, episode_rewards, current_state num_frames = model_dict['num_frames'] cuda = model_dict['cuda'] which_gpu = model_dict['which_gpu'] num_steps = model_dict['num_steps'] num_processes = model_dict['num_processes'] seed = model_dict['seed'] env_name = model_dict['env'] save_dir = model_dict['save_to'] num_stack = model_dict['num_stack'] algo = model_dict['algo'] save_interval = model_dict['save_interval'] log_interval = model_dict['log_interval'] save_params = model_dict['save_params'] vid_ = model_dict['vid_'] gif_ = model_dict['gif_'] ls_ = model_dict['ls_'] os.environ['OMP_NUM_THREADS'] = '1' os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu) if cuda: torch.cuda.manual_seed(seed) dtype = torch.cuda.FloatTensor model_dict['dtype']=dtype else: torch.manual_seed(seed) dtype = torch.FloatTensor model_dict['dtype']=dtype # Create environments print (num_processes, 'processes') monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards') if not os.path.exists(monitor_rewards_dir): os.makedirs(monitor_rewards_dir) print ('Made dir', monitor_rewards_dir) envs = SubprocVecEnv([make_env(env_name, seed, i, monitor_rewards_dir) for i in range(num_processes)]) if vid_: print ('env for video') envs_video = make_env_monitor(env_name, save_dir) if gif_: print ('env for gif') envs_gif = make_env_basic(env_name) if ls_: print ('env for ls') envs_ls = make_env_basic(env_name) obs_shape = envs.observation_space.shape # (channels, height, width) obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:]) # (channels*stack, height, width) shape_dim0 = envs.observation_space.shape[0] #channels model_dict['obs_shape']=obs_shape model_dict['shape_dim0']=shape_dim0 # # Create agent # if algo == 'a2c': # agent = a2c(envs, model_dict) # print ('init a2c agent') # elif algo == 'ppo': # agent = ppo(envs, model_dict) # print ('init ppo agent') # elif algo == 'a2c_minibatch': # agent = a2c_minibatch(envs, model_dict) # print ('init a2c_minibatch agent') # elif algo == 'a2c_list_rollout': # agent = a2c_list_rollout(envs, model_dict) # print ('init a2c_list_rollout agent') # elif algo == 'a2c_with_var': # agent = a2c_with_var(envs, model_dict) # print ('init a2c_with_var agent') # elif algo == 
'a2c_bin_mask': # agent = a2c_with_var(envs, model_dict) # print ('init a2c_with_var agent') # agent = model_dict['agent'](envs, model_dict) # #Load model # if model_dict['load_params']: # # agent.actor_critic = torch.load(os.path.join(args.load_path)) # # agent.actor_critic = torch.load(args.load_path).cuda() # # print ('loaded ', args.load_path) # if model_dict['load_number'] == 3: # load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 3000160, model_dict) # elif model_dict['load_number'] == 6: # load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 6000160, model_dict) # elif model_dict['load_number'] == 9: # load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 9000160, model_dict) # # else: # # load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 8000160, model_dict) # else: # PROBLEM print ('Init expert agent') expert_agent = a2c(envs, model_dict) param_file = home+'/Documents/tmp/breakout_2frames_leakyrelu2/BreakoutNoFrameskip-v4/A2C/seed0/model_params3/model_params9999360.pt' param_dict = torch.load(param_file) expert_agent.actor_critic.load_state_dict(param_dict) print ('loaded params', param_file) expert_agent.actor_critic.cuda() print ('Init imitator agent') imitator_agent = a2c(envs, model_dict) # param_file = home+'/Documents/tmp/breakout_2frames_leakyrelu2/imitator_params.ckpt' # param_dict = torch.load(param_file) # imitator_agent.actor_critic.load_state_dict(param_dict) # print ('loaded params', param_file) imitator_agent.actor_critic.cuda() agent = expert_agent expert_policy = expert_agent.actor_critic imitator_policy = imitator_agent.actor_critic optimizer = optim.Adam(imitator_policy.parameters(), lr=.0005, weight_decay=.00001) total_steps = 0 display_step = 50 # Init state state = envs.reset() # (processes, channels, height, width) current_state = torch.zeros(num_processes, *obs_shape) # (processes, channels*stack, height, width) current_state = update_current_state(current_state, state, shape_dim0).type(dtype) #add the new frame, remove oldest agent.insert_first_state(current_state) #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step # These are used to compute average rewards for all processes. episode_rewards = torch.zeros([num_processes, 1]) #keeps track of current episode cumulative reward final_rewards = torch.zeros([num_processes, 1]) num_updates = int(num_frames) // num_steps // num_processes save_interval_num_updates = int(save_interval /num_processes/num_steps) #Begin training # count =0 start = time.time() start2 = time.time() for j in range(num_updates): for step in range(num_steps): # Act, [P,1], [P], [P,1], [P] # value, action = agent.act(Variable(agent.rollouts.states[step], volatile=True)) state__ = Variable(agent.rollouts.states[step]) / 255. 
value, action, action_log_probs, dist_entropy = agent.act(state__) #, requires_grad=False)#, volatile=True)) # print (action_log_probs.size()) # print (dist_entropy.size()) batch = state__ optimizer.zero_grad() log_dist_expert = expert_policy.action_logdist(batch) log_dist_imitator = imitator_policy.action_logdist(batch) action_dist_kl = torch.sum((log_dist_expert - log_dist_imitator)*torch.exp(log_dist_expert), dim=1) #[B] # elbo, logpx, logpz, logqz, action_dist_kl = self.forward(batch, policy, k=k) loss = torch.mean(action_dist_kl) loss.backward() # nn.utils.clip_grad_norm(self.parameters(), .5) optimizer.step() # if total_steps%display_step==0: # and batch_idx == 0: # # print ('Train Epoch: {}/{}'.format(epoch+1, epochs), # # 'total_epochs {}'.format(total_epochs), # print('LL:{:.4f}'.format(loss.data[0]) # # 'logpx:{:.4f}'.format(logpx.data[0]), # # 'logpz:{:.5f}'.format(logpz.data[0]), # # 'logqz:{:.5f}'.format(logqz.data[0]), # # 'action_kl:{:.4f}'.format(action_dist_kl.data[0]) # ) # total_steps+=1 cpu_actions = action.data.squeeze(1).cpu().numpy() #[P] # cpu_actions = action.data.cpu().numpy() #[P] # print (actions.size()) # Step, S:[P,C,H,W], R:[P], D:[P] state, reward, done, info = envs.step(cpu_actions) # Record rewards and update state reward, masks, final_rewards, episode_rewards, current_state = update_rewards(reward, done, final_rewards, episode_rewards, current_state) current_state = update_current_state(current_state, state, shape_dim0) # Agent record step # agent.insert_data(step, current_state, action.data, value.data, reward, masks, action_log_probs.data, dist_entropy.data) agent.insert_data(step, current_state, action.data, value, reward, masks, action_log_probs, dist_entropy) #, done) #Optimize agent agent.no_update() #agent.update(j,num_updates) # agent.update() #agent.update(j,num_updates) agent.insert_first_state(agent.rollouts.states[-1]) # print ('save_interval_num_updates', save_interval_num_updates) # print ('num_updates', num_updates) # print ('j', j) total_num_steps = (j + 1) * num_processes * num_steps # if total_num_steps % save_interval == 0 and save_dir != "": if j % save_interval_num_updates == 0 and save_dir != "" and j != 0: save_to = home+'/Documents/tmp/breakout_2frames_leakyrelu2/imitator_params_env.ckpt' torch.save(imitator_policy.state_dict(), save_to) print ('saved imitator_policy', save_to) # #Save model # if save_params: # do_params(save_dir, agent, total_num_steps, model_dict) # # save_params_v2(save_dir, agent, total_num_steps, model_dict) #make video if vid_: do_vid(envs_video, update_current_state, shape_dim0, dtype, agent, model_dict, total_num_steps) #make gif if gif_: do_gifs(envs_gif, agent, model_dict, update_current_state, update_rewards, total_num_steps) #Print updates if j % log_interval == 0:# and j!=0: end = time.time() to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.1f}, {:.4f}".format(j, total_num_steps, final_rewards.min(), final_rewards.median(), final_rewards.mean(), final_rewards.max(), int(total_num_steps / (end - start)), end - start, end - start2, loss.data[0]) # to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.1f}".format(j, total_num_steps, # final_rewards.min(), # final_rewards.median(), # final_rewards.mean(), # final_rewards.max(), # int(total_num_steps / (end - start)), # end - start, # end - start2) print(to_print_info_string) start2 = time.time() to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, Time" if j % (log_interval*30) == 0: if ls_: 
do_ls(envs_ls, agent, model_dict, total_num_steps, update_current_state, update_rewards) # print("Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated, LS updated") # print(to_print_info_string + ' LS recorded')#, agent.current_lr) # else: #update plots try: if ls_: update_ls_plot(model_dict) make_plots(model_dict) print(to_print_legend_string + " Plot updated") except: raise #pass print(to_print_legend_string) try: make_plots(model_dict) except: print ()
def train(model_dict): def update_current_state(current_state, state, channels): # current_state: [processes, channels*stack, height, width] state = torch.from_numpy(state).float() # (processes, channels, height, width) # if num_stack > 1: #first stack*channel-channel frames = last stack*channel-channel , so slide them forward current_state[:, :-channels] = current_state[:, channels:] current_state[:, -channels:] = state #last frame is now the new one return current_state def update_rewards(reward, done, final_rewards, episode_rewards, current_state): # Reward, Done: [P], [P] # final_rewards, episode_rewards: [P,1]. [P,1] # current_state: [P,C*S,H,W] reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() #[P,1] episode_rewards += reward #keeps track of current episode cumulative reward masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #[P,1] final_rewards *= masks #erase the ones that are done final_rewards += (1 - masks) * episode_rewards #set it to the cumulative episode reward episode_rewards *= masks #erase the done ones masks = masks.type(dtype) #cuda # print (current_state) # fdsf if current_state.dim() == 4: # if state is a frame/image current_state *= masks.unsqueeze(2).unsqueeze(2) #[P,1,1,1] else: current_state *= masks #restart the done ones, by setting the state to zero return reward, masks, final_rewards, episode_rewards, current_state num_frames = model_dict['num_frames'] cuda = model_dict['cuda'] which_gpu = model_dict['which_gpu'] num_steps = model_dict['num_steps'] num_processes = model_dict['num_processes'] seed = model_dict['seed'] env_name = model_dict['env'] save_dir = model_dict['save_to'] num_stack = model_dict['num_stack'] algo = model_dict['algo'] save_interval = model_dict['save_interval'] log_interval = model_dict['log_interval'] save_params = model_dict['save_params'] vid_ = model_dict['vid_'] gif_ = model_dict['gif_'] ls_ = model_dict['ls_'] os.environ['OMP_NUM_THREADS'] = '1' os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu) if cuda: torch.cuda.manual_seed(seed) dtype = torch.cuda.FloatTensor model_dict['dtype']=dtype else: torch.manual_seed(seed) dtype = torch.FloatTensor model_dict['dtype']=dtype # Create environments print (num_processes, 'processes') monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards') if not os.path.exists(monitor_rewards_dir): os.makedirs(monitor_rewards_dir) print ('Made dir', monitor_rewards_dir) envs = SubprocVecEnv([make_env(env_name, seed, i, monitor_rewards_dir) for i in range(num_processes)]) if vid_: print ('env for video') envs_video = make_env_monitor(env_name, save_dir) if gif_: print ('env for gif') envs_gif = make_env_basic(env_name) if ls_: print ('env for ls') envs_ls = make_env_basic(env_name) obs_shape = envs.observation_space.shape # (channels, height, width) obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:]) # (channels*stack, height, width) shape_dim0 = envs.observation_space.shape[0] #channels model_dict['obs_shape']=obs_shape model_dict['shape_dim0']=shape_dim0 action_size = envs.action_space.n model_dict['action_size']=action_size # Create agent if algo == 'a2c': agent = a2c(model_dict) print ('init a2c agent') # elif algo == 'ppo': # agent = ppo(envs, model_dict) # print ('init ppo agent') # elif algo == 'a2c_minibatch': # agent = a2c_minibatch(envs, model_dict) # print ('init a2c_minibatch agent') # elif algo == 'a2c_list_rollout': # agent = a2c_list_rollout(envs, model_dict) # print ('init a2c_list_rollout agent') # elif algo == 'a2c_with_var': # agent = 
a2c_with_var(envs, model_dict) # print ('init a2c_with_var agent') # elif algo == 'a2c_bin_mask': # agent = a2c_with_var(envs, model_dict) # print ('init a2c_with_var agent') # agent = model_dict['agent'](envs, model_dict) #Load model if model_dict['load_params']: # agent.actor_critic = torch.load(os.path.join(args.load_path)) # agent.actor_critic = torch.load(args.load_path).cuda() # print ('loaded ', args.load_path) if model_dict['load_number'] == 3: load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 3000160, model_dict) elif model_dict['load_number'] == 6: load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 6000160, model_dict) elif model_dict['load_number'] == 9: load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 9000160, model_dict) # else: # load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 8000160, model_dict) else: PROBLEM # Init state state = envs.reset() # (processes, channels, height, width) current_state = torch.zeros(num_processes, *obs_shape) # (processes, channels*stack, height, width) current_state = update_current_state(current_state, state, shape_dim0).type(dtype) #add the new frame, remove oldest agent.insert_first_state(current_state) #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step # These are used to compute average rewards for all processes. episode_rewards = torch.zeros([num_processes, 1]) #keeps track of current episode cumulative reward final_rewards = torch.zeros([num_processes, 1]) num_updates = int(num_frames) // num_steps // num_processes save_interval_num_updates = int(save_interval /num_processes/num_steps) #Begin training # count =0 start = time.time() start2 = time.time() for j in range(num_updates): for step in range(num_steps): # Act, [P,1], [P], [P,1], [P] # value, action = agent.act(Variable(agent.rollouts.states[step], volatile=True)) value, action, action_log_probs, dist_entropy = agent.act(Variable(agent.rollouts.states[step]/255.))#, volatile=True)) # print (action_log_probs.size()) # print (dist_entropy.size()) cpu_actions = action.data.squeeze(1).cpu().numpy() #[P] # cpu_actions = action.data.cpu().numpy() #[P] # print (actions.size()) # Step, S:[P,C,H,W], R:[P], D:[P] state, reward, done, info = envs.step(cpu_actions) # if np.sum(reward) > 0.: # print (reward) # afdas # Record rewards and update state reward, masks, final_rewards, episode_rewards, current_state = update_rewards(reward, done, final_rewards, episode_rewards, current_state) current_state = update_current_state(current_state, state, shape_dim0) # Agent record step # agent.insert_data(step, current_state, action.data, value.data, reward, masks, action_log_probs.data, dist_entropy.data) agent.insert_data(step, current_state, action.data, value, reward, masks, action_log_probs, dist_entropy) #, done) #Optimize agent agent.update() #agent.update(j,num_updates) agent.insert_first_state(agent.rollouts.states[-1]) # print ('save_interval_num_updates', save_interval_num_updates) # print ('num_updates', num_updates) # print ('j', j) total_num_steps = (j + 1) * num_processes * num_steps # if total_num_steps % save_interval == 0 and save_dir != "": if j % save_interval_num_updates == 0 and save_dir != "" and j != 0: #Save model if save_params: # do_params(save_dir, agent, total_num_steps, model_dict) # save_params_v2(save_dir, agent, total_num_steps, model_dict) 
save_params_v3(save_dir, agent, total_num_steps, model_dict) #make video if vid_: do_vid(envs_video, update_current_state, shape_dim0, dtype, agent, model_dict, total_num_steps) #make gif if gif_: do_gifs(envs_gif, agent, model_dict, update_current_state, update_rewards, total_num_steps) #Print updates if j % log_interval == 0:# and j!=0: end = time.time() to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.1f}".format(j, total_num_steps, final_rewards.min(), final_rewards.median(), final_rewards.mean(), final_rewards.max(), int(total_num_steps / (end - start)), end - start, end - start2) print(to_print_info_string) start2 = time.time() to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, Time" if j % (log_interval*30) == 0: if ls_: do_ls(envs_ls, agent, model_dict, total_num_steps, update_current_state, update_rewards) # print("Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated, LS updated") # print(to_print_info_string + ' LS recorded')#, agent.current_lr) # else: #update plots try: if ls_: update_ls_plot(model_dict) make_plots(model_dict) print(to_print_legend_string + " Plot updated") except: raise #pass print(to_print_legend_string) try: make_plots(model_dict) except: print ()
def train(model_dict): def update_current_state(current_state, state, channels): # current_state: [processes, channels*stack, height, width] state = torch.from_numpy(state).float() # (processes, channels, height, width) # if num_stack > 1: #first stack*channel-channel frames = last stack*channel-channel , so slide them forward current_state[:, :-channels] = current_state[:, channels:] current_state[:, -channels:] = state #last frame is now the new one return current_state def update_rewards(reward, done, final_rewards, episode_rewards, current_state): # Reward, Done: [P], [P] # final_rewards, episode_rewards: [P,1]. [P,1] # current_state: [P,C*S,H,W] reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() #[P,1] episode_rewards += reward #keeps track of current episode cumulative reward masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #[P,1] final_rewards *= masks #erase the ones that are done final_rewards += (1 - masks) * episode_rewards #set it to the cumulative episode reward episode_rewards *= masks #erase the done ones masks = masks.type(dtype) #cuda if current_state.dim() == 4: # if state is a frame/image current_state *= masks.unsqueeze(2).unsqueeze(2) #[P,1,1,1] else: current_state *= masks #restart the done ones, by setting the state to zero return reward, masks, final_rewards, episode_rewards, current_state num_frames = model_dict['num_frames'] cuda = model_dict['cuda'] which_gpu = model_dict['which_gpu'] num_steps = model_dict['num_steps'] num_processes = model_dict['num_processes'] seed = model_dict['seed'] env_name = model_dict['env'] save_dir = model_dict['save_to'] num_stack = model_dict['num_stack'] algo = model_dict['algo'] save_interval = model_dict['save_interval'] log_interval = model_dict['log_interval'] save_params = model_dict['save_params'] vid_ = model_dict['vid_'] gif_ = model_dict['gif_'] ls_ = model_dict['ls_'] os.environ['OMP_NUM_THREADS'] = '1' os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu) if cuda: torch.cuda.manual_seed(seed) dtype = torch.cuda.FloatTensor model_dict['dtype']=dtype else: torch.manual_seed(seed) dtype = torch.FloatTensor model_dict['dtype']=dtype # Create environments print (num_processes, 'processes') monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards') if not os.path.exists(monitor_rewards_dir): os.makedirs(monitor_rewards_dir) print ('Made dir', monitor_rewards_dir) envs = SubprocVecEnv([make_env(env_name, seed, i, monitor_rewards_dir) for i in range(num_processes)]) if vid_: print ('env for video') envs_video = make_env_monitor(env_name, save_dir) if gif_: print ('env for gif') envs_gif = make_env_basic(env_name) if ls_: print ('env for ls') envs_ls = make_env_basic(env_name) obs_shape = envs.observation_space.shape # (channels, height, width) obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:]) # (channels*stack, height, width) shape_dim0 = envs.observation_space.shape[0] #channels model_dict['obs_shape']=obs_shape model_dict['shape_dim0']=shape_dim0 # Create agent if algo == 'a2c': agent = a2c(envs, model_dict) print ('init a2c agent') # elif algo == 'ppo': # agent = ppo(envs, model_dict) # print ('init ppo agent') # elif algo == 'a2c_minibatch': # agent = a2c_minibatch(envs, model_dict) # print ('init a2c_minibatch agent') # elif algo == 'a2c_list_rollout': # agent = a2c_list_rollout(envs, model_dict) # print ('init a2c_list_rollout agent') # elif algo == 'a2c_with_var': # agent = a2c_with_var(envs, model_dict) # print ('init a2c_with_var agent') # elif algo == 'a2c_bin_mask': # 
agent = a2c_with_var(envs, model_dict) # print ('init a2c_with_var agent') # agent = model_dict['agent'](envs, model_dict) # #Load model # if model_dict['load_params']: # # agent.actor_critic = torch.load(os.path.join(args.load_path)) # # agent.actor_critic = torch.load(args.load_path).cuda() # # print ('loaded ', args.load_path) # if model_dict['load_number'] == 3: # load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 3000160, model_dict) # elif model_dict['load_number'] == 6: # load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 6000160, model_dict) # elif model_dict['load_number'] == 9: # load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 9000160, model_dict) # # else: # # load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 8000160, model_dict) # else: # PROBLEM #load model # if model_dict['load_params']: # load_params(thigns) # param_file = home+'/Documents/tmp/breakout_2frames/BreakoutNoFrameskip-v4/A2C/seed0/model_params/model_params9999360.pt' param_file = home+'/Documents/tmp/RoadRunner/RoadRunnerNoFrameskip-v4/A2C/seed1/model_params3/model_params9999360.pt' # pretrained_dict = torch.load(param_file) # object # print (pretrained_dict) # agent_dict = agent.actor_critic.state_dict() #dict # print (agent_dict.keys()) # agent_dict.update(pretrained_dict) # # agent_dict.update(agent.actor_critic) # agent.actor_critic.load_state_dict(agent_dict) param_dict = torch.load(param_file) agent.actor_critic.load_state_dict(param_dict) # agent.actor_critic = torch.load(param_file) agent.actor_critic.cuda() print ('loaded', param_file) # afdsa # Init state state = envs.reset() # (processes, channels, height, width) current_state = torch.zeros(num_processes, *obs_shape) # (processes, channels*stack, height, width) current_state = update_current_state(current_state, state, shape_dim0).type(dtype) #add the new frame, remove oldest agent.insert_first_state(current_state) #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step # These are used to compute average rewards for all processes. episode_rewards = torch.zeros([num_processes, 1]) #keeps track of current episode cumulative reward final_rewards = torch.zeros([num_processes, 1]) num_updates = int(num_frames) // num_steps // num_processes save_interval_num_updates = int(save_interval /num_processes/num_steps) # list of lists, where lists are trajectories. 
trajectories have actinos and states dataset = [] tmp_trajs = [[] for x in range(num_processes)] dataset_count = 0 done = [0]*num_processes #Begin training # count =0 start = time.time() start2 = time.time() for j in range(num_updates): for step in range(num_steps): # Act, [P,1], [P], [P,1], [P] # value, action = agent.act(Variable(agent.rollouts.states[step], volatile=True)) value, action, action_log_probs, dist_entropy = agent.act(Variable(agent.rollouts.states[step]))#, volatile=True)) # print (action_log_probs.size()) # print (dist_entropy.size()) cpu_actions = action.data.squeeze(1).cpu().numpy() #[P] # cpu_actions = action.data.cpu().numpy() #[P] # print (actions.size()) # y = torch.LongTensor(batch_size,1).random_() % nb_digits # # One hot encoding buffer that you create out of the loop and just keep reusing # y_onehot = torch.FloatTensor(batch_size, nb_digits) # # In your for loop # y_onehot.zero_() # y_onehot.scatter_(1, y, 1) states_ = agent.rollouts.states[step].cpu().numpy() #[P,S,84,84] # print (state_t.shape) actions_ = action.data.cpu().numpy() #[P,1] # print (action) # fdsaf #store step for proc in range(num_processes): #add states state_t = states_[proc] action_t = actions_[proc] tmp_trajs[proc].append([action_t, state_t]) if done[proc]: dataset.append(tmp_trajs[proc]) dataset_count += len(tmp_trajs[proc]) tmp_trajs[proc] = [] for ii in range(len(dataset)): print (len(dataset[ii])) if dataset_count > 10000: # pickle.dump( dataset, open(home+'/Documents/tmp/breakout_2frames/breakout_trajectories_10000.pkl', "wb" ) ) pickle.dump( dataset, open(home+'/Documents/tmp/RoadRunner/trajectories_10000.pkl', "wb" ) ) print('saved') # pickle.save(dataset) STOP # Step, S:[P,C,H,W], R:[P], D:[P] state, reward, done, info = envs.step(cpu_actions) # Record rewards and update state reward, masks, final_rewards, episode_rewards, current_state = update_rewards(reward, done, final_rewards, episode_rewards, current_state) current_state = update_current_state(current_state, state, shape_dim0) # Agent record step # agent.insert_data(step, current_state, action.data, value.data, reward, masks, action_log_probs.data, dist_entropy.data) agent.insert_data(step, current_state, action.data, value, reward, masks, action_log_probs, dist_entropy) #, done) # print (len(dataset)) # print () #Optimize agent # agent.update() #agent.update(j,num_updates) agent.insert_first_state(agent.rollouts.states[-1]) # print ('save_interval_num_updates', save_interval_num_updates) # print ('num_updates', num_updates) # print ('j', j) total_num_steps = (j + 1) * num_processes * num_steps # if total_num_steps % save_interval == 0 and save_dir != "": if j % save_interval_num_updates == 0 and save_dir != "" and j != 0: #Save model if save_params: do_params(save_dir, agent, total_num_steps, model_dict) # save_params_v2(save_dir, agent, total_num_steps, model_dict) #make video if vid_: do_vid(envs_video, update_current_state, shape_dim0, dtype, agent, model_dict, total_num_steps) #make gif if gif_: do_gifs(envs_gif, agent, model_dict, update_current_state, update_rewards, total_num_steps) #Print updates if j % log_interval == 0:# and j!=0: end = time.time() to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.1f}".format(j, total_num_steps, final_rewards.min(), final_rewards.median(), final_rewards.mean(), final_rewards.max(), int(total_num_steps / (end - start)), end - start, end - start2) print(to_print_info_string) start2 = time.time() to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, 
FPS, Time" if j % (log_interval*30) == 0: if ls_: do_ls(envs_ls, agent, model_dict, total_num_steps, update_current_state, update_rewards) # print("Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated, LS updated") # print(to_print_info_string + ' LS recorded')#, agent.current_lr) # else: #update plots try: if ls_: update_ls_plot(model_dict) make_plots(model_dict) print(to_print_legend_string + " Plot updated") except: raise #pass print(to_print_legend_string) try: make_plots(model_dict) except: print ()
def viz(model_dict):

    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(state).float()  # (processes, channels, height, width)
        # slide the stack forward: the oldest channels frames are dropped, the rest shift left
        current_state[:, :-channels] = current_state[:, channels:]
        current_state[:, -channels:] = state  # the last frame is now the new one
        return current_state

    def update_rewards(reward, done, final_rewards, episode_rewards, current_state):
        # reward, done: [P], [P]
        # final_rewards, episode_rewards: [P,1], [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()  # [P,1]
        episode_rewards += reward  # running cumulative reward of the current episode
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])  # [P,1]
        final_rewards *= masks  # erase the processes that are done
        final_rewards += (1 - masks) * episode_rewards  # store their cumulative episode reward
        episode_rewards *= masks  # reset the done ones
        masks = masks.type(dtype)  # move to GPU if needed
        if current_state.dim() == 4:  # state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  # broadcast as [P,1,1,1]
        else:
            current_state *= masks  # restart the done ones by zeroing the state
        return reward, masks, final_rewards, episode_rewards, current_state

    def do_vid():
        n_vids = 3
        for i in range(n_vids):
            done = False
            state = envs_video.reset()
            current_state = torch.zeros(1, *obs_shape)
            current_state = update_current_state(current_state, state, shape_dim0).type(dtype)
            while not done:
                # Act
                state_var = Variable(current_state, volatile=True)
                action, value = agent.act(state_var)
                cpu_actions = action.data.squeeze(1).cpu().numpy()
                # Observe reward and next state; state: [nProcesses, nChannels, height, width]
                state, reward, done, info = envs_video.step(cpu_actions)
                current_state = update_current_state(current_state, state, shape_dim0).type(dtype)
        state = envs_video.reset()
        # rename the recorded videos and remove the monitor json files
        vid_path = save_dir + '/videos/'
        count = 0
        for aaa in os.listdir(vid_path):
            if 'openaigym' in aaa and '.mp4' in aaa:
                subprocess.call("(cd " + vid_path + " && mv " + vid_path + aaa + " " + vid_path
                                + env_name + '_' + algo + '_vid_t' + str(total_num_steps) + '_'
                                + str(count) + ".mp4)", shell=True)
                count += 1
            if '.json' in aaa:
                os.remove(vid_path + aaa)

    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    # visualisation always runs a single process
    num_processes = 1
    model_dict['num_processes'] = 1

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor

    # Create environments
    print(num_processes, 'processes')
    monitor_rewards_dir = ''
    envs = SubprocVecEnv([make_env(env_name, seed, i, monitor_rewards_dir) for i in range(num_processes)])

    vid_ = 0
    see_frames = 1
    if vid_:
        print('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  # channels
    model_dict['obs_shape'] = obs_shape

    # Create agent
    if algo == 'a2c':
        agent = a2c(envs, model_dict)
        print('init a2c agent')
    elif algo == 'ppo':
        agent = ppo(envs, model_dict)
        print('init ppo agent')
    elif algo == 'a2c_minibatch':
        agent = a2c_minibatch(envs, model_dict)
        print('init a2c_minibatch agent')

    # Load model: restore the actor-critic saved at the chosen training step
    epoch_level = 1e6
    model_params_file = save_dir + '/model_params/model_params' + str(int(epoch_level)) + '.pt'
    agent.actor_critic = torch.load(model_params_file).cuda()
    print('loaded ', model_params_file)

    frame_path = save_dir + '/frames/'
    if not os.path.exists(frame_path):
        os.makedirs(frame_path)
        print('Made dir', frame_path)

    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(current_state, state, shape_dim0).type(dtype)  # add the new frame, remove the oldest
    agent.insert_first_state(current_state)  # storage holds (num_steps + 1, num_processes, *obs_shape); set the first step

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1])  # cumulative reward of the current episode
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes

    # Visualisation loop
    count = 0
    start = time.time()
    for j in range(num_updates):
        for step in range(num_steps):

            # Sample the policy repeatedly in the current state to get a
            # distribution over its value estimates and sampled actions.
            values = []
            actions = []
            for ii in range(100):
                # Act, [P,1], [P,1]
                action, value = agent.act(Variable(agent.rollouts.states[step], volatile=True))
                values.append(value.data.cpu().numpy()[0][0])
                actions.append(action.data.cpu().numpy()[0][0])

            rows = 1
            cols = 2
            fig = plt.figure(figsize=(8, 4), facecolor='white')

            # plot the current frame
            ax = plt.subplot2grid((rows, cols), (0, 0), frameon=False)
            state1 = np.squeeze(state[0])
            ax.imshow(state1, cmap='gray')
            ax.set_xticks([])
            ax.set_yticks([])

            # plot a histogram of the sampled value estimates
            ax = plt.subplot2grid((rows, cols), (0, 1), frameon=False)
            weights = np.ones_like(values) / float(len(values))
            ax.hist(values, 50, range=[0.0, 4.], weights=weights)
            ax.set_ylim([0., 1.])

            plt_path = frame_path + 'plt'
            plt.savefig(plt_path + str(count) + '.png')
            print('saved', plt_path + str(count) + '.png')
            plt.close(fig)

            count += 1
            # max_frames is expected to be defined at module level; it caps how many frames are visualised
            if count > 2 and (done[0] or count > max_frames):
                return

            cpu_actions = action.data.squeeze(1).cpu().numpy()  # [P]

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions)

            # Record rewards
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(
                reward, done, final_rewards, episode_rewards, current_state)

            # Update state
            current_state = update_current_state(current_state, state, shape_dim0)

            # Agent records the step
            agent.insert_data(step, current_state, action.data, value.data, reward, masks)

        total_num_steps = (j + 1) * num_processes * num_steps
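The two nested helpers above do the frame-stack and done-mask bookkeeping that the rest of viz relies on. As a sanity check, here is a minimal, self-contained sketch; the dummy shapes and the push_frame name are illustrative assumptions, not part of the training code:

    import numpy as np
    import torch

    # Toy setup: 2 processes, 1-channel 84x84 frames, stack of 4.
    processes, channels, stack, h, w = 2, 1, 4, 84, 84
    current_state = torch.zeros(processes, channels * stack, h, w)

    def push_frame(current_state, frame, channels):
        # Same idea as update_current_state: drop the oldest frame, append the newest.
        frame = torch.from_numpy(frame).float()
        current_state[:, :-channels] = current_state[:, channels:]
        current_state[:, -channels:] = frame
        return current_state

    for t in range(5):
        new_frame = np.full((processes, channels, h, w), t, dtype=np.float32)
        current_state = push_frame(current_state, new_frame, channels)

    # After 5 pushes the stack holds frames 1..4 (frame 0 has slid out).
    print(current_state[0, :, 0, 0])  # tensor([1., 2., 3., 4.])

    # Done-masking as in update_rewards: a finished process gets its stacked state zeroed.
    done = [False, True]
    masks = torch.FloatTensor([[0.0] if d else [1.0] for d in done])  # [P,1]
    current_state *= masks.unsqueeze(2).unsqueeze(2)                  # broadcast to [P,1,1,1]
    print(current_state[1].abs().sum())  # tensor(0.)

The broadcast in the last step is why update_rewards only unsqueezes when the state is an image: for flat observations the [P,1] mask already lines up with the [P,D] state.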
def viz(model_dict):

    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(state).float()  # (processes, channels, height, width)
        # slide the stack forward: the oldest channels frames are dropped, the rest shift left
        current_state[:, :-channels] = current_state[:, channels:]
        current_state[:, -channels:] = state  # the last frame is now the new one
        return current_state

    def update_rewards(reward, done, final_rewards, episode_rewards, current_state):
        # reward, done: [P], [P]
        # final_rewards, episode_rewards: [P,1], [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()  # [P,1]
        episode_rewards += reward  # running cumulative reward of the current episode
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])  # [P,1]
        final_rewards *= masks  # erase the processes that are done
        final_rewards += (1 - masks) * episode_rewards  # store their cumulative episode reward
        episode_rewards *= masks  # reset the done ones
        masks = masks.type(dtype)  # move to GPU if needed
        if current_state.dim() == 4:  # state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  # broadcast as [P,1,1,1]
        else:
            current_state *= masks  # restart the done ones by zeroing the state
        return reward, masks, final_rewards, episode_rewards, current_state

    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    # visualisation runs a single process, with a rollout long enough to hold max_frames steps
    # (max_frames is expected to be defined at module level)
    num_processes = 1
    model_dict['num_processes'] = 1
    model_dict['num_steps'] = max_frames
    num_steps = max_frames

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor

    # Create environments
    print(num_processes, 'processes')
    monitor_rewards_dir = ''
    envs = SubprocVecEnv([make_env(env_name, seed, i, monitor_rewards_dir) for i in range(num_processes)])

    vid_ = 0
    see_frames = 1
    if vid_:
        print('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  # channels
    model_dict['obs_shape'] = obs_shape

    # Create agent
    if algo == 'a2c':
        agent = a2c(envs, model_dict)
        print('init a2c agent')
    elif algo == 'ppo':
        agent = ppo(envs, model_dict)
        print('init ppo agent')
    elif algo == 'a2c_minibatch':
        agent = a2c_minibatch(envs, model_dict)
        print('init a2c_minibatch agent')

    # Load model: restore the actor-critic saved at the chosen training step
    # (epoch_level, the checkpoint step to load, is expected to be defined at module level)
    model_params_file = save_dir + '/model_params/model_params' + str(int(epoch_level)) + '.pt'
    agent.actor_critic = torch.load(model_params_file).cuda()
    print('loaded ', model_params_file)

    frame_path = save_dir + '/frames/'
    if not os.path.exists(frame_path):
        os.makedirs(frame_path)
        print('Made dir', frame_path)

    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(current_state, state, shape_dim0).type(dtype)  # add the new frame, remove the oldest
    agent.insert_first_state(current_state)  # storage holds (num_steps + 1, num_processes, *obs_shape); set the first step

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1])  # cumulative reward of the current episode
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes

    # Visualisation loop
    count = 0
    start = time.time()
    for j in range(num_updates):
        for step in range(num_steps):

            # The frame / value-histogram / action-distribution plotting from the previous
            # version is kept here commented out; this version only prints rewards and returns.
            # rows = 1
            # cols = 3
            # fig = plt.figure(figsize=(8, 4), facecolor='white')
            #
            # # plot frame
            # ax = plt.subplot2grid((rows, cols), (0, 0), frameon=False)
            # state1 = np.squeeze(state[0])
            # ax.imshow(state1, cmap='gray')
            # ax.set_xticks([])
            # ax.set_yticks([])
            # ax.set_title('State', family='serif')
            #
            # # plot values histogram (value estimates from 100 policy samples)
            # ax = plt.subplot2grid((rows, cols), (0, 2), frameon=False)
            # values = []
            # actions = []
            # for ii in range(100):
            #     # Act, [P,1], [P,1]
            #     action, value = agent.act(Variable(agent.rollouts.states[step], volatile=True))
            #     values.append(value.data.cpu().numpy()[0][0])
            #     actions.append(action.data.cpu().numpy()[0][0])
            # weights = np.ones_like(values) / float(len(values))
            # ax.hist(values, 50, range=[0.0, 4.], weights=weights)
            # ax.set_ylim([0., 1.])
            # ax.set_title('Value', family='serif')
            #
            # # plot action distribution (action names as in envs.get_action_meanings())
            # ax = plt.subplot2grid((rows, cols), (0, 1), frameon=False)
            # action_prob = agent.actor_critic.action_dist(Variable(agent.rollouts.states[step], volatile=True))
            # action_prob = np.squeeze(action_prob.data.cpu().numpy())
            # action_size = envs.action_space.n
            # ax.bar(range(action_size), action_prob)
            # ax.set_title('Action', family='serif')
            # plt.xticks(range(action_size), ['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'R_FIRE', 'L_FIRE'], fontsize=6)
            # ax.set_ylim([0., 1.])
            #
            # plt.tight_layout(pad=3., w_pad=2.5, h_pad=1.0)
            # plt_path = frame_path + 'plt'
            # plt.savefig(plt_path + str(count) + '.png')
            # print('saved', plt_path + str(count) + '.png')
            # plt.close(fig)

            count += 1
            if count % 10 == 0:
                print(count)

            if count > 2:
                if reward.cpu().numpy() > 0:
                    print(done[0], masks.cpu().numpy(), reward.cpu().numpy(), 'reward!!', step)
                    print(np.squeeze(agent.rollouts.rewards.cpu().numpy()))
                else:
                    print(done[0], masks.cpu().numpy(), reward.cpu().numpy())

                if count > max_frames:
                    # bootstrap from the value of the last state, then print returns next to rewards
                    next_value = agent.actor_critic(Variable(agent.rollouts.states[-1], volatile=True))[0].data
                    agent.rollouts.compute_returns(next_value, agent.use_gae, agent.gamma, agent.tau)
                    rollouts_ = np.squeeze(agent.rollouts.returns.cpu().numpy())
                    rewards_ = np.squeeze(agent.rollouts.rewards.cpu().numpy())
                    for jj in range(len(rollouts_)):
                        print(jj, rollouts_[jj], rewards_[jj])
                    return

            # Act, [P,1], [P,1]
            action, value = agent.act(Variable(agent.rollouts.states[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()  # [P]

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions)

            # Record rewards
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(
                reward, done, final_rewards, episode_rewards, current_state)

            # Update state
            current_state = update_current_state(current_state, state, shape_dim0)

            # Agent records the step
            agent.insert_data(step, current_state, action.data, value.data, reward, masks)

        total_num_steps = (j + 1) * num_processes * num_steps
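The printout at the end of this version compares agent.rollouts.returns against the raw rewards after compute_returns has run. The rollout-storage implementation is not shown in this file; as a rough illustration of what those return numbers mean when GAE is disabled, here is a generic discounted-return sketch (discounted_returns is a hypothetical helper written for this note, not the project's API):

    import numpy as np

    def discounted_returns(rewards, next_value, masks, gamma=0.99):
        # Plain discounted returns R_t = r_t + gamma * mask_t * R_{t+1},
        # bootstrapped from the value estimate of the state after the last step.
        returns = np.zeros(len(rewards) + 1, dtype=np.float32)
        returns[-1] = next_value
        for t in reversed(range(len(rewards))):
            returns[t] = rewards[t] + gamma * masks[t] * returns[t + 1]
        return returns[:-1]

    # Toy rollout: a single reward of 1 arrives at step 3, the episode does not terminate.
    rewards = [0.0, 0.0, 0.0, 1.0]
    masks = [1.0, 1.0, 1.0, 1.0]
    print(discounted_returns(rewards, next_value=0.0, masks=masks))
    # -> [0.970299, 0.9801, 0.99, 1.0]: the reward discounted backwards through time

Reading the printed pairs with this in mind, a reward that appears late in the rollout should show up as a geometrically decaying tail in the return column for all earlier steps, which is a quick way to spot whether the loaded policy is being credited correctly.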