Example #1
    def __init__(self, args, achieved_trajectory_pool):
        self.args = args
        self.env = make_env(args)
        self.env_test = make_env(args)
        self.dim = np.prod(self.env.reset()['achieved_goal'].shape)
        self.delta = self.env.distance_threshold

        self.length = args.episodes
        init_goal = self.env.reset()['achieved_goal'].copy()
        self.pool = np.tile(init_goal[np.newaxis, :],
                            [self.length, 1]) + np.random.normal(
                                0, self.delta, size=(self.length, self.dim))
        self.init_state = self.env.reset()['observation'].copy()

        self.match_lib = gcc_load_lib('learner/cost_flow.c')
        self.achieved_trajectory_pool = achieved_trajectory_pool

        if self.args.graph:
            self.graph = args.graph

        # estimating diameter
        self.max_dis = 0
        for i in range(1000):
            obs = self.env.reset()
            dis = self.get_graph_goal_distance(obs['achieved_goal'],
                                               obs['desired_goal'])
            if dis > self.max_dis: self.max_dis = dis
Example #2
def experiment_setup(args):
    if args.vae_dist_help:
        load_vaes(args)

    # since some extensions of the envs use the dist estimator, this load is used together with the interval wrapper  # TODO: use a different approach?
    load_field_parameters(args)
    if args.dist_estimator_type is not None:
        temp_env = make_temp_env(args)
        load_dist_estimator(args, temp_env)
        del temp_env

    env = make_env(args)
    env_test = make_env(args)

    if args.goal_based:

        args.obs_dims = list(goal_based_process(env.reset()).shape)
        args.acts_dims = [env.action_space.shape[0]]
        args.compute_reward = env.compute_reward
        args.compute_distance = env.compute_distance

    if args.imaginary_obstacle_transitions:
        # relatively small buffer size so it always holds the most recent collisions
        args.imaginary_buffer = ReplayBuffer_Imaginary(
            args, buffer_size=args.im_buffer_size)
    args.buffer = buffer = ReplayBuffer_Episodic(args)
    args.learner = learner = create_learner(args)
    args.agent = agent = create_agent(args)
    args.logger.info('*** network initialization complete ***')
    args.tester = tester = Tester(args)
    args.logger.info('*** tester initialization complete ***')
    args.timesteps = env.env.env.spec.max_episode_steps

    return env, env_test, agent, buffer, learner, tester
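A minimal sketch of how a setup function like this is usually driven from a training script. The argument parser (get_args), the loop bounds (args.epochs, args.cycles), and the learner/tester method names are assumptions, not taken from the example above.

def main():
    args = get_args()  # hypothetical argument parser
    env, env_test, agent, buffer, learner, tester = experiment_setup(args)

    for epoch in range(args.epochs):          # assumed outer loop bound
        for cycle in range(args.cycles):      # assumed inner loop bound
            learner.learn(args, env, env_test, agent, buffer)  # assumed signature
            tester.cycle_summary()                             # assumed method
        tester.epoch_summary()                                 # assumed method
    tester.final_summary()                                     # assumed method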
Example #3
    def __init__(self, args, direct_playpath=None):
        # initialize environment
        self.args = args
        self.env = make_env(args)
        self.args.timesteps = self.env.env.env.spec.max_episode_steps
        self.env_test = make_env(args)
        self.info = []
        self.test_rollouts = 100

        # get current policy from path (restore tf session + graph)
        if direct_playpath is not None:
            self.play_dir = direct_playpath
        else:
            self.play_dir = args.play_path
        self.play_epoch = args.play_epoch
        self.meta_path = "{}saved_policy-{}.meta".format(
            self.play_dir, self.play_epoch)
        self.sess = tf.Session()
        self.saver = tf.train.import_meta_graph(self.meta_path)
        self.saver.restore(self.sess,
                           tf.train.latest_checkpoint(self.play_dir))
        graph = tf.get_default_graph()
        self.raw_obs_ph = graph.get_tensor_by_name("raw_obs_ph:0")
        self.pi = graph.get_tensor_by_name("main/policy/net/pi/Tanh:0")
        # for Q-values
        self.acts_ph = graph.get_tensor_by_name("acts_ph:0")
        self.q_pi = graph.get_tensor_by_name("main/value/net/q/BiasAdd:0")
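A hedged sketch of an inference helper that could sit next to the constructor above, querying the restored graph for an action and its Q-value. The single-observation batching convention is an assumption; the tensor handles match the ones fetched above.

    def act(self, obs):
        # run the restored policy head on a batch of one observation
        action = self.sess.run(self.pi, feed_dict={self.raw_obs_ph: [obs]})[0]
        # evaluate the Q-value of that action with the restored value head
        q = self.sess.run(self.q_pi, feed_dict={self.raw_obs_ph: [obs],
                                                self.acts_ph: [action]})[0]
        return action, q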
Example #4
    def __init__(self, args):
        self.args = args
        self.env = make_env(args)
        self.env_test = make_env(args)

        self.env_List = []
        for i in range(args.episodes):
            self.env_List.append(make_env(args))

        self.achieved_trajectory_pool = TrajectoryPool(args,
                                                       args.hgg_pool_size)
        self.sampler = MatchSampler(args, self.achieved_trajectory_pool)

        self.dynamic_buffer = dynamic_buffer(args.buffer_size * 100)  # needs fine-tuning here
        self.model_loss = []

        try:
            with open("./random_trajs/trajs_0.pkl", 'rb') as f:
                self.hist = pickle.load(f)
        except IOError:
            print("Historical trajectory file not exist.")

        self.hist_trajs = {"eps": [], "obs": [], "goal": [], "action": []}
        self.current_trajs = {"eps": [], "obs": [], "goal": [], "action": []}
    def __init__(self, args: HGGLearnerInput):
        self._args = args
        self.env = make_env(args)

        self.env_List = [make_env(args) for _ in range(args.episodes)]
        self.achieved_trajectory_pool = TrajectoryPool(args,
                                                       args.hgg_pool_size)
        self.sampler = MatchSampler(args, self.achieved_trajectory_pool)
    def __init__(self, args):
        self.args = args
        self.env = make_env(args)
        self.env_test = make_env(args)

        self.env_List = []
        for i in range(args.episodes):
            self.env_List.append(make_env(args))

        self.achieved_trajectory_pool = TrajectoryPool(args,
                                                       args.hgg_pool_size)
        self.sampler = MatchSampler(args, self.achieved_trajectory_pool)
    def __init__(self, args):
        self.args = args
        self.env = make_env(args)
        self.env_test = make_env(args)
        self.env_List = []
        for i in range(args.episodes):
            self.env_List.append(make_env(args))
        self.agent = create_agent(args)
        self.achieved_trajectory_pool = TrajectoryPool(args, args.hgg_pool_size)
        self.stop_hgg_threshold = self.args.stop_hgg_threshold
        self.stop = False
        self.learn_calls = 0
def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    agent = VecEnvAgent(envs, args)
    agent.train_maml(num_updates)
Example #9
def experiment_setup_test(args):

    if args.vae_dist_help:
        load_vaes(args)
        if args.vae_type == 'bbox':
            file_index_object = 'data/' + args.env + '/' + args.vae_type + '_obj_i.npy'
            file_indices_obstacle = 'data/' + args.env + '/' + args.vae_type + '_obstacles_indices.npy'
            args.obj_index = np.load(file_index_object)
            args.obstacles_indices = np.load(file_indices_obstacle)

    load_field_parameters(args)
    #if args.dist_estimator_type is not None:
    #	temp_env = make_temp_env(args)
    #	load_dist_estimator(args, temp_env)
    #	del temp_env
    env = make_env(args)

    if args.goal_based:
        args.obs_dims = list(goal_based_process(env.reset()).shape)
        args.acts_dims = [env.action_space.shape[0]]
        args.compute_reward = env.compute_reward
        args.compute_distance = env.compute_distance

    from play import Player
    args.agent = agent = Player(args)
    # fix the number of test rollouts (set to 100 here)
    args.tester = tester = Tester(args,
                                  test_rollouts=100,
                                  after_train_test=True)
    args.timesteps = env.env.env.spec.max_episode_steps

    return env, agent, tester
Example #10
def evaluate(config_filepath: str, model_filepath: str, render: bool):
    conf = load_toml_config(config_filepath)
    env = make_env(conf.environment, render=render)
    agent = getattr(agents,
                    conf.agent_type)(env.observation_space.shape[0],
                                     env.action_space.shape[0], **conf.agent)
    ckpt = torch.load(model_filepath, map_location='cpu')
    agent.load_state_dict(ckpt['agent'])

    o = env.reset()
    if render:
        env.render()
    done = False
    episode_reward = 0
    t = 0
    while not done:
        h = agent.select_action(o, eval=True)
        a = agent.post_process_action(o, h)
        o_next, r, done, _ = env.step(a)
        episode_reward += r
        o = o_next
        t += 1

    print("STEPS: {}, REWARD: {}".format(t, episode_reward))
    input("OK? >")
Example #11
def main():
    if not args.resume and os.path.isdir(args.save_path):
        print("the save path has already existed!")
        exit(0)
    
    setup_dirs(args)

    script_path = os.path.join(args.save_path, 'scripts')
    if not os.path.isdir(script_path):
        shutil.copytree('scripts', script_path)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)
    env = None # placeholder
    if 'carla9' in args.env:
        # select CARLA v0.9.x as the platform
        env = create_carla9_env(args)
    elif 'carla8' in args.env:
        # select CARLA v0.8.x as the platform
        from envs.CARLA.carla_lib.carla.client import make_carla_client
        from envs.CARLA.carla_env import CarlaEnv
        client = make_carla_client('localhost', args.port, CARLA8_TIMEOUT)
        env = CarlaEnv(client, args)
    else:
        # select PyTorcs or GTAV as the platform
        # which is basically inherited from SPC, not fully supported in IPC
        env = make_env(args)

    if args.eval:
        from evaluate import evaluate_policy
        evaluate_policy(args, env)
    else:
        from train import train_policy
        train_policy(args, env)
Example #12
	def __init__(self,args):
		self.args = args
		self.device = torch.device('cuda') if args.cuda else torch.device('cpu')
		dummy_env = gym.make(self.args.env_name)
		self.actor = ACNet(dummy_env.action_space.n,args.feedforward)
		del dummy_env
		if args.load_dir is not None:
			actorState = torch.load(args.load_dir,map_location=lambda storage, loc: storage)
		if args.continue_training:
			self.actor.load_state_dict(actorState)
			print("Loaded pretrained model successfully")
		if args.transfer:
			self.actor.load_autoturn_model(actorState)
		if args.cuda:
			self.actor.cuda()
		self.actor_optimizer = optim.Adam(self.actor.parameters(),lr=self.args.lr)
		self.env_list = [make_env(self.args.env_name,self.args.seed,i) for i in range(self.args.num_processes)]
		if self.args.num_processes > 1:
			self.envs = gym_vecenv.SubprocVecEnv(self.env_list)
		else:
			self.envs = gym_vecenv.DummyVecEnv(self.env_list)
		if len(self.envs.observation_space.shape) == 1:
			self.envs = gym_vecenv.VecNormalize(self.envs)
		
		self.obs_shape = self.envs.observation_space.shape
		self.obs_shape = (self.obs_shape[0] * args.num_stack, *self.obs_shape[1:])
		self.state_shape = 1 if args.feedforward else 256
		self.rollouts = RolloutStorage(self.args.num_fwd_steps, self.args.num_processes, self.obs_shape, self.envs.action_space, self.state_shape)
		self.num_updates = int(args.num_frames)//args.num_fwd_steps//args.num_processes
		self.current_obs = torch.zeros(self.args.num_processes,*self.obs_shape)
		self.writer = SummaryWriter(log_dir=self.args.save_dir)
		self.fortress_threshold = 650
		self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(self.actor_optimizer,
											mode='max',factor=0.2,patience=15,verbose=True,threshold=1e-3,
											threshold_mode='rel')
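Since the ReduceLROnPlateau scheduler above is created with mode='max', it has to be stepped with a metric that should increase. A minimal sketch of a per-update hook for the class above, assuming a running mean episode reward is available (the method name and arguments are assumptions):

    def end_of_update(self, mean_episode_reward, update_idx):
        # the LR is cut by factor 0.2 after `patience` updates without reward improvement
        self.scheduler.step(mean_episode_reward)
        self.writer.add_scalar('reward/mean', mean_episode_reward, update_idx)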
Example #13
    def __init__(self):

        # Create an instance of the network itself, as well as the memory.
        # Here is also a good place to set environmental parameters,
        # as well as training parameters - number of episodes / iterations, etc.

        args.log_dir = args.log_dir + args.env_name + '_' + args.algo
        try:
            os.makedirs(args.log_dir)
        except OSError:
            files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv'))
            for f in files:
                os.remove(f)

        envs = [
            make_env(args.env_name, args.seed, i, args.log_dir)
            for i in range(args.num_processes)
        ]

        if args.num_processes > 1:
            envs = SubprocVecEnv(envs)
        else:
            envs = DummyVecEnv(envs)

        if len(envs.observation_space.shape) == 1:
            envs = VecNormalize(envs)

        self.environment_name = args.env_name
        self.agent = VecEnvAgent(envs, args)
Example #14
def eval_proc(file_name):
    print(file_name)
    f = open(os.path.join('./log_more', file_name), 'w+')
    types = ['RANDOM', 'RHCP', 'CDQN', 'MCT']
    # for role_id in [2, 3, 1]:
    #     for ta in types:
    #         agent = make_agent(ta, role_id)
    #         for i in range(1):
    #             env = make_env('MCT')
    #             st = StatCounter()
    #             for j in tqdm(range(100)):
    #                 winning_rate = eval_episode(env, agent)
    #                 st.feed(winning_rate)
    #             f.write('%s with role id %d against %s, winning rate: %f\n' % (ta, role_id, 'MCT', st.average))

    for role_id in [2, 3, 1]:
        agent = make_agent('MCT', role_id)
        for i in range(1):
            for te in types:
                env = make_env(te)
                st = StatCounter()
                for j in tqdm(range(100)):
                    winning_rate = eval_episode(env, agent)
                    st.feed(winning_rate)
                f.write('%s with role id %d against %s, winning rate: %f\n' % ('MCT', role_id, te, st.average))
    f.close()
def experiment_setup(args):
    env = make_env(args)
    env_test = make_env(args)
    if args.goal_based:
        args.obs_dims = list(goal_based_process(env.reset()).shape)
        args.acts_dims = [env.action_space.shape[0]]
        args.compute_reward = env.compute_reward
        args.compute_distance = env.compute_distance

    args.buffer = buffer = ReplayBuffer_Episodic(args)
    args.learner = learner = create_learner(args)
    args.agent = agent = create_agent(args)
    args.logger.info('*** network initialization complete ***')
    args.tester = tester = Tester(args)
    args.logger.info('*** tester initialization complete ***')

    return env, env_test, agent, buffer, learner, tester
Example #16
def onlytest():
    # env = make_env('RHCP')
    #env = make_env('RANDOM')
    #env = make_env('MCT')
    # env = make_env('CDQN')

    env = make_env('CDQN')
    agent = make_agent('RANDOM', 1)
    eval_episode(env, agent)
Example #17
    def __init__(self, args):
        # initialize environment
        self.args = args
        self.env = make_env(args)
        self.args.timesteps = self.env.max_episode_steps
        self.env_test = make_env(args)
        self.info = []
        self.test_rollouts = 100

        # get current policy from path (restore tf session + graph)
        self.play_dir = args.play_path
        self.play_epoch = args.play_epoch
        self.meta_path = os.path.join(self.play_dir, "saved_policy-{}.meta".format(self.play_epoch))
        self.sess = tf.Session()
        self.saver = tf.train.import_meta_graph(self.meta_path)
        self.saver.restore(self.sess, tf.train.latest_checkpoint(self.play_dir))
        graph = tf.get_default_graph()
        self.raw_obs_ph = graph.get_tensor_by_name("raw_obs_ph:0")
        self.pi = graph.get_tensor_by_name("main/policy/net/pi/Tanh:0")
Example #18
def enjoy():
    env = make_env(args.env_name, args.seed, 0, True)
    env = DummyVecEnv([env])

    actor_critic, ob_rms = torch.load(
        os.path.join(save_path, args.env_name + ".pt"))

    render_func = env.envs[0].render

    obs_shape = env.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])
    current_obs = torch.zeros(1, *obs_shape)
    states = torch.zeros(1, actor_critic.state_size)
    masks = torch.zeros(1, 1)

    def update_current_obs(obs):
        shape_dim0 = env.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    render_func('human')
    obs = env.reset()
    update_current_obs(obs)

    while True:
        value, action, _, states = actor_critic.act(Variable(current_obs,
                                                             volatile=True),
                                                    Variable(states,
                                                             volatile=True),
                                                    Variable(masks,
                                                             volatile=True),
                                                    deterministic=True)
        states = states.data
        cpu_actions = action.data.squeeze(1).cpu().numpy()

        # Observation, reward and next obs
        obs, reward, done, _ = env.step(cpu_actions)

        time.sleep(0.05)

        masks.fill_(0.0 if done else 1.0)

        if current_obs.dim() == 4:
            current_obs *= masks.unsqueeze(2).unsqueeze(2)
        else:
            current_obs *= masks
        update_current_obs(obs)

        renderer = render_func('human')

        if not renderer.window:
            sys.exit(0)
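The update_current_obs helper above keeps the last num_stack frames in one tensor by sliding the channel dimension. A standalone sketch of that mechanic with arbitrary shapes:

import torch

channels, num_stack = 1, 4
stacked = torch.zeros(1, num_stack * channels, 84, 84)   # [batch, stack*channels, H, W]

def push_frame(stacked, frame):
    # drop the oldest frame by shifting the remaining channels forward,
    # then write the newest frame into the last slot
    stacked[:, :-channels] = stacked[:, channels:]
    stacked[:, -channels:] = frame
    return stacked

stacked = push_frame(stacked, torch.rand(1, channels, 84, 84))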
Example #19
    def __init__(self, args):
        self.args = args
        self.env = make_env(args)
        self.env_test = make_env(args)

        self.info = []
        if args.save_acc:
            make_dir('log/accs', clear=False)
            self.test_rollouts = 100

            self.env_List = []
            self.env_test_List = []
            for _ in range(self.test_rollouts):
                self.env_List.append(make_env(args))
                self.env_test_List.append(make_env(args))

            self.acc_record = {}
            self.acc_record[self.args.goal] = []
            for key in self.acc_record.keys():
                self.info.append('Success/' + key + '@blue')
Example #20
    def __init__(self, individuals):
        self.render = False
        self.envs = [
            make_env("PommeFFAPartialFast-v0", 1, i, './tmp/gym/', False,
                     False) for i in range(individuals)
        ]
        # self.envs = make_vec_envs("PommeFFAPartialFast-v0", 1, individuals, 0.99, False, 1, './tmp/gym/', False, torch.device("cpu"), allow_early_resets=False)
        # self.env = cloudpickle.dumps(make_env("PommeFFAPartialFast-v0"))
        # if individuals == 1:
        #     self.env = make_env("PommeFFAPartialFast-v0")
        # else:
        #     self.env = [make_env("PommeFFAPartialFast-v0") for _ in range(individuals)]
        self.train = True
Example #21
def experiment_setup(args):
    env = make_env(args)
    args.acts_dims = env.acts_dims
    args.obs_dims = env.obs_dims

    args.buffer = buffer = create_buffer(args)
    args.agent = agent = create_agent(args)
    args.agent_graph = agent.graph
    args.learner = learner = create_learner(args)
    args.logger.info('*** network initialization complete ***')
    args.tester = tester = Tester(args)
    args.logger.info('*** tester initialization complete ***')

    return env, agent, buffer, learner, tester
Example #22
	def __init__(self, args, test_rollouts=100, after_train_test=False):
		self.args = args
		self.env = make_env(args)
		self.env_test = make_env(args)

		self.info = []
		self.calls = 0
		self.after_train_test = after_train_test
		if args.save_acc:
			make_dir('log/accs', clear=False)
			self.test_rollouts = test_rollouts

			self.env_List = []
			for _ in range(self.test_rollouts):
				self.env_List.append(make_env(args))

			self.acc_record = {}
			self.acc_record[self.args.goal] = []
			for key in self.acc_record.keys():
				self.info.append('Success'+'@blue')
				self.info.append('MaxDistance')
				self.info.append('MinDistance')

		self.coll_tol = 0 #this attribute is just used for tests after training
Example #23
    def make_envs(progress, ob_rms):
        envs = [
            make_env(changefun(env, progress), seed, i, log_dir, add_timestep)
            for i in range(num_processes)
        ]

        if num_processes > 1:
            envs = SubprocVecEnv(envs)
        else:
            envs = DummyVecEnv(envs)

        if len(envs.observation_space.shape) == 1:
            envs = VecNormalize(envs, gamma=gamma)
        if ob_rms is not None:
            envs.ob_rms = ob_rms
        return envs
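A hedged sketch of how make_envs could be reused across curriculum stages while carrying over the running observation statistics; the progress values and the envs.close() call are assumptions:

# first stage: nothing to carry over yet
envs = make_envs(0.0, None)

# later stage: rebuild the envs for the new difficulty, but reuse the
# observation-normalisation statistics so inputs stay on the same scale
prev_ob_rms = getattr(envs, 'ob_rms', None)
envs.close()  # assumed to exist on the vectorised env
envs = make_envs(0.5, prev_ob_rms)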
Example #24
def register_and_create_envs(id_tmp_dir, seed, environment, algorithm):
    """
    Args:
        id_tmp_dir (str): Working directory.
        All other args are automatically provided by Sacred.
    """

    if environment['entry_point']:
        try:
            register(id=environment['name'],
                     entry_point=environment['entry_point'],
                     kwargs=environment['config'],
                     max_episode_steps=environment['max_episode_steps'])
        except Exception:
            pass

    num_envs = algorithm['num_processes']
    num_expert_envs = algorithm['num_expert_processes']

    envs = [
        make_env(environment['name'],
                 seed,
                 i,
                 id_tmp_dir,
                 occlusion=list(environment['occlusion']),
                 sensor_noise=float(environment['sensor_noise']))
        for i in range(num_envs - num_expert_envs)
    ]

    # Create "expert environments" which only replay trajectories from an expert database, rather than interacting with Gym.
    # See expert_envs.py for the exposed API. These environments are appended to the list of the Gym environments.
    if num_expert_envs:
        e_envs = [
            make_expert_envs(environment['name'], seed, i, num_expert_envs,
                             environment['expert_db'])
            for i in range(num_expert_envs)
        ]
        envs.extend(e_envs)

    # Vectorise envs
    if algorithm['num_processes'] > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    return envs
Example #25
def evaluate_model_clipped(env,
                           model,
                           max_eval=20000,
                           env_seed=2018,
                           render=False,
                           cuda=False):
    env = make_env(env, env_seed, 0, None)()
    if isinstance(model, CompressedModel):
        model = uncompress_model(model)
    if cuda:
        model.cuda()
    obs_shape = env.observation_space.shape
    obs_shape = (obs_shape[0] * 4, *obs_shape[1:])
    current_obs = torch.zeros(1, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = env.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    if render: env.render()

    obs = env.reset()
    update_current_obs(obs)

    total_frames = 0
    model.eval()
    total_reward = 0.0
    for _ in range(max_eval):
        total_frames += 4
        cur_state_var = Variable(current_obs)
        if cuda:
            cur_state_var = cur_state_var.cuda()
        values = model(cur_state_var)[0]
        if cuda:
            values = values.cpu()
        action = np.argmax(values.data.numpy()[:env.action_space.n])
        new_state, reward, is_done, _ = step(env, action)
        total_reward += reward
        if is_done:
            break
        update_current_obs(new_state)
        if render: env.render()

    return total_reward, total_frames
Example #26
    def __init__(self, args):
        self.args = args
        self.env = make_env(args)
        self.info = []

        if args.save_rews:
            make_dir('log/rews', clear=False)
            self.rews_record = {}
            self.rews_record[args.env] = []

        if args.save_Q:
            make_dir('log/Q_std', clear=False)
            make_dir('log/Q_net', clear=False)
            make_dir('log/Q_ground', clear=False)
            self.Q_std_record, self.Q_net_record, self.Q_ground_record = {}, {}, {}
            self.Q_std_record[args.env], self.Q_net_record[
                args.env], self.Q_ground_record[args.env] = [], [], []
            self.info += ['Q_error/mean', 'Q_error/std']
Example #27
def eval_proc(file_name):
    print(file_name)
    f = open(os.path.join('./log', file_name), 'w+')
    for te in types:
        for ta in types:
            for role_id in [2, 3, 1]:
                agent = make_agent(ta, role_id)
                for i in range(1):
                    env = make_env(te)
                    st = StatCounter()
                    with get_tqdm(total=100) as pbar:
                        for j in range(100):
                            winning_rate = eval_episode(env, agent)
                            st.feed(winning_rate)
                            pbar.update()
                    f.write(
                        '%s with role id %d against %s, winning rate: %f\n' %
                        (ta, role_id, te, st.average))
    f.close()
def sample(runs, iternum, root=ROOT, number=None):
	dirname = os.path.join(root, f"iter{iternum}/")
	os.makedirs(os.path.dirname(dirname), exist_ok=True)
	rollout = RolloutCollector(dirname)
	env = make_env()
	state_size = env.observation_space.shape
	action_size = [env.action_space.n] if hasattr(env.action_space, 'n') else env.action_space.shape
	agent = RandomAgent(state_size, action_size) if iternum <= 0 else ControlAgent(state_size, action_size, gpu=False, load=f"{env_name}/iter{iternum-1}")
	for ep in range(runs):
		state = env.reset()
		total_reward = 0
		done = False
		while not done:
			env_action = agent.get_env_action(env, state)[0]
			state, reward, done, _ = env.step(env_action)
			rollout.step(env_action, state, reward, done, number)
			agent.train(state, env_action, state, reward, done)
			total_reward += reward
		print(f"Ep: {ep}, Reward: {total_reward}")
	env.close()
Example #29
def main():

    summary_dir = '/tmp/log'
    summary_dir = None  # summaries disabled; set a directory to re-enable the FileWriter below
    render = '--visualise' in sys.argv[1:]

    prepare_process(summary_dir=summary_dir)

    if summary_dir is not None:
        summary_writer = tf.summary.FileWriter(
            '{}/learn.summary'.format(summary_dir))
    else:
        summary_writer = None

    learn(
        env=envs.make_env(),
        render=render,
        save_path='/tmp/hra.sword',
        #restore_path='/tmp/hra.sword-10000',
        summary_writer=summary_writer)
Example #30
def create_environment(environment, add_timestep, tasks, seeds, seed):
    num_tasks = len(tasks)
    print("Creating environments...")
    print("Environment name: {}".format(environment))
    # env = make_env('TwoRooms-v2', seed=0, rank=0, add_timestep=False)
    envs = [
        make_env(environment, seed, i, add_timestep)
        for i in range(num_processes_per_task * num_tasks)
    ]
    envs = MTSubprocVecEnv(envs)

    # TODO: Replace with info dict!
    constraint = []
    start_constraint = []
    for task in tasks:
        constraint += [task] * num_processes_per_task
        start_constraint += [False] * num_processes_per_task

    seeds = envs.draw_and_set_task(constraint=constraint, seed=seeds)
    return envs, constraint
def train(model_dict):

    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(state).float()  # (processes, channels, height, width)
        # if num_stack > 1:
        # slide the stack forward: drop the oldest frame by shifting the remaining channels toward the front
        current_state[:, :-channels] = current_state[:, channels:] 
        current_state[:, -channels:] = state #last frame is now the new one

        return current_state


    def update_rewards(reward, done, final_rewards, episode_rewards, current_state):
        # Reward, Done: [P], [P]
        # final_rewards, episode_rewards: [P,1]. [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() #[P,1]
        episode_rewards += reward #keeps track of current episode cumulative reward
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #[P,1]
        final_rewards *= masks #erase the ones that are done
        final_rewards += (1 - masks) * episode_rewards  #set it to the cumulative episode reward
        episode_rewards *= masks #erase the done ones
        masks = masks.type(dtype) #cuda
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  #[P,1,1,1]
        else:
            current_state *= masks   #restart the done ones, by setting the state to zero
        return reward, masks, final_rewards, episode_rewards, current_state



    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
        model_dict['dtype']=dtype
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
        model_dict['dtype']=dtype


    # Create environments
    print (num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print ('Made dir', monitor_rewards_dir) 
    envs = SubprocVecEnv([make_env(env_name, seed, i, monitor_rewards_dir) for i in range(num_processes)])


    if vid_:
        print ('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    if gif_:
        print ('env for gif')
        envs_gif = make_env_basic(env_name)

    if ls_:
        print ('env for ls')
        envs_ls = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  #channels

    model_dict['obs_shape']=obs_shape
    model_dict['shape_dim0']=shape_dim0



    # # Create agent
    # if algo == 'a2c':
    #     agent = a2c(envs, model_dict)
    #     print ('init a2c agent')
    # elif algo == 'ppo':
    #     agent = ppo(envs, model_dict)
    #     print ('init ppo agent')
    # elif algo == 'a2c_minibatch':
    #     agent = a2c_minibatch(envs, model_dict)
    #     print ('init a2c_minibatch agent')
    # elif algo == 'a2c_list_rollout':
    #     agent = a2c_list_rollout(envs, model_dict)
    #     print ('init a2c_list_rollout agent')
    # elif algo == 'a2c_with_var':
    #     agent = a2c_with_var(envs, model_dict)
    #     print ('init a2c_with_var agent')

    # elif algo == 'a2c_bin_mask':
    #     agent = a2c_with_var(envs, model_dict)
    #     print ('init a2c_with_var agent')
    # agent = model_dict['agent'](envs, model_dict)

    # #Load model
    # if model_dict['load_params']:
    #     # agent.actor_critic = torch.load(os.path.join(args.load_path))
    #     # agent.actor_critic = torch.load(args.load_path).cuda()
        
    #     # print ('loaded ', args.load_path)

    #     if model_dict['load_number'] == 3:
    #         load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 3000160, model_dict)

    #     elif model_dict['load_number'] == 6:
    #         load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 6000160, model_dict)
    #     elif model_dict['load_number'] == 9:
    #         load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 9000160, model_dict)

    #     # else:
    #     #     load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 8000160, model_dict)
    #     else:
    #         PROBLEM


    print ('Init expert agent')
    expert_agent = a2c(envs, model_dict)
    param_file = home+'/Documents/tmp/breakout_2frames_leakyrelu2/BreakoutNoFrameskip-v4/A2C/seed0/model_params3/model_params9999360.pt'    
    param_dict = torch.load(param_file)
    expert_agent.actor_critic.load_state_dict(param_dict)
    print ('loaded params', param_file)
    expert_agent.actor_critic.cuda()



    print ('Init imitator agent')
    imitator_agent = a2c(envs, model_dict)
    # param_file = home+'/Documents/tmp/breakout_2frames_leakyrelu2/imitator_params.ckpt'  
    # param_dict = torch.load(param_file)
    # imitator_agent.actor_critic.load_state_dict(param_dict)
    # print ('loaded params', param_file)
    imitator_agent.actor_critic.cuda()







    agent = expert_agent
    expert_policy = expert_agent.actor_critic

    imitator_policy = imitator_agent.actor_critic
    optimizer = optim.Adam(imitator_policy.parameters(), lr=.0005, weight_decay=.00001)

    total_steps = 0

    display_step = 50






    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(current_state, state, shape_dim0).type(dtype) #add the new frame, remove oldest
    agent.insert_first_state(current_state) #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step 

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1]) #keeps track of current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes
    save_interval_num_updates = int(save_interval /num_processes/num_steps)




    #Begin training
    # count =0
    start = time.time()
    start2 = time.time()
    for j in range(num_updates):
        for step in range(num_steps):

            # Act, [P,1], [P], [P,1], [P]
            # value, action = agent.act(Variable(agent.rollouts.states[step], volatile=True))
            state__ = Variable(agent.rollouts.states[step]) / 255.
            value, action, action_log_probs, dist_entropy = agent.act(state__) #, requires_grad=False)#, volatile=True))
            # print (action_log_probs.size())
            # print (dist_entropy.size())





            batch = state__

            optimizer.zero_grad()

            log_dist_expert = expert_policy.action_logdist(batch)
            log_dist_imitator = imitator_policy.action_logdist(batch)

            action_dist_kl = torch.sum((log_dist_expert - log_dist_imitator)*torch.exp(log_dist_expert), dim=1) #[B]

            # elbo, logpx, logpz, logqz, action_dist_kl = self.forward(batch, policy, k=k)
            loss = torch.mean(action_dist_kl)

            loss.backward()
            # nn.utils.clip_grad_norm(self.parameters(), .5)
            optimizer.step()

            # if total_steps%display_step==0: # and batch_idx == 0:
            #     # print ('Train Epoch: {}/{}'.format(epoch+1, epochs),
            #         # 'total_epochs {}'.format(total_epochs),
            #         print('LL:{:.4f}'.format(loss.data[0])
            #         # 'logpx:{:.4f}'.format(logpx.data[0]),
            #         # 'logpz:{:.5f}'.format(logpz.data[0]),
            #         # 'logqz:{:.5f}'.format(logqz.data[0]),
            #         # 'action_kl:{:.4f}'.format(action_dist_kl.data[0])
            #         )

            # total_steps+=1






            cpu_actions = action.data.squeeze(1).cpu().numpy() #[P]
            # cpu_actions = action.data.cpu().numpy() #[P]
            # print (actions.size())

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions) 

            # Record rewards and update state
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(reward, done, final_rewards, episode_rewards, current_state)
            current_state = update_current_state(current_state, state, shape_dim0)

            # Agent record step
            # agent.insert_data(step, current_state, action.data, value.data, reward, masks, action_log_probs.data, dist_entropy.data)
            agent.insert_data(step, current_state, action.data, value, reward, masks, action_log_probs, dist_entropy) #, done)





        #Optimize agent


        agent.no_update()  #agent.update(j,num_updates)
        # agent.update()  #agent.update(j,num_updates)


        agent.insert_first_state(agent.rollouts.states[-1])


        # print ('save_interval_num_updates', save_interval_num_updates)
        # print ('num_updates', num_updates)
        # print ('j', j)
        total_num_steps = (j + 1) * num_processes * num_steps
        
        # if total_num_steps % save_interval == 0 and save_dir != "":
        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:

            save_to = home+'/Documents/tmp/breakout_2frames_leakyrelu2/imitator_params_env.ckpt'
            torch.save(imitator_policy.state_dict(), save_to)
            print ('saved imitator_policy', save_to)

            # #Save model
            # if save_params:
            #     do_params(save_dir, agent, total_num_steps, model_dict)
            #     # save_params_v2(save_dir, agent, total_num_steps, model_dict)

                
            #make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype, agent, model_dict, total_num_steps)
            #make gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state, update_rewards, total_num_steps)


        #Print updates
        if j % log_interval == 0:# and j!=0:
            end = time.time()

            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.1f}, {:.4f}".format(j, total_num_steps,
                                       final_rewards.min(),
                                       final_rewards.median(),
                                       final_rewards.mean(),
                                       final_rewards.max(),
                                       int(total_num_steps / (end - start)),
                                       end - start,
                                       end - start2, 
                                       loss.data[0])


            # to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.1f}".format(j, total_num_steps,
            #                            final_rewards.min(),
            #                            final_rewards.median(),
            #                            final_rewards.mean(),
            #                            final_rewards.max(),
            #                            int(total_num_steps / (end - start)),
            #                            end - start,
            #                            end - start2)


            print(to_print_info_string) 
            start2 = time.time()



            to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, Time"
            if j % (log_interval*30) == 0:
            
                if ls_:
                    do_ls(envs_ls, agent, model_dict, total_num_steps, update_current_state, update_rewards)
                # print("Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated, LS updated")
                # print(to_print_info_string + ' LS recorded')#, agent.current_lr)
                # else:
                #update plots
                try:
                    if ls_:
                        update_ls_plot(model_dict)
                    make_plots(model_dict)
                    print(to_print_legend_string + " Plot updated")
                except:
                    raise #pass
                    print(to_print_legend_string)



    try:
        make_plots(model_dict)
    except:
        print ()
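The imitation loss above is the KL divergence KL(expert || imitator) between the two categorical action distributions, written out by hand. A self-contained check that the hand-written expression matches torch.distributions (batch size and action count are arbitrary):

import torch
from torch.distributions import Categorical, kl_divergence

logits_expert = torch.randn(8, 6)       # [batch, n_actions]
logits_imitator = torch.randn(8, 6)

log_p = torch.log_softmax(logits_expert, dim=1)
log_q = torch.log_softmax(logits_imitator, dim=1)

# hand-written form used in the training loop above
kl_manual = torch.sum((log_p - log_q) * torch.exp(log_p), dim=1)

# reference computation
kl_ref = kl_divergence(Categorical(logits=logits_expert),
                       Categorical(logits=logits_imitator))

assert torch.allclose(kl_manual, kl_ref, atol=1e-5)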
Example #32
def train(model_dict):

    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(state).float()  # (processes, channels, height, width)
        # if num_stack > 1:
        # slide the stack forward: drop the oldest frame by shifting the remaining channels toward the front
        current_state[:, :-channels] = current_state[:, channels:] 
        current_state[:, -channels:] = state #last frame is now the new one

        return current_state


    def update_rewards(reward, done, final_rewards, episode_rewards, current_state):
        # Reward, Done: [P], [P]
        # final_rewards, episode_rewards: [P,1]. [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() #[P,1]
        episode_rewards += reward #keeps track of current episode cumulative reward
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #[P,1]
        final_rewards *= masks #erase the ones that are done
        final_rewards += (1 - masks) * episode_rewards  #set it to the cumulative episode reward
        episode_rewards *= masks #erase the done ones
        masks = masks.type(dtype) #cuda

        # print (current_state)
        # fdsf

        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  #[P,1,1,1]
        else:
            current_state *= masks   #restart the done ones, by setting the state to zero
        return reward, masks, final_rewards, episode_rewards, current_state



    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
        model_dict['dtype']=dtype
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
        model_dict['dtype']=dtype


    # Create environments
    print (num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print ('Made dir', monitor_rewards_dir) 
    envs = SubprocVecEnv([make_env(env_name, seed, i, monitor_rewards_dir) for i in range(num_processes)])


    if vid_:
        print ('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    if gif_:
        print ('env for gif')
        envs_gif = make_env_basic(env_name)

    if ls_:
        print ('env for ls')
        envs_ls = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  #channels

    model_dict['obs_shape']=obs_shape
    model_dict['shape_dim0']=shape_dim0

    action_size = envs.action_space.n
    model_dict['action_size']=action_size



    # Create agent
    if algo == 'a2c':
        agent = a2c(model_dict)
        print ('init a2c agent')
    # elif algo == 'ppo':
    #     agent = ppo(envs, model_dict)
    #     print ('init ppo agent')
    # elif algo == 'a2c_minibatch':
    #     agent = a2c_minibatch(envs, model_dict)
    #     print ('init a2c_minibatch agent')
    # elif algo == 'a2c_list_rollout':
    #     agent = a2c_list_rollout(envs, model_dict)
    #     print ('init a2c_list_rollout agent')
    # elif algo == 'a2c_with_var':
    #     agent = a2c_with_var(envs, model_dict)
    #     print ('init a2c_with_var agent')
    # elif algo == 'a2c_bin_mask':
    #     agent = a2c_with_var(envs, model_dict)
    #     print ('init a2c_with_var agent')
    # agent = model_dict['agent'](envs, model_dict)

    #Load model
    if model_dict['load_params']:
        # agent.actor_critic = torch.load(os.path.join(args.load_path))
        # agent.actor_critic = torch.load(args.load_path).cuda()
        
        # print ('loaded ', args.load_path)

        if model_dict['load_number'] == 3:
            load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 3000160, model_dict)

        elif model_dict['load_number'] == 6:
            load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 6000160, model_dict)
        elif model_dict['load_number'] == 9:
            load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 9000160, model_dict)

        # else:
        #     load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 8000160, model_dict)
        else:
            PROBLEM















    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(current_state, state, shape_dim0).type(dtype) #add the new frame, remove oldest
    agent.insert_first_state(current_state) #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step 

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1]) #keeps track of current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes
    save_interval_num_updates = int(save_interval /num_processes/num_steps)

    #Begin training
    # count =0
    start = time.time()
    start2 = time.time()
    for j in range(num_updates):
        for step in range(num_steps):

            # Act, [P,1], [P], [P,1], [P]
            # value, action = agent.act(Variable(agent.rollouts.states[step], volatile=True))
            value, action, action_log_probs, dist_entropy = agent.act(Variable(agent.rollouts.states[step]/255.))#, volatile=True))
            # print (action_log_probs.size())
            # print (dist_entropy.size())

            cpu_actions = action.data.squeeze(1).cpu().numpy() #[P]
            # cpu_actions = action.data.cpu().numpy() #[P]
            # print (actions.size())

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions) 


            # if np.sum(reward) > 0.:
            #     print (reward)
            #     afdas

            # Record rewards and update state
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(reward, done, final_rewards, episode_rewards, current_state)
            current_state = update_current_state(current_state, state, shape_dim0)

            # Agent record step
            # agent.insert_data(step, current_state, action.data, value.data, reward, masks, action_log_probs.data, dist_entropy.data)
            agent.insert_data(step, current_state, action.data, value, reward, masks, action_log_probs, dist_entropy) #, done)





        #Optimize agent
        agent.update()  #agent.update(j,num_updates)
        agent.insert_first_state(agent.rollouts.states[-1])


        # print ('save_interval_num_updates', save_interval_num_updates)
        # print ('num_updates', num_updates)
        # print ('j', j)
        total_num_steps = (j + 1) * num_processes * num_steps
        
        # if total_num_steps % save_interval == 0 and save_dir != "":
        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:

            #Save model
            if save_params:
                # do_params(save_dir, agent, total_num_steps, model_dict)
                # save_params_v2(save_dir, agent, total_num_steps, model_dict)
                save_params_v3(save_dir, agent, total_num_steps, model_dict)
            #make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype, agent, model_dict, total_num_steps)
            #make gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state, update_rewards, total_num_steps)


        #Print updates
        if j % log_interval == 0:# and j!=0:
            end = time.time()

            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.1f}".format(j, total_num_steps,
                                       final_rewards.min(),
                                       final_rewards.median(),
                                       final_rewards.mean(),
                                       final_rewards.max(),
                                       int(total_num_steps / (end - start)),
                                       end - start,
                                       end - start2)
            print(to_print_info_string) 
            start2 = time.time()



            to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, Time"
            if j % (log_interval*30) == 0:
            
                if ls_:
                    do_ls(envs_ls, agent, model_dict, total_num_steps, update_current_state, update_rewards)
                # print("Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated, LS updated")
                # print(to_print_info_string + ' LS recorded')#, agent.current_lr)
                # else:
                #update plots
                try:
                    if ls_:
                        update_ls_plot(model_dict)
                    make_plots(model_dict)
                    print(to_print_legend_string + " Plot updated")
                except:
                    raise #pass
                    print(to_print_legend_string)



    try:
        make_plots(model_dict)
    except:
        print ()
Example #33
def train(model_dict):

    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(state).float()  # (processes, channels, height, width)
        # if num_stack > 1:
        # slide the stack forward: drop the oldest frame by shifting the remaining channels toward the front
        current_state[:, :-channels] = current_state[:, channels:] 
        current_state[:, -channels:] = state #last frame is now the new one

        return current_state


    def update_rewards(reward, done, final_rewards, episode_rewards, current_state):
        # Reward, Done: [P], [P]
        # final_rewards, episode_rewards: [P,1]. [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() #[P,1]
        episode_rewards += reward #keeps track of current episode cumulative reward
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #[P,1]
        final_rewards *= masks #erase the ones that are done
        final_rewards += (1 - masks) * episode_rewards  #set it to the cumulative episode reward
        episode_rewards *= masks #erase the done ones
        masks = masks.type(dtype) #cuda
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  #[P,1,1,1]
        else:
            current_state *= masks   #restart the done ones, by setting the state to zero
        return reward, masks, final_rewards, episode_rewards, current_state



    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
        model_dict['dtype']=dtype
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
        model_dict['dtype']=dtype


    # Create environments
    print (num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print ('Made dir', monitor_rewards_dir) 
    envs = SubprocVecEnv([make_env(env_name, seed, i, monitor_rewards_dir) for i in range(num_processes)])


    if vid_:
        print ('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    if gif_:
        print ('env for gif')
        envs_gif = make_env_basic(env_name)

    if ls_:
        print ('env for ls')
        envs_ls = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  #channels

    model_dict['obs_shape']=obs_shape
    model_dict['shape_dim0']=shape_dim0



    # Create agent
    if algo == 'a2c':
        agent = a2c(envs, model_dict)
        print ('init a2c agent')
    # elif algo == 'ppo':
    #     agent = ppo(envs, model_dict)
    #     print ('init ppo agent')
    # elif algo == 'a2c_minibatch':
    #     agent = a2c_minibatch(envs, model_dict)
    #     print ('init a2c_minibatch agent')
    # elif algo == 'a2c_list_rollout':
    #     agent = a2c_list_rollout(envs, model_dict)
    #     print ('init a2c_list_rollout agent')
    # elif algo == 'a2c_with_var':
    #     agent = a2c_with_var(envs, model_dict)
    #     print ('init a2c_with_var agent')
    # elif algo == 'a2c_bin_mask':
    #     agent = a2c_with_var(envs, model_dict)
    #     print ('init a2c_with_var agent')
    # agent = model_dict['agent'](envs, model_dict)

    # #Load model
    # if model_dict['load_params']:
    #     # agent.actor_critic = torch.load(os.path.join(args.load_path))
    #     # agent.actor_critic = torch.load(args.load_path).cuda()
        
    #     # print ('loaded ', args.load_path)

    #     if model_dict['load_number'] == 3:
    #         load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 3000160, model_dict)

    #     elif model_dict['load_number'] == 6:
    #         load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 6000160, model_dict)
    #     elif model_dict['load_number'] == 9:
    #         load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 9000160, model_dict)

    #     # else:
    #     #     load_params_v2(home+'/Documents/tmp/confirm_works_1_withsaving/PongNoFrameskip-v4/a2c/seed0/', agent, 8000160, model_dict)
    #     else:
    #         PROBLEM






    #load model
    # if model_dict['load_params']:

    # load_params(thigns)
    # param_file = home+'/Documents/tmp/breakout_2frames/BreakoutNoFrameskip-v4/A2C/seed0/model_params/model_params9999360.pt'
    param_file = home+'/Documents/tmp/RoadRunner/RoadRunnerNoFrameskip-v4/A2C/seed1/model_params3/model_params9999360.pt'


    # pretrained_dict = torch.load(param_file)  # object
    # print (pretrained_dict)
    # agent_dict = agent.actor_critic.state_dict()  #dict
    # print (agent_dict.keys())
    # agent_dict.update(pretrained_dict)
    # # agent_dict.update(agent.actor_critic)
    # agent.actor_critic.load_state_dict(agent_dict)


    param_dict = torch.load(param_file)
    agent.actor_critic.load_state_dict(param_dict)


    # agent.actor_critic = torch.load(param_file)
    if cuda:
        agent.actor_critic.cuda()
    print ('loaded', param_file)



    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(current_state, state, shape_dim0).type(dtype) #add the new frame, remove oldest
    agent.insert_first_state(current_state) #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step 

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1]) #keeps track of current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes
    save_interval_num_updates = int(save_interval /num_processes/num_steps)
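    # Worked example with hypothetical settings: num_frames=10e6, num_steps=5 and
    # num_processes=16 give 5*16=80 env steps per update, so
    # num_updates = 10_000_000 // 5 // 16 = 125_000, and with save_interval=1e6
    # the model is saved every 1_000_000 / 80 = 12_500 updates.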

    # list of lists, where the inner lists are trajectories; each trajectory holds actions and states
    dataset = []
    tmp_trajs = [[] for x in range(num_processes)]
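    # Data layout: tmp_trajs[p] accumulates [action, state] pairs for process p's
    # current episode; once that episode ends the whole list is appended to
    # `dataset`, and dataset_count tracks the total number of stored transitions.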


    dataset_count = 0


    done = [0]*num_processes
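    # `done` is read inside the rollout loop before the first envs.step call,
    # so it is pre-initialised to "not done" for every process.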

    # Begin data collection (the loaded policy is only used to act; it is not updated)
    start = time.time()
    start2 = time.time()
    for j in range(num_updates):
        for step in range(num_steps):

            # Act, [P,1], [P], [P,1], [P]
            # value, action = agent.act(Variable(agent.rollouts.states[step], volatile=True))
            value, action, action_log_probs, dist_entropy = agent.act(Variable(agent.rollouts.states[step]))#, volatile=True))
            # print (action_log_probs.size())
            # print (dist_entropy.size())

            cpu_actions = action.data.squeeze(1).cpu().numpy() #[P]
            # cpu_actions = action.data.cpu().numpy() #[P]
            # print (actions.size())






            states_ = agent.rollouts.states[step].cpu().numpy()  #[P,S,84,84]
            # print (state_t.shape)
            actions_ = action.data.cpu().numpy() #[P,1]
            # print (action)
            #store step
            for proc in range(num_processes):

                #add states
                state_t = states_[proc]
                action_t = actions_[proc]
                tmp_trajs[proc].append([action_t, state_t])

                if done[proc]:

                    dataset.append(tmp_trajs[proc])
                    dataset_count += len(tmp_trajs[proc])
                    tmp_trajs[proc] = []

                    for ii in range(len(dataset)):
                        print (len(dataset[ii]))


            if dataset_count > 10000:

                # pickle.dump( dataset, open(home+'/Documents/tmp/breakout_2frames/breakout_trajectories_10000.pkl', "wb" ) )
                pickle.dump( dataset, open(home+'/Documents/tmp/RoadRunner/trajectories_10000.pkl', "wb" ) )

                print('saved')
                # pickle.save(dataset)
                # enough transitions collected; halt the run deliberately
                raise SystemExit('dataset saved, stopping collection')





            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions) 







            # Record rewards and update state
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(reward, done, final_rewards, episode_rewards, current_state)
            current_state = update_current_state(current_state, state, shape_dim0)

            # Agent record step
            # agent.insert_data(step, current_state, action.data, value.data, reward, masks, action_log_probs.data, dist_entropy.data)
            agent.insert_data(step, current_state, action.data, value, reward, masks, action_log_probs, dist_entropy) #, done)






        # print (len(dataset))
        # print ()





        # No optimisation step: the loaded policy only generates data.
        # agent.update()  #agent.update(j,num_updates)
        # Carry the last state of this rollout over as the first state of the next one.
        agent.insert_first_state(agent.rollouts.states[-1])


        # print ('save_interval_num_updates', save_interval_num_updates)
        # print ('num_updates', num_updates)
        # print ('j', j)
        total_num_steps = (j + 1) * num_processes * num_steps
        
        # if total_num_steps % save_interval == 0 and save_dir != "":
        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:

            #Save model
            if save_params:
                do_params(save_dir, agent, total_num_steps, model_dict)
                # save_params_v2(save_dir, agent, total_num_steps, model_dict)
            #make video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype, agent, model_dict, total_num_steps)
            #make gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state, update_rewards, total_num_steps)


        #Print updates
        if j % log_interval == 0:# and j!=0:
            end = time.time()

            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.1f}".format(j, total_num_steps,
                                       final_rewards.min(),
                                       final_rewards.median(),
                                       final_rewards.mean(),
                                       final_rewards.max(),
                                       int(total_num_steps / (end - start)),
                                       end - start,
                                       end - start2)
            print(to_print_info_string) 
            start2 = time.time()



            to_print_legend_string = "Updates, n_timesteps, min/med/mean/max reward, FPS, total_time, interval_time"
            if j % (log_interval*30) == 0:
            
                if ls_:
                    do_ls(envs_ls, agent, model_dict, total_num_steps, update_current_state, update_rewards)
                # print("Upts, n_timesteps, min/med/mean/max, FPS, Time, Plot updated, LS updated")
                # print(to_print_info_string + ' LS recorded')#, agent.current_lr)
                # else:
                #update plots
                try:
                    if ls_:
                        update_ls_plot(model_dict)
                    make_plots(model_dict)
                    print(to_print_legend_string + " Plot updated")
                except Exception:
                    print(to_print_legend_string)
                    raise  # change to `pass` to keep running when plotting fails



    try:
        make_plots(model_dict)
    except Exception:
        print ('final make_plots call failed')


def viz(model_dict):

    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(state).float()  # (processes, channels, height, width)
        # if num_stack > 1:
        #first stack*channel-channel frames = last stack*channel-channel , so slide them forward
        current_state[:, :-channels] = current_state[:, channels:] 
        current_state[:, -channels:] = state #last frame is now the new one
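        # e.g. with channels=1 and num_stack=4 the stacked state
        # [f_{t-3}, f_{t-2}, f_{t-1}, f_t] becomes [f_{t-2}, f_{t-1}, f_t, f_{t+1}]
        # after inserting the new frame f_{t+1}.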



        return current_state


    def update_rewards(reward, done, final_rewards, episode_rewards, current_state):
        # Reward, Done: [P], [P]
        # final_rewards, episode_rewards: [P,1]. [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() #[P,1]
        episode_rewards += reward #keeps track of current episode cumulative reward
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #[P,1]
        final_rewards *= masks #erase the ones that are done
        final_rewards += (1 - masks) * episode_rewards  #set it to the cumulative episode reward
        episode_rewards *= masks #erase the done ones
        masks = masks.type(dtype) #cuda
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  #[P,1,1,1]
        else:
            current_state *= masks   #restart the done ones, by setting the state to zero
        return reward, masks, final_rewards, episode_rewards, current_state
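    # Note: masks[p] is 0.0 exactly when process p finished its episode this step, so
    # the multiplications above write the finished episode's cumulative reward into
    # final_rewards, reset episode_rewards, and zero that process's stacked state
    # before the next frame is written in.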

    def do_vid():
        n_vids=3
        for i in range(n_vids):
            done=False
            state = envs_video.reset()
            # state = torch.from_numpy(state).float().type(dtype)
            current_state = torch.zeros(1, *obs_shape)
            current_state = update_current_state(current_state, state, shape_dim0).type(dtype)
            # print ('Recording')
            # count=0
            while not done:
                # print (count)
                # count +=1
                # Act
                state_var = Variable(current_state, volatile=True) 
                # print (state_var.size())
                action, value = agent.act(state_var)
                cpu_actions = action.data.squeeze(1).cpu().numpy()

                # Observe reward and next state
                state, reward, done, info = envs_video.step(cpu_actions) # state:[nProcesss, ndims, height, width]
                # state = torch.from_numpy(state).float().type(dtype)
                # current_state = torch.zeros(1, *obs_shape)
                current_state = update_current_state(current_state, state, shape_dim0).type(dtype)
        state = envs_video.reset()
        
        vid_path = save_dir+'/videos/'
        count =0
        for aaa in os.listdir(vid_path):

            if 'openaigym' in aaa and '.mp4' in aaa:
                #os.rename(vid_path+aaa, vid_path+'vid_t'+str(total_num_steps)+'.mp4')
                subprocess.call("(cd "+vid_path+" && mv "+ vid_path+aaa +" "+ vid_path+env_name+'_'+algo+'_vid_t'+str(total_num_steps)+'_'+str(count) +".mp4)", shell=True) 
                count+=1
            if '.json' in aaa:
                os.remove(vid_path+aaa)




    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    
    num_processes = 1
    model_dict['num_processes'] = 1
    
    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor


    # Create environments
    print (num_processes, 'processes')
    # monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    # if not os.path.exists(monitor_rewards_dir):
    #     os.makedirs(monitor_rewards_dir)
    #     print ('Made dir', monitor_rewards_dir) 

    monitor_rewards_dir = ''
    envs = SubprocVecEnv([make_env(env_name, seed, i, monitor_rewards_dir) for i in range(num_processes)])


    vid_ = 0
    see_frames = 1

    if vid_:
        print ('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  #channels

    model_dict['obs_shape']=obs_shape


    # Create agent
    if algo == 'a2c':
        agent = a2c(envs, model_dict)
        print ('init a2c agent')
    elif algo == 'ppo':
        agent = ppo(envs, model_dict)
        print ('init ppo agent')
    elif algo == 'a2c_minibatch':
        agent = a2c_minibatch(envs, model_dict)
        print ('init a2c_minibatch agent')
    # agent = model_dict['agent'](envs, model_dict)




    #Load model
    # if args.load_path != '':
        # agent.actor_critic = torch.load(os.path.join(args.load_path))

    epoch_level = 1e6  # checkpoint (in env steps) to load
    model_params_file = save_dir + '/model_params/model_params'+str(int(epoch_level))+'.pt'
    agent.actor_critic = torch.load(model_params_file)
    if cuda:
        agent.actor_critic = agent.actor_critic.cuda()
    print ('loaded ', model_params_file)


    frame_path = save_dir+'/frames/'
    if not os.path.exists(frame_path):
        os.makedirs(frame_path)
        print ('Made dir', frame_path)




    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(current_state, state, shape_dim0).type(dtype) #add the new frame, remove oldest
    agent.insert_first_state(current_state) #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step 

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1]) #keeps track of current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes

    # Begin visualisation rollouts (the loaded policy is not updated)
    count =0
    start = time.time()
    for j in range(num_updates):
        for step in range(num_steps):

            # if see_frames:
            #Grayscale
            # save_frame(state, count)




            # #RGB
            # state = envs.render()
            # print(state.shape)
            # fdsafa


            values = []
            actions = []
            for ii in range(100):
                # Act, [P,1], [P,1]
                action, value = agent.act(Variable(agent.rollouts.states[step], volatile=True))
                val = value.data.cpu().numpy()[0][0]
                act_ = action.data.cpu().numpy()[0][0]
                # print ('value', val)
                # print ('action', act_)
                values.append(val)
                actions.append(act_)

            # print ('values', values)
            # print ('actions', actions)

            rows = 1
            cols = 2

            fig = plt.figure(figsize=(8,4), facecolor='white')

            # plot frame
            ax = plt.subplot2grid((rows,cols), (0,0), frameon=False)

            state1 = np.squeeze(state[0])
            ax.imshow(state1, cmap='gray')
            ax.set_xticks([])
            ax.set_yticks([])
            # ax.savefig(frame_path+'frame' +str(count)+'.png')
            # print ('saved',frame_path+'frame' +str(count)+'.png')
            # plt.close(fig)


            #plot values histogram
            ax = plt.subplot2grid((rows,cols), (0,1), frameon=False)

            weights = np.ones_like(values)/float(len(values))
            ax.hist(values, 50, range=[0.0, 4.], weights=weights)
            # ax.set_ylim(top=1.)
            ax.set_ylim([0.,1.])

            plt_path = frame_path+'plt' 
            plt.savefig(plt_path+str(count)+'.png')
            print ('saved',plt_path+str(count)+'.png')
            plt.close(fig)



            count+=1
            if count > 2:
                if done[0] or count > max_frames:
                    # episode finished or frame budget reached; stop rendering frames
                    raise SystemExit('done saving frames')





                # action, value = agent.act(Variable(agent.rollouts.states[step], volatile=True))
                # print ('value', value)
                # print ('action', action)

                # action, value = agent.act(Variable(agent.rollouts.states[step], volatile=True))
                # print ('value', value)
                # print ('action', action)


            
            cpu_actions = action.data.squeeze(1).cpu().numpy() #[P]

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions) 



            # Record rewards
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(reward, done, final_rewards, episode_rewards, current_state)
            
            # Update state
            current_state = update_current_state(current_state, state, shape_dim0)

            # Agent record step
            agent.insert_data(step, current_state, action.data, value.data, reward, masks)



        # #Optimize agent
        # agent.update()  #agent.update(j,num_updates)
        # agent.insert_first_state(agent.rollouts.states[-1])




        total_num_steps = (j + 1) * num_processes * num_steps


def viz(model_dict):

    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(state).float()  # (processes, channels, height, width)
        # if num_stack > 1:
        #first stack*channel-channel frames = last stack*channel-channel , so slide them forward
        current_state[:, :-channels] = current_state[:, channels:] 
        current_state[:, -channels:] = state #last frame is now the new one



        return current_state


    def update_rewards(reward, done, final_rewards, episode_rewards, current_state):
        # Reward, Done: [P], [P]
        # final_rewards, episode_rewards: [P,1]. [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() #[P,1]
        episode_rewards += reward #keeps track of current episode cumulative reward
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #[P,1]
        final_rewards *= masks #erase the ones that are done
        final_rewards += (1 - masks) * episode_rewards  #set it to the cumulative episode reward
        episode_rewards *= masks #erase the done ones
        masks = masks.type(dtype) #cuda
        if current_state.dim() == 4:  # if state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  #[P,1,1,1]
        else:
            current_state *= masks   #restart the done ones, by setting the state to zero
        return reward, masks, final_rewards, episode_rewards, current_state




    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    
    num_processes = 1
    model_dict['num_processes'] = 1
    model_dict['num_steps'] = max_frames
    num_steps = max_frames
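    # num_steps is set to max_frames (assumed to be defined at module scope) so an
    # entire visualisation run fits into a single rollout; its returns are printed
    # once the frame budget is reached.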
    
    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor


    # Create environments
    print (num_processes, 'processes')

    monitor_rewards_dir = ''
    envs = SubprocVecEnv([make_env(env_name, seed, i, monitor_rewards_dir) for i in range(num_processes)])


    vid_ = 0
    see_frames = 1

    if vid_:
        print ('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  #channels

    model_dict['obs_shape']=obs_shape


    # Create agent
    if algo == 'a2c':
        agent = a2c(envs, model_dict)
        print ('init a2c agent')
    elif algo == 'ppo':
        agent = ppo(envs, model_dict)
        print ('init ppo agent')
    elif algo == 'a2c_minibatch':
        agent = a2c_minibatch(envs, model_dict)
        print ('init a2c_minibatch agent')
    # agent = model_dict['agent'](envs, model_dict)




    #Load model
    epoch_level = 1e6  # checkpoint (in env steps) to load; value assumed, mirroring the viz above
    model_params_file = save_dir + '/model_params/model_params'+str(int(epoch_level))+'.pt'
    agent.actor_critic = torch.load(model_params_file)
    if cuda:
        agent.actor_critic = agent.actor_critic.cuda()
    print ('loaded ', model_params_file)


    frame_path = save_dir+'/frames/'
    if not os.path.exists(frame_path):
        os.makedirs(frame_path)
        print ('Made dir', frame_path)




    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(current_state, state, shape_dim0).type(dtype) #add the new frame, remove oldest
    agent.insert_first_state(current_state) #storage has states: (num_steps + 1, num_processes, *obs_shape), set first step 

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1]) #keeps track of current episode cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes

    # Begin visualisation rollouts (the loaded policy is not updated)
    count =0
    start = time.time()
    for j in range(num_updates):
        for step in range(num_steps):

            # if see_frames:
            #Grayscale
            # save_frame(state, count)




            # #RGB
            # state = envs.render()
            # print(state.shape)
            # fdsafa


            # (reference: gym's Atari env)
            # def get_action_meanings(self):
            #     return [ACTION_MEANING[i] for i in self._action_set]

            # print (envs.get_action_meanings())

            # print (agent.rollouts.states[step].size())


            

            # print ('values', values)
            # print ('actions', actions)





            # rows = 1
            # cols = 3

            # fig = plt.figure(figsize=(8,4), facecolor='white')

            # # plot frame
            # ax = plt.subplot2grid((rows,cols), (0,0), frameon=False)

            # state1 = np.squeeze(state[0])
            # ax.imshow(state1, cmap='gray')
            # ax.set_xticks([])
            # ax.set_yticks([])
            # # ax.savefig(frame_path+'frame' +str(count)+'.png')
            # # print ('saved',frame_path+'frame' +str(count)+'.png')
            # # plt.close(fig)
            # ax.set_title('State',family='serif')





            # #plot values histogram
            # ax = plt.subplot2grid((rows,cols), (0,2), frameon=False)

            # values = []
            # actions = []
            # for ii in range(100):
            #     # Act, [P,1], [P,1]
            #     action, value = agent.act(Variable(agent.rollouts.states[step], volatile=True))
            #     val = value.data.cpu().numpy()[0][0]
            #     act_ = action.data.cpu().numpy()[0][0]
            #     # print ('value', val)
            #     # print ('action', act_)
            #     values.append(val)
            #     actions.append(act_)


            # weights = np.ones_like(values)/float(len(values))
            # ax.hist(values, 50, range=[0.0, 4.], weights=weights)
            # # ax.set_ylim(top=1.)
            # ax.set_ylim([0.,1.])

            # ax.set_title('Value',family='serif')







            # #plot actions
            # ax = plt.subplot2grid((rows,cols), (0,1), frameon=False)

            # action_prob = agent.actor_critic.action_dist(Variable(agent.rollouts.states[step], volatile=True))
            # action_prob = np.squeeze(action_prob.data.cpu().numpy())
            # action_size = envs.action_space.n

            # # print (action_prob.shape)

            # ax.bar(range(action_size), action_prob)

            # ax.set_title('Action',family='serif')
            # # ax.set_xticklabels(['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE'])
            # plt.xticks(range(action_size),['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'R_FIRE', 'L_FIRE'], fontsize=6)
            # ax.set_ylim([0.,1.])



            # # print (action_prob)
            # # ['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE']
            # # fdsfas

            # plt.tight_layout(pad=3., w_pad=2.5, h_pad=1.0)

            # plt_path = frame_path+'plt' 
            # plt.savefig(plt_path+str(count)+'.png')
            # print ('saved',plt_path+str(count)+'.png')
            # plt.close(fig)
            # # fsadf




            count+=1
            if count % 10 ==0:
                print (count)

            if count > 2:
                if reward.cpu().numpy() > 0:
                    # print (, reward.cpu().numpy(), count)
                    print (done[0],masks.cpu().numpy(), reward.cpu().numpy(),'reward!!', step)
                    print (np.squeeze(agent.rollouts.rewards.cpu().numpy()))
                else:
                    print (done[0],masks.cpu().numpy(), reward.cpu().numpy())


                # if done[0] or count > max_frames:
                if count > max_frames:

                    next_value = agent.actor_critic(Variable(agent.rollouts.states[-1], volatile=True))[0].data
                    agent.rollouts.compute_returns(next_value, agent.use_gae, agent.gamma, agent.tau)
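                    # compute_returns fills rollouts.returns with the (optionally GAE)
                    # discounted returns bootstrapped from next_value; the loop below
                    # prints each step's return next to its raw reward.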

                    rollouts_ =  np.squeeze(agent.rollouts.returns.cpu().numpy())
                    rewards_ =  np.squeeze(agent.rollouts.rewards.cpu().numpy())


                    for jj in range(len(rollouts_)):

                        print (jj, rollouts_[jj], rewards_[jj])
                    # full rollout printed; stop the visualisation run
                    raise SystemExit('done')






                # action, value = agent.act(Variable(agent.rollouts.states[step], volatile=True))
                # print ('value', value)
                # print ('action', action)

                # action, value = agent.act(Variable(agent.rollouts.states[step], volatile=True))
                # print ('value', value)
                # print ('action', action)


            action, value = agent.act(Variable(agent.rollouts.states[step], volatile=True))

            cpu_actions = action.data.squeeze(1).cpu().numpy() #[P]

            # Step, S:[P,C,H,W], R:[P], D:[P]
            state, reward, done, info = envs.step(cpu_actions) 



            # Record rewards
            reward, masks, final_rewards, episode_rewards, current_state = update_rewards(reward, done, final_rewards, episode_rewards, current_state)
            
            # Update state
            current_state = update_current_state(current_state, state, shape_dim0)

            # Agent record step
            agent.insert_data(step, current_state, action.data, value.data, reward, masks)


            # print (reward)






        total_num_steps = (j + 1) * num_processes * num_steps