Example no. 1
    def __init__(self, env_fns, spaces=None):
        """
        env_fns: list of callables, each creating one gym environment to run in a subprocess
        """
        self.waiting = False
        self.closed = False
        nenvs = len(env_fns)
        self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)])
        #set_start_method('forkserver')
        set_start_method('spawn')
        #set_start_method('fork')
        self.ps = [
            Process(target=worker,
                    args=(work_remote, remote, CloudpickleWrapper(env_fn)))
            for (work_remote, remote,
                 env_fn) in zip(self.work_remotes, self.remotes, env_fns)
        ]
        for p in self.ps:
            p.daemon = True  # if the main process crashes, we should not cause things to hang
            p.start()
        for remote in self.work_remotes:
            remote.close()

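        # Query the first worker for the observation/action spaces so the base VecEnv can be initialized with them.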
        self.remotes[0].send(('get_spaces', None))
        observation_space, action_space = self.remotes[0].recv()
        VecEnv.__init__(self, len(env_fns), observation_space, action_space)
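The constructor above relies on a worker function and a CloudpickleWrapper helper that are not shown in this snippet. The following is a minimal sketch of both, following the OpenAI baselines SubprocVecEnv convention; treat it as illustrative rather than the exact implementation used here.

import pickle
import cloudpickle

class CloudpickleWrapper(object):
    """Serialize with cloudpickle so that lambdas/closures survive the trip to the child process."""
    def __init__(self, x):
        self.x = x
    def __getstate__(self):
        return cloudpickle.dumps(self.x)
    def __setstate__(self, ob):
        self.x = pickle.loads(ob)

def worker(remote, parent_remote, env_fn_wrapper):
    # Runs in the child process: build the env, then serve commands arriving over the pipe.
    parent_remote.close()
    env = env_fn_wrapper.x()
    while True:
        cmd, data = remote.recv()
        if cmd == 'step':
            ob, reward, done, info = env.step(data)
            if done:
                ob = env.reset()
            remote.send((ob, reward, done, info))
        elif cmd == 'reset':
            remote.send(env.reset())
        elif cmd == 'get_spaces':
            remote.send((env.observation_space, env.action_space))
        elif cmd == 'close':
            remote.close()
            break
        else:
            raise NotImplementedError(cmd)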
Example no. 2
    def start_interaction(self, env_fns, dynamics, nlump=2):
        self.loss_names, self._losses = zip(*list(self.to_report.items()))

        params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        if MPI.COMM_WORLD.Get_size() > 1:
            trainer = MpiAdamOptimizer(learning_rate=self.ph_lr,
                                       comm=MPI.COMM_WORLD)
        else:
            trainer = tf.train.AdamOptimizer(learning_rate=self.ph_lr)
        gradsandvars = trainer.compute_gradients(self.total_loss, params)
        self._train = trainer.apply_gradients(gradsandvars)

        if MPI.COMM_WORLD.Get_rank() == 0:
            getsess().run(
                tf.variables_initializer(
                    tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)))
        bcast_tf_vars_from_root(
            getsess(), tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))

        self.all_visited_rooms = []
        self.all_scores = []
        self.nenvs = nenvs = len(env_fns)
        self.nlump = nlump
        self.lump_stride = nenvs // self.nlump
        self.envs = [
            VecEnv(env_fns[l * self.lump_stride:(l + 1) * self.lump_stride],
                   spaces=[self.ob_space, self.ac_space])
            for l in range(self.nlump)
        ]

        self.rollout = Rollout(hps=self.hps,
                               ob_space=self.ob_space,
                               ac_space=self.ac_space,
                               nenvs=nenvs,
                               nsteps_per_seg=self.nsteps_per_seg,
                               nsegs_per_env=self.nsegs_per_env,
                               nlumps=self.nlump,
                               envs=self.envs,
                               policy=self.stochpol,
                               int_rew_coeff=self.int_coeff,
                               ext_rew_coeff=self.ext_coeff,
                               record_rollouts=self.use_recorder,
                               dynamics=dynamics)

        self.buf_advs = np.zeros((nenvs, self.rollout.nsteps), np.float32)
        self.buf_advs_int = np.zeros((nenvs, self.rollout.nsteps), np.float32)
        self.buf_advs_ext = np.zeros((nenvs, self.rollout.nsteps), np.float32)

        self.buf_rets = np.zeros((nenvs, self.rollout.nsteps), np.float32)
        self.buf_rets_int = np.zeros((nenvs, self.rollout.nsteps), np.float32)
        self.buf_rets_ext = np.zeros((nenvs, self.rollout.nsteps), np.float32)

        if self.normrew:
            self.rff = RewardForwardFilter(self.gamma)
            self.rff_rms = RunningMeanStd()

        self.step_count = 0
        self.t_last_update = time.time()
        self.t_start = time.time()
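When self.normrew is set, this pair of objects is typically used to normalize rewards by the standard deviation of a running discounted return, as in the large-scale-curiosity/RND codebases this snippet resembles. A sketch of that assumed usage:

import numpy as np  # used in the normalization sketch in the comments below

class RewardForwardFilter(object):
    # Keeps a per-env discounted sum of past rewards, used only for scaling (not as a learning target).
    def __init__(self, gamma):
        self.rewems = None
        self.gamma = gamma

    def update(self, rews):
        if self.rewems is None:
            self.rewems = rews
        else:
            self.rewems = self.rewems * self.gamma + rews
        return self.rewems

# During an update, rewards of shape (nenvs, nsteps) would then be normalized roughly like:
#   rffs = np.array([rff.update(step_rews) for step_rews in rews.T])  # walk forward over timesteps
#   rff_rms.update(rffs.ravel())
#   rews = rews / np.sqrt(rff_rms.var)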
Example no. 3
 def __init__(self, params):
     print("----- Initiating ----- ")
     print("----- step 1 configure logger")
     configure_logger(params['output_dir'])
     log('Parameters {}'.format(params))
     self.params = params
     print("----- step 2 load pre-collected things")
     self.binding = load_bindings(params['rom_file_path'])
     self.max_word_length = self.binding['max_word_length']
     self.sp = spm.SentencePieceProcessor()
     self.sp.Load(params['spm_file'])
     print("----- step 3 build KGA2CEnv")
     kg_env = KGA2CEnv(params['rom_file_path'],
                       params['seed'],
                       self.sp,
                       params['tsv_file'],
                       step_limit=params['reset_steps'],
                       stuck_steps=params['stuck_steps'],
                       gat=params['gat'])
     self.vec_env = VecEnv(params['batch_size'], kg_env,
                           params['openie_path'])
     print("----- step 4 build FrotzEnv and templace generator")
     env = FrotzEnv(params['rom_file_path'])
     self.vocab_act, self.vocab_act_rev = load_vocab(env)
     self.template_generator = TemplateActionGenerator(self.binding)
     print("----- step 5 build kga2c model")
     self.model = KGA2C(params,
                        self.template_generator.templates,
                        self.max_word_length,
                        self.vocab_act,
                        self.vocab_act_rev,
                        len(self.sp),
                        gat=self.params['gat']).cuda()
     if params['preload_weights']:
         print("load pretrained")
         self.model = torch.load(self.params['preload_weights'])['model']
     else:
         print("train from scratch")
     print("----- step 6 set training parameters")
     self.batch_size = params['batch_size']
     self.optimizer = optim.Adam(self.model.parameters(), lr=params['lr'])
     self.loss_fn1 = nn.BCELoss()
     self.loss_fn2 = nn.BCEWithLogitsLoss()
     self.loss_fn3 = nn.MSELoss()
     print("----- Init finished! ----- ")
Example no. 4
    def start_interaction(self, env_fns, dynamics, nlump=2):
        self.loss_names, self._losses = zip(*list(self.to_report.items()))
        
        params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        self.caculate_number_parameters(params)

        flow_params = [v for v in params if 'flow' in v.name]
        other_params = [v for v in params if 'flow' not in v.name]

        print('length of flow params: ', len(flow_params))
        print('length of agent params: ', len(other_params))
        
        trainer_flow = tf.train.AdamOptimizer(learning_rate=self.flow_lr)
        trainer_agent = tf.train.AdamOptimizer(learning_rate=self.ph_lr)

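        # Compute all gradients in a single pass, then split the resulting list by parameter count so
        # each optimizer (flow vs. agent) applies updates only to its own group, at its own learning rate.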
        grads = tf.gradients(self.total_loss, flow_params + other_params)
        grads_flow = grads[:len(flow_params)]
        grads_agent = grads[len(flow_params):]

        train_flow = trainer_flow.apply_gradients(zip(grads_flow, flow_params))
        train_agent = trainer_agent.apply_gradients(zip(grads_agent, other_params))

        self._train = tf.group(train_flow, train_agent)

        if MPI.COMM_WORLD.Get_rank() == 0:
            getsess().run(tf.variables_initializer(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)))
        bcast_tf_vars_from_root(getsess(), tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))

        self.all_visited_rooms = []
        self.all_scores = []
        self.nenvs = nenvs = len(env_fns)
        self.nlump = nlump
        self.lump_stride = nenvs // self.nlump
        self.envs = [
            VecEnv(env_fns[l * self.lump_stride: (l + 1) * self.lump_stride], spaces=[self.ob_space, self.ac_space]) for
            l in range(self.nlump)]

        self.rollout = Rollout(ob_space=self.ob_space, ac_space=self.ac_space, nenvs=nenvs,
                               nsteps_per_seg=self.nsteps_per_seg,
                               nsegs_per_env=self.nsegs_per_env, nlumps=self.nlump,
                               envs=self.envs,
                               policy=self.stochpol,
                               int_rew_coeff=self.int_coeff,
                               ext_rew_coeff=self.ext_coeff,
                               record_rollouts=self.use_recorder,
                               dynamics=dynamics)

        self.buf_advs = np.zeros((nenvs, self.rollout.nsteps), np.float32)
        self.buf_rets = np.zeros((nenvs, self.rollout.nsteps), np.float32)

        if self.normrew:
            self.rff = RewardForwardFilter(self.gamma)
            self.rff_rms = RunningMeanStd()

        self.step_count = 0
        self.t_last_update = time.time()
        self.t_start = time.time()
Example no. 5
    def start_interaction(self, env_fns, dynamics, nlump=2):
        # Define the variables and the computation graph when interaction with the environments starts, and initialize the Rollout class.
        self.loss_names, self._losses = zip(*list(self.to_report.items()))

        # Define the losses, gradients and backprop; during training, sess.run(self._train) performs one update.
        params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        params_dvae = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="dvae_reward")
        print("total params:", np.sum([np.prod(v.get_shape().as_list()) for v in params]))      # 6629459
        print("dvae params:", np.sum([np.prod(v.get_shape().as_list()) for v in params_dvae]))  # 2726144
        if MPI.COMM_WORLD.Get_size() > 1:
            trainer = MpiAdamOptimizer(learning_rate=self.ph_lr, comm=MPI.COMM_WORLD)
        else:
            trainer = tf.train.AdamOptimizer(learning_rate=self.ph_lr)
        gradsandvars = trainer.compute_gradients(self.total_loss, params)
        self._train = trainer.apply_gradients(gradsandvars)

        # added (bai): compute the gradients of the DVAE separately
        gradsandvars_dvae = trainer.compute_gradients(self.dynamics_loss, params_dvae)
        self._train_dvae = trainer.apply_gradients(gradsandvars_dvae)

        if MPI.COMM_WORLD.Get_rank() == 0:
            getsess().run(tf.variables_initializer(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)))
        bcast_tf_vars_from_root(getsess(), tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))

        self.all_visited_rooms = []
        self.all_scores = []
        self.nenvs = nenvs = len(env_fns)        # default 128
        self.nlump = nlump                       # default 1
        self.lump_stride = nenvs // self.nlump   # 128/1=128
        self.envs = [
            VecEnv(env_fns[l * self.lump_stride: (l + 1) * self.lump_stride], spaces=[self.ob_space, self.ac_space]) for
            l in range(self.nlump)]

        # This class is defined in rollouts.py
        self.rollout = Rollout(ob_space=self.ob_space, ac_space=self.ac_space, nenvs=nenvs,
                               nsteps_per_seg=self.nsteps_per_seg,
                               nsegs_per_env=self.nsegs_per_env, nlumps=self.nlump,
                               envs=self.envs,
                               policy=self.stochpol,
                               int_rew_coeff=self.int_coeff,
                               ext_rew_coeff=self.ext_coeff,
                               record_rollouts=self.use_recorder,
                               dynamics=dynamics)

        # shape: number of envs (threads) x rollout length T
        self.buf_advs = np.zeros((nenvs, self.rollout.nsteps), np.float32)
        self.buf_rets = np.zeros((nenvs, self.rollout.nsteps), np.float32)

        if self.normrew:
            self.rff = RewardForwardFilter(self.gamma)
            self.rff_rms = RunningMeanStd()

        self.step_count = 0
        self.t_last_update = time.time()
        self.t_start = time.time()
Example no. 6
def main():
    # assert jericho.__version__ == '2.1.0', "This code is designed to be run with Jericho version 2.1.0."
    args = parse_args()
    print(args)
    configure_logger(args.output_dir)
    start_redis()
    agent = DRRN_Agent(args)
    env = JerichoEnv(args.rom_path, args.seed, args.env_step_limit)
    envs = VecEnv(args.num_envs, env)
    env.create()  # Create the environment for evaluation
    train(agent, env, envs, args.max_steps, args.update_freq, args.eval_freq,
          args.checkpoint_freq, args.log_freq)
Example no. 7
    def start_interaction(self, env_fns, dynamics, nlump=2):
        param_list = self.stochpol.param_list + self.dynamics.param_list + self.dynamics.auxiliary_task.param_list  # list of references to the parameters, not a deep copy
        self.optimizer = torch.optim.Adam(param_list, lr=self.lr)
        self.optimizer.zero_grad()

        self.all_visited_rooms = []
        self.all_scores = []
        self.nenvs = nenvs = len(env_fns)
        self.nlump = nlump
        self.lump_stride = nenvs // self.nlump
        self.envs = [
            VecEnv(env_fns[l * self.lump_stride:(l + 1) * self.lump_stride],
                   spaces=[self.ob_space, self.ac_space])
            for l in range(self.nlump)
        ]

        self.rollout = Rollout(ob_space=self.ob_space,
                               ac_space=self.ac_space,
                               nenvs=nenvs,
                               nsteps_per_seg=self.nsteps_per_seg,
                               nsegs_per_env=self.nsegs_per_env,
                               nlumps=self.nlump,
                               envs=self.envs,
                               policy=self.stochpol,
                               int_rew_coeff=self.int_coeff,
                               ext_rew_coeff=self.ext_coeff,
                               record_rollouts=self.use_recorder,
                               dynamics=dynamics)

        self.buf_advs = np.zeros((nenvs, self.rollout.nsteps), np.float32)
        self.buf_rets = np.zeros((nenvs, self.rollout.nsteps), np.float32)

        if self.normrew:
            self.rff = RewardForwardFilter(self.gamma)
            self.rff_rms = RunningMeanStd()

        self.step_count = 0
        self.t_last_update = time.time()
        self.t_start = time.time()
Example no. 8
                    default=11,
                    help='number of events to record (default: 11)')
args = parser.parse_args()

try:
    os.makedirs(args.log_dir)
except OSError:
    pass

cig = "cig" in args.config_path
global envs
es = [
    make_env(i, args.config_path, visual=args.visual, cig=cig)
    for i in range(args.num_processes)
]
envs = VecEnv([es[i] for i in range(args.num_processes)])

scenario = args.config_path.split("/")[1].split(".")[0]
exp_name = scenario + ("_event" if args.roe else "")

print("Scenario: " + scenario)

actor_critic = torch.load(
    "/Users/git/rarity-of-events/models/3/783a542d-71cf-11e9-8daf-005056a54761.pt"
)
# actor_critic = torch.load("/Users/git/rarity-of-events/models/2/f399ade2-6d52-11e9-8dad-005056a54761.pt")

print("Model loaded")
actor_critic.eval()

obs_shape = envs.observation_space_shape
Example no. 9
from envs import make_visual_env, make_env
from vec_env import VecEnv
from time import sleep
from random import choice
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--vis', type=int, default=0)
args = parser.parse_args()
num_envs = 1

if args.vis:
    envs = VecEnv([
        make_visual_env('./scenarios/deathmatch_maze.cfg')
        for i in range(num_envs)
    ])
else:
    envs = VecEnv([
        make_env(0, './scenarios/deathmatch_maze.cfg') for i in range(num_envs)
    ])

# Define some actions. Each list entry corresponds to declared buttons:
# MOVE_LEFT, MOVE_RIGHT, ATTACK
# 5 more combinations are naturally possible but only 3 are included for transparency when watching.
# actions = [[True, False, False], [False, True, False], [False, False, True]]
actions = range(envs.action_space_shape)
episode_num = 0

while True:
    print('Episode #', episode_num)
    for j in range(1000):
Example no. 10
def main():
    print("###############################################################")
    print("#################### VISDOOM LEARNER START ####################")
    print("###############################################################")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    global envs
    envs = VecEnv(
        [make_env(i, args.config_path) for i in range(args.num_processes)],
        logging=True,
        log_dir=args.log_dir)

    obs_shape = envs.observation_space_shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if args.algo == 'a2c' or args.algo == 'acktr':
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space_shape)
    elif args.algo == 'a2t':
        source_models = []
        files = glob.glob(os.path.join(args.source_models_path, '*.pt'))
        for file in files:
            print(file, 'loading model...')
            source_models.append(torch.load(file))
        actor_critic = A2TPolicy(obs_shape[0], envs.action_space_shape,
                                 source_models)
    elif args.algo == 'resnet':
        # args.num_stack = 3
        actor_critic = ResnetPolicy(obs_shape[0], envs.action_space_shape)

    action_shape = 1

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c' or args.algo == 'resnet':
        optimizer = optim.RMSprop(actor_critic.parameters(),
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)
    elif args.algo == 'a2t':
        a2t_params = [p for p in actor_critic.parameters() if p.requires_grad]
        optimizer = optim.RMSprop(a2t_params,
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space_shape)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space_shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()
            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            # print ('Actions:', cpu_actions, 'Rewards:', reward)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, action.data, value.data, reward,
                            masks)

        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True))[0].data

        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(rollouts.observations[:-1].view(
                -1, *obs_shape))

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        if args.algo in ['a2c', 'acktr', 'a2t', 'resnet']:
            values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps,
                                                     args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()

            action_loss = -(Variable(advantages.data) *
                            action_log_probs).mean()

            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(values -
                                   Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c' or args.algo == 'resnet':
                nn.utils.clip_grad_norm(actor_critic.parameters(),
                                        args.max_grad_norm)
            elif args.algo == 'a2t':
                nn.utils.clip_grad_norm(a2t_params, args.max_grad_norm)

            optimizer.step()

        rollouts.observations[0].copy_(rollouts.observations[-1])

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            envs.log()
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        -dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, 'VizDoom', args.algo)
            except IOError:
                pass
    envs.close()
    time.sleep(5)
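rollouts.compute_returns is not shown here; in the pytorch-a2c-ppo-acktr style this script follows, it fills the returns buffer either with plain discounted returns or with GAE. A standalone sketch of that computation, assuming [T, N, 1] tensors and a masks tensor with T + 1 entries (illustrative, not necessarily the exact implementation used):

import torch

def compute_returns(rewards, value_preds, next_value, masks, use_gae, gamma, tau):
    """rewards, value_preds: [T, N, 1]; next_value: [N, 1]; masks: [T + 1, N, 1], 0 where an episode ended."""
    T = rewards.size(0)
    returns = torch.zeros(T + 1, *rewards.size()[1:])
    if use_gae:
        values = torch.cat([value_preds, next_value.unsqueeze(0)], dim=0)
        gae = torch.zeros_like(next_value)
        for step in reversed(range(T)):
            delta = rewards[step] + gamma * values[step + 1] * masks[step + 1] - values[step]
            gae = delta + gamma * tau * masks[step + 1] * gae
            returns[step] = gae + values[step]
    else:
        returns[-1] = next_value
        for step in reversed(range(T)):
            returns[step] = returns[step + 1] * gamma * masks[step + 1] + rewards[step]
    return returns[:-1]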
Example no. 11
def main():
    print("###############################################################")
    print("#################### VIZDOOM LEARNER START ####################")
    print("###############################################################")

    save_path = os.path.join(args.save_dir, "a2c")
    num_updates = int(args.num_frames) // args.num_steps // args.num_processes
    reward_name = ""
    if args.roe:
        reward_name = "_event"
    scenario_name = args.config_path.split("/")[1].split(".")[0]
    print("############### " + scenario_name + " ###############")
    log_file_name = "vizdoom_" + scenario_name + reward_name + "_" + str(
        args.agent_id) + ".log"
    log_event_file_name = "vizdoom_" + scenario_name + reward_name + "_" + str(
        args.agent_id) + ".eventlog"
    log_event_reward_file_name = "vizdoom_" + scenario_name + reward_name + "_" + str(
        args.agent_id) + ".eventrewardlog"
    start_updates = 0
    start_step = 0
    best_final_rewards = -1000000.0

    os.environ['OMP_NUM_THREADS'] = '1'

    global envs
    es = [
        make_env(i, args.config_path, visual=args.visual, bots=args.bots)
        for i in range(args.num_processes)
    ]
    envs = VecEnv([es[i] for i in range(args.num_processes)])

    obs_shape = envs.observation_space_shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if args.resume:
        actor_critic = torch.load(
            os.path.join(save_path, log_file_name + ".pt"))
        filename = glob.glob(os.path.join(args.log_dir, log_file_name))[0]
        if args.roe:
            pass  # TODO: Load event buffer
        with open(filename) as file:
            lines = file.readlines()
            start_updates = (int)(lines[-1].strip().split(",")[0])
            start_steps = (int)(lines[-1].strip().split(",")[1])
            num_updates += start_updates
    else:
        if not args.debug:
            try:
                os.makedirs(args.log_dir)
            except OSError:
                files = glob.glob(os.path.join(args.log_dir, log_file_name))
                for f in files:
                    os.remove(f)
                with open(log_file_name, "w") as myfile:
                    myfile.write("")
                files = glob.glob(
                    os.path.join(args.log_dir, log_event_file_name))
                for f in files:
                    os.remove(f)
                with open(log_event_file_name, "w") as myfile:
                    myfile.write("")
                files = glob.glob(
                    os.path.join(args.log_dir, log_event_reward_file_name))
                for f in files:
                    os.remove(f)
                with open(log_event_reward_file_name, "w") as myfile:
                    myfile.write("")
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space_shape)

    action_shape = 1

    if args.cuda:
        actor_critic.cuda()

    optimizer = optim.RMSprop(actor_critic.parameters(),
                              args.lr,
                              eps=args.eps,
                              alpha=args.alpha)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space_shape)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space_shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)
    last_game_vars = []
    for i in range(args.num_processes):
        last_game_vars.append(np.zeros(args.num_events))

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])
    episode_intrinsic_rewards = torch.zeros([args.num_processes, 1])
    final_intrinsic_rewards = torch.zeros([args.num_processes, 1])
    episode_events = torch.zeros([args.num_processes, args.num_events])
    final_events = torch.zeros([args.num_processes, args.num_events])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    # Create event buffer
    if args.qd:
        event_buffer = EventBufferSQLProxy(args.num_events, args.capacity,
                                           args.exp_id, args.agent_id)
    elif not args.resume:
        event_buffer = EventBuffer(args.num_events, args.capacity)
    else:
        event_buffer = pickle.load(
            open(log_file_name + "_event_buffer_temp.p", "rb"))

    event_episode_rewards = []

    start = time.time()
    for j in np.arange(start_updates, num_updates):
        for step in range(args.num_steps):

            value, action = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()
            obs, reward, done, info, events = envs.step(cpu_actions)
            intrinsic_reward = []

            # Fix broken rewards - upscale
            for i in range(len(reward)):
                if scenario_name in ["deathmatch", "my_way_home"]:
                    reward[i] *= 100
                if scenario_name == "deadly_corridor":
                    reward[i] = 1 if events[i][2] >= 1 else 0

            for e in events:
                if args.roe:
                    intrinsic_reward.append(event_buffer.intrinsic_reward(e))
                else:
                    r = reward[len(intrinsic_reward)]
                    intrinsic_reward.append(r)

            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            intrinsic_reward = torch.from_numpy(
                np.expand_dims(np.stack(intrinsic_reward), 1)).float()
            #events = torch.from_numpy(np.expand_dims(np.stack(events), args.num_events)).float()
            events = torch.from_numpy(events).float()
            episode_rewards += reward
            episode_intrinsic_rewards += intrinsic_reward
            episode_events += events

            # Event stats
            event_rewards = []
            for ei in range(0, args.num_events):
                ev = np.zeros(args.num_events)
                ev[ei] = 1
                er = event_buffer.intrinsic_reward(ev)
                event_rewards.append(er)

            event_episode_rewards.append(event_rewards)

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_intrinsic_rewards *= masks
            final_events *= masks
            final_rewards += (1 - masks) * episode_rewards
            final_intrinsic_rewards += (1 - masks) * episode_intrinsic_rewards
            final_events += (1 - masks) * episode_events

            for i in range(args.num_processes):
                if done[i]:
                    event_buffer.record_events(np.copy(
                        final_events[i].numpy()),
                                               frame=j * args.num_steps)

            episode_rewards *= masks
            episode_intrinsic_rewards *= masks
            episode_events *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)

            rollouts.insert(step, current_obs, action.data, value.data,
                            intrinsic_reward, masks)

        final_episode_reward = np.mean(event_episode_rewards, axis=0)
        event_episode_rewards = []

        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True))[0].data

        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(rollouts.observations[:-1].view(
                -1, *obs_shape))

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
            Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
            Variable(rollouts.actions.view(-1, action_shape)))

        values = values.view(args.num_steps, args.num_processes, 1)
        action_log_probs = action_log_probs.view(args.num_steps,
                                                 args.num_processes, 1)
        advantages = Variable(rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()

        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        optimizer.zero_grad()
        (value_loss * args.value_loss_coef + action_loss -
         dist_entropy * args.entropy_coef).backward()

        nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)

        optimizer.step()

        rollouts.observations[0].copy_(rollouts.observations[-1])

        if final_rewards.mean() > best_final_rewards and not args.debug:
            try:
                os.makedirs(save_path)
            except OSError:
                pass
            best_final_rewards = final_rewards.mean()
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(
                save_model,
                os.path.join(save_path,
                             log_file_name.split(".log")[0] + ".pt"))

        if j % args.save_interval == 0 and args.save_dir != "" and not args.debug:
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(save_model,
                       os.path.join(save_path, log_file_name + "_temp.pt"))
            if isinstance(event_buffer, EventBuffer):
                pickle.dump(event_buffer,
                            open(log_file_name + "_event_buffer_temp.p", "wb"))

        if j % args.log_interval == 0:

            envs.log()
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            log = "Updates {}, num timesteps {}, FPS {}, mean/max reward {:.5f}/{:.5f}, mean/max intrinsic reward {:.5f}/{:.5f}"\
                .format(j, total_num_steps,
                            int(total_num_steps / (end - start)),
                            final_rewards.mean(),
                            final_rewards.max(),
                            final_intrinsic_rewards.mean(),
                            final_intrinsic_rewards.max()
                        )
            log_to_file = "{}, {}, {:.5f}, {:.5f}, {:.5f}, {:.5f}\n" \
                .format(j, total_num_steps,
                        final_rewards.mean(),
                        final_rewards.std(),
                        final_intrinsic_rewards.mean(),
                        final_intrinsic_rewards.std())
            log_to_event_file = ','.join(
                map(str,
                    event_buffer.get_event_mean().tolist())) + "\n"
            log_to_event_reward_file = ','.join(
                map(str,
                    event_buffer.get_event_rewards().tolist())) + "\n"
            print(log)
            print(log_to_event_file)

            # Save to files
            with open(log_file_name, "a") as myfile:
                myfile.write(log_to_file)
            with open(log_event_file_name, "a") as myfile:
                myfile.write(str(total_num_steps) + "," + log_to_event_file)
            with open(log_event_reward_file_name, "a") as myfile:
                myfile.write(
                    str(total_num_steps) + "," + log_to_event_reward_file)

    envs.close()
    time.sleep(5)
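EventBuffer and its intrinsic_reward method are not shown in this file. Under the rarity-of-events idea this script implements, an event's intrinsic reward is inversely proportional to how often that event has occurred in recent episodes. The sketch below is an assumption about that interface, inferred only from the calls above (record_events, get_event_mean, get_event_rewards, intrinsic_reward); it is not the project's actual implementation:

import numpy as np
from collections import deque

class EventBuffer(object):
    def __init__(self, num_events, capacity):
        self.num_events = num_events
        self.episodes = deque(maxlen=capacity)  # one event-count vector per finished episode

    def record_events(self, events, frame=0):
        self.episodes.append(np.asarray(events, dtype=np.float32))

    def get_event_mean(self):
        if len(self.episodes) == 0:
            return np.zeros(self.num_events, dtype=np.float32)
        return np.mean(np.asarray(self.episodes), axis=0)

    def get_event_rewards(self):
        # Rarer events (lower mean occurrence) earn larger rewards; clip to keep them bounded.
        return 1.0 / np.clip(self.get_event_mean(), 0.01, None)

    def intrinsic_reward(self, events):
        return float(np.dot(np.asarray(events, dtype=np.float32), self.get_event_rewards()))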
Example no. 12
def main():
    print("###############################################################")
    print("#################### VIZDOOM LEARNER START ####################")
    print("###############################################################")

    save_path = os.path.join(args.save_dir, str(args.exp_id))
    log_path = os.path.join(args.log_dir, str(args.exp_id))
    num_updates = int(args.num_frames) // args.num_steps // args.num_processes
    reward_name = ""
    if args.roe:
        reward_name = "_event"
    scenario_name = args.config_path.split("/")[1].split(".")[0]
    print("############### " + scenario_name + " ###############")
    log_file_name = "vizdoom_" + scenario_name + reward_name + "_" + str(
        args.exp_id) + "_" + str(args.agent_id) + ".log"
    #log_event_file_name = "vizdoom_" + scenario_name + reward_name + "_" + str(args.exp_id) + "_" + str(args.agent_id) + ".eventlog"
    #log_event_reward_file_name = "vizdoom_" + scenario_name + reward_name + "_" + str(args.exp_id) + "_" + str(args.agent_id) + ".eventrewardlog"
    start_updates = 0
    start_step = 0
    best_final_rewards = -1000000.0

    os.environ['OMP_NUM_THREADS'] = '1'

    cig = "cig" in args.config_path
    global envs
    es = [
        make_env(i, args.config_path, visual=args.visual, cig=cig)
        for i in range(args.num_processes)
    ]
    envs = VecEnv([es[i] for i in range(args.num_processes)])

    obs_shape = envs.observation_space_shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if args.resume:
        actor_critic = torch.load(
            os.path.join(save_path, f"{args.agent_id}.pt"))
        filename = glob.glob(os.path.join(log_path, log_file_name))[0]
        with open(filename) as file:
            lines = file.readlines()
            start_updates = (int)(lines[-1].strip().split(",")[0])
            start_steps = (int)(lines[-1].strip().split(",")[1])
            num_updates += start_updates
    else:
        try:
            os.makedirs(save_path)
        except OSError:
            pass
        try:
            os.makedirs(log_path)
        except OSError:
            files = glob.glob(os.path.join(args.log_dir, log_file_name))
            for f in files:
                os.remove(f)
            #with open(log_file_name, "w") as myfile:
            #    myfile.write("")
            #files = glob.glob(os.path.join(args.log_dir, log_event_file_name))
            #for f in files:
            #    os.remove(f)
            #with open(log_event_file_name, "w") as myfile:
            #    myfile.write("")
            #files = glob.glob(os.path.join(args.log_dir, log_event_reward_file_name))
            #for f in files:
            #    os.remove(f)
            #with open(log_event_reward_file_name, "w") as myfile:
            #    myfile.write("")
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space_shape)

    action_shape = 1

    if args.cuda:
        actor_critic.cuda()

    optimizer = optim.RMSprop(actor_critic.parameters(),
                              args.lr,
                              eps=args.eps,
                              alpha=args.alpha)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space_shape)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space_shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)
    last_game_vars = []
    for i in range(args.num_processes):
        last_game_vars.append(np.zeros(args.num_events))

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])
    episode_intrinsic_rewards = torch.zeros([args.num_processes, 1])
    final_intrinsic_rewards = torch.zeros([args.num_processes, 1])
    episode_events = torch.zeros([args.num_processes, args.num_events])
    final_events = torch.zeros([args.num_processes, args.num_events])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    def mean_distance_to_nearest_neighbor(elite_events):
        d = []
        nearest = None
        for a in range(len(elite_events)):
            for b in range(len(elite_events)):
                if a != b:
                    elite_a = elite_events[a]
                    elite_b = elite_events[b]
                    dist = np.linalg.norm(elite_a - elite_b)
                    if nearest is None or dist < nearest:
                        nearest = dist
            if nearest is not None:
                d.append(nearest)
            nearest = None
        return np.mean(d)

    def distance_to_nearest_neighbor(elite_events, events):
        nearest = None
        for elite_a in elite_events:
            dist = np.linalg.norm(elite_a - events)
            if nearest is None or dist < nearest:
                nearest = dist
        return nearest

    def add_to_archive(frame, episode_length):
        #print("Final rewards: ", final_rewards.numpy())
        fitness = final_rewards.numpy().mean()
        #print("raw: ", final_events.numpy())
        behavior = final_events.numpy().mean(axis=0)
        #print("Fitness:", fitness)
        #print("Behavior:", behavior)
        neighbors = event_buffer.get_neighbors(behavior, args.niche_divs,
                                               episode_length)

        add = len(neighbors) == 0
        for neighbor in neighbors:
            if fitness > neighbor.fitness:
                add = True
            else:
                add = False
                break

        if add:
            if len(neighbors) > 0:
                event_buffer.remove_elites(neighbors)
                #print(f"- Removing elites {[neighbor.elite_id for neighbor in neighbors]}")
            for neighbor in neighbors:
                try:
                    #print(f"- Deleting model {neighbor.elite_id}")
                    os.remove(
                        os.path.join(save_path, f"{neighbor.elite_id}.pt"))
                    #print("Successfully deleted model with id : ", neighbor.elite_id)
                except OSError:
                    print("Error while deleting model with id : ",
                          neighbor.elite_id)
            name = str(uuid.uuid1())
            #print("Adding elite")
            event_buffer.add_elite(name, behavior, fitness, frame,
                                   episode_length)
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(save_model, os.path.join(save_path, f"{name}.pt"))

    # Create event buffer
    event_buffer = EventBufferSQLProxy(args.num_events,
                                       args.capacity,
                                       args.exp_id,
                                       args.agent_id,
                                       qd=args.qd,
                                       per_step=args.per_step)

    event_episode_rewards = []

    episode_finished = np.zeros(args.num_processes)

    start = time.time()
    for j in np.arange(start_updates, num_updates):
        for step in range(args.num_steps):

            value, action = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()
            obs, reward, done, info, events = envs.step(cpu_actions)
            intrinsic_reward = []

            # Fix broken rewards - upscale
            for i in range(len(reward)):
                if scenario_name in ["deathmatch", "my_way_home"]:
                    reward[i] *= 100
                if scenario_name == "deadly_corridor":
                    reward[i] = 1 if events[i][2] >= 1 else 0

            for e in events:
                if args.roe:
                    ir = event_buffer.intrinsic_reward(e)
                    if args.per_step:
                        ir = ir / 4200
                    intrinsic_reward.append(ir)
                else:
                    r = reward[len(intrinsic_reward)]
                    intrinsic_reward.append(r)

            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            intrinsic_reward = torch.from_numpy(
                np.expand_dims(np.stack(intrinsic_reward), 1)).float()
            #events = torch.from_numpy(np.expand_dims(np.stack(events), args.num_events)).float()
            events = torch.from_numpy(events).float()
            episode_rewards += reward
            episode_intrinsic_rewards += intrinsic_reward
            episode_events += events

            # Event stats
            '''
            event_rewards = []
            for ei in range(0,args.num_events):
                ev = np.zeros(args.num_events)
                ev[ei] = 1
                er = event_buffer.intrinsic_reward(ev)
                if args.per_step:
                    er = er / 4200
                er = event_buffer.intrinsic_reward(ev)
                event_rewards.append(er)

            event_episode_rewards.append(event_rewards)
            '''

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_intrinsic_rewards *= masks
            final_events *= masks
            final_rewards += (1 - masks) * episode_rewards
            final_intrinsic_rewards += (1 - masks) * episode_intrinsic_rewards
            final_events += (1 - masks) * episode_events

            for i in range(args.num_processes):
                if done[i]:
                    #event_buffer.record_events(np.copy(final_events[i].numpy()), frame=j*args.num_steps*args.num_processes)
                    episode_length = (step +
                                      j * args.num_steps) - episode_finished[i]
                    episode_finished[i] = episode_length + episode_finished[i]
                    add_to_archive(
                        step * args.num_processes +
                        j * args.num_steps * args.num_processes,
                        episode_length)

            episode_rewards *= masks
            episode_intrinsic_rewards *= masks
            episode_events *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)

            rollouts.insert(step, current_obs, action.data, value.data,
                            intrinsic_reward, masks)

        #final_episode_reward = np.mean(event_episode_rewards, axis=0)
        #event_episode_rewards = []

        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True))[0].data

        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(rollouts.observations[:-1].view(
                -1, *obs_shape))

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
            Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
            Variable(rollouts.actions.view(-1, action_shape)))

        values = values.view(args.num_steps, args.num_processes, 1)
        action_log_probs = action_log_probs.view(args.num_steps,
                                                 args.num_processes, 1)
        advantages = Variable(rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()

        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        optimizer.zero_grad()
        (value_loss * args.value_loss_coef + action_loss -
         dist_entropy * args.entropy_coef).backward()

        nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)

        optimizer.step()

        rollouts.observations[0].copy_(rollouts.observations[-1])

        if j % args.log_interval == 0:

            envs.log()
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            log = "Updates {}, num timesteps {}, FPS {}, mean/max reward {:.5f}/{:.5f}, mean/max intrinsic reward {:.5f}/{:.5f}"\
                .format(j, total_num_steps,
                            int(total_num_steps / (end - start)),
                            final_rewards.mean(),
                            final_rewards.max(),
                            final_intrinsic_rewards.mean(),
                            final_intrinsic_rewards.max()
                        )

            log_to_file = "{}, {}, {:.5f}, {:.5f}, {:.5f}, {:.5f}\n" \
                .format(j, total_num_steps,
                        final_rewards.mean(),
                        final_rewards.std(),
                        final_intrinsic_rewards.mean(),
                        final_intrinsic_rewards.std())

            with open(os.path.join(log_path, log_file_name), "a") as myfile:
                myfile.write(log_to_file)

            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(save_model,
                       os.path.join(save_path, f"{args.agent_id}.pt"))

            print(log)

    envs.close()
    time.sleep(5)
Example no. 13
class KGA2CTrainer(object):
    '''
    KGA2C main class.
    '''
    def __init__(self, params):
        print("----- Initiating ----- ")
        print("----- step 1 configure logger")
        configure_logger(params['output_dir'])
        log('Parameters {}'.format(params))
        self.params = params
        print("----- step 2 load pre-collected things")
        self.binding = load_bindings(params['rom_file_path'])
        self.max_word_length = self.binding['max_word_length']
        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(params['spm_file'])
        print("----- step 3 build KGA2CEnv")
        kg_env = KGA2CEnv(params['rom_file_path'],
                          params['seed'],
                          self.sp,
                          params['tsv_file'],
                          step_limit=params['reset_steps'],
                          stuck_steps=params['stuck_steps'],
                          gat=params['gat'])
        self.vec_env = VecEnv(params['batch_size'], kg_env,
                              params['openie_path'])
        print("----- step 4 build FrotzEnv and templace generator")
        env = FrotzEnv(params['rom_file_path'])
        self.vocab_act, self.vocab_act_rev = load_vocab(env)
        self.template_generator = TemplateActionGenerator(self.binding)
        print("----- step 5 build kga2c model")
        self.model = KGA2C(params,
                           self.template_generator.templates,
                           self.max_word_length,
                           self.vocab_act,
                           self.vocab_act_rev,
                           len(self.sp),
                           gat=self.params['gat']).cuda()
        if params['preload_weights']:
            print("load pretrained")
            self.model = torch.load(self.params['preload_weights'])['model']
        else:
            print("train from scratch")
        print("----- step 6 set training parameters")
        self.batch_size = params['batch_size']
        self.optimizer = optim.Adam(self.model.parameters(), lr=params['lr'])
        self.loss_fn1 = nn.BCELoss()
        self.loss_fn2 = nn.BCEWithLogitsLoss()
        self.loss_fn3 = nn.MSELoss()
        print("----- Init finished! ----- ")

    def generate_targets(self, admissible, objs):
        '''
        Generates ground-truth targets for admissible actions.
        :param admissible: List-of-lists of admissible actions. Batch_size x Admissible
        :param objs: List-of-lists of interactive objects. Batch_size x Objs
        :returns: template targets and object target tensors
        '''
        tmpl_target = []
        obj_targets = []
        for adm in admissible:
            obj_t = set()
            cur_t = [0] * len(self.template_generator.templates)
            for a in adm:
                cur_t[a.template_id] = 1
                obj_t.update(a.obj_ids)
            tmpl_target.append(cur_t)
            obj_targets.append(list(obj_t))
        tmpl_target_tt = torch.FloatTensor(tmpl_target).cuda()
        # Note: Adjusted to use the objects in the admissible actions only
        object_mask_target = []
        for objl in obj_targets:  # in objs
            cur_objt = [0] * len(self.vocab_act)
            for o in objl:
                cur_objt[o] = 1
            object_mask_target.append([[cur_objt], [cur_objt]])
        obj_target_tt = torch.FloatTensor(object_mask_target).squeeze().cuda()
        return tmpl_target_tt, obj_target_tt

    def generate_graph_mask(self, graph_infos):
        assert len(graph_infos) == self.batch_size
        mask_all = []
        for graph_info in graph_infos:
            mask = [0] * len(self.vocab_act.keys())
            # Case 1 (default): KG as mask
            if self.params['masking'] == 'kg':
                graph_state = graph_info.graph_state  # Full KG as mask --> same as KG-A2C
                # graph_state = graph_info.graph_state_5_mask # sub_KG_5 as mask, disabled
                ents = set()
                # Obtain entities ---> maybe I can perform graph pooling before this step
                for u, v in graph_state.edges:
                    ents.add(u)
                    ents.add(v)
                # Build mask: only use those related to entities
                for ent in ents:
                    for ent_word in ent.split():
                        word = ent_word[:self.max_word_length]
                        if word in self.vocab_act_rev:
                            mask[self.vocab_act_rev[word]] = 1
            # Case 2: interactive objects ground truth as the mask.
            elif self.params['masking'] == 'interactive':
                for o in graph_info.objs:
                    o = o[:self.max_word_length]
                    if o in self.vocab_act_rev.keys() and o != '':
                        mask[self.vocab_act_rev[o]] = 1
            # Case 3: no mask.
            elif self.params['masking'] == 'none':
                mask = [1] * len(self.vocab_act.keys())
            else:
                assert False, 'Unrecognized masking {}'.format(
                    self.params['masking'])
            mask_all.append(mask)
        return torch.BoolTensor(mask_all).cuda().detach()

    def discount_reward(self, transitions, last_values):
        returns, advantages = [], []
        R = last_values.data
        for t in reversed(range(len(transitions))):
            _, _, values, rewards, done_masks, _, _, _, _, _, _ = transitions[
                t]
            R = rewards + self.params['gamma'] * R * done_masks
            adv = R - values
            returns.append(R)
            advantages.append(adv)
        return returns[::-1], advantages[::-1]

    def train(self, max_steps):
        print("=== === === start training!!! === === ===")
        start = time.time()
        transitions = []
        obs, infos, graph_infos = self.vec_env.reset()
        for step in range(1, max_steps + 1):
            # Step 1: build model inputs
            tb.logkv('Step', step)
            obs_reps = np.array([g.ob_rep for g in graph_infos])
            scores = [info['score'] for info in infos]
            graph_mask_tt = self.generate_graph_mask(graph_infos)
            graph_state_reps = [g.graph_state_rep for g in graph_infos]
            graph_rep_1 = [
                g.graph_state_rep_1_connectivity for g in graph_infos
            ]
            graph_rep_2 = [g.graph_state_rep_2_roomitem for g in graph_infos]
            graph_rep_3 = [g.graph_state_rep_3_youritem for g in graph_infos]
            graph_rep_4 = [g.graph_state_rep_4_otherroom for g in graph_infos]
            # Step 2: predict probs, actual items
            tmpl_pred_tt, obj_pred_tt, dec_obj_tt, dec_tmpl_tt, value, dec_steps = self.model(
                obs_reps, scores, graph_state_reps, graph_rep_1, graph_rep_2,
                graph_rep_3, graph_rep_4, graph_mask_tt)
            tb.logkv_mean('Value', value.mean().item())
            # Step 3: Log the predictions and ground truth values
            topk_tmpl_probs, topk_tmpl_idxs = F.softmax(
                tmpl_pred_tt[0], dim=-1).topk(5)
            topk_tmpls = [
                self.template_generator.templates[t]
                for t in topk_tmpl_idxs.tolist()
            ]
            tmpl_pred_str = ', '.join([
                '{} {:.3f}'.format(tmpl, prob)
                for tmpl, prob in zip(topk_tmpls, topk_tmpl_probs.tolist())
            ])
            # Step 4: Generate the ground truth and object mask
            admissible = [g.admissible_actions for g in graph_infos]
            objs = [g.objs for g in graph_infos]
            tmpl_gt_tt, obj_mask_gt_tt = self.generate_targets(
                admissible, objs)
            # Step 5: Log template/object predictions/ground_truth
            gt_tmpls = [
                self.template_generator.templates[i] for i in tmpl_gt_tt[0].
                nonzero().squeeze().cpu().numpy().flatten().tolist()
            ]
            gt_objs = [
                self.vocab_act[i] for i in obj_mask_gt_tt[
                    0, 0].nonzero().squeeze().cpu().numpy().flatten().tolist()
            ]
            topk_o1_probs, topk_o1_idxs = F.softmax(obj_pred_tt[0, 0],
                                                    dim=-1).topk(5)
            topk_o1 = [self.vocab_act[o] for o in topk_o1_idxs.tolist()]
            o1_pred_str = ', '.join([
                '{} {:.3f}'.format(o, prob)
                for o, prob in zip(topk_o1, topk_o1_probs.tolist())
            ])
            chosen_actions = self.decode_actions(dec_tmpl_tt, dec_obj_tt)
            # Step 6: Next step
            obs, rewards, dones, infos, graph_infos = self.vec_env.step(
                chosen_actions)
            # Step 7: logging
            tb.logkv_mean(
                'TotalStepsPerEpisode',
                sum([i['steps'] for i in infos]) / float(len(graph_infos)))
            tb.logkv_mean('Valid', infos[0]['valid'])
            if dones[0]:
                log('Step {} EpisodeScore {}'.format(step, infos[0]['score']))
            for done, info in zip(dones, infos):
                if done:
                    tb.logkv_mean('EpisodeScore', info['score'])
            # Step 8: append into transitions
            rew_tt = torch.FloatTensor(rewards).cuda().unsqueeze(1)
            done_mask_tt = (~torch.tensor(dones)).float().cuda().unsqueeze(1)
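            # done_mask_tt is 1.0 while an episode continues and 0.0 when it ends, so
            # the recursion in discount_reward() stops bootstrapping across episodes.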
            self.model.reset_hidden(done_mask_tt)
            transitions.append(
                (tmpl_pred_tt, obj_pred_tt, value, rew_tt, done_mask_tt,
                 tmpl_gt_tt, dec_tmpl_tt, dec_obj_tt, obj_mask_gt_tt,
                 graph_mask_tt, dec_steps))
            # Step 9: update the model every self.params['bptt'] steps
            if len(transitions) >= self.params['bptt']:
                tb.logkv('StepsPerSecond', float(step) / (time.time() - start))
                self.model.clone_hidden()
                obs_reps = np.array([g.ob_rep for g in graph_infos])
                scores = [info['score'] for info in infos]
                graph_mask_tt = self.generate_graph_mask(graph_infos)
                graph_state_reps = [g.graph_state_rep for g in graph_infos]
                graph_rep_1 = [
                    g.graph_state_rep_1_connectivity for g in graph_infos
                ]
                graph_rep_2 = [
                    g.graph_state_rep_2_roomitem for g in graph_infos
                ]
                graph_rep_3 = [
                    g.graph_state_rep_3_youritem for g in graph_infos
                ]
                graph_rep_4 = [
                    g.graph_state_rep_4_otherroom for g in graph_infos
                ]
                _, _, _, _, next_value, _ = self.model(
                    obs_reps, scores, graph_state_reps, graph_rep_1,
                    graph_rep_2, graph_rep_3, graph_rep_4, graph_mask_tt)
                returns, advantages = self.discount_reward(
                    transitions, next_value)
                tb.logkv_mean('Advantage', advantages[-1].median().item())
                loss = self.update(transitions, returns, advantages)
                del transitions[:]
                self.model.restore_hidden()
                print("Total time: {:.2f} mins".format(
                    (time.time() - start) / 60.))
            if step % self.params['checkpoint_interval'] == 0:
                parameters = {'model': self.model}
                torch.save(parameters,
                           os.path.join(self.params['output_dir'], 'kga2c.pt'))
        self.vec_env.close_extras()

    def update(self, transitions, returns, advantages):
        assert len(transitions) == len(returns) == len(advantages)
        loss = 0
        for trans, ret, adv in zip(transitions, returns, advantages):
            tmpl_pred_tt, obj_pred_tt, value, _, _, tmpl_gt_tt, dec_tmpl_tt, \
                dec_obj_tt, obj_mask_gt_tt, graph_mask_tt, dec_steps = trans
            # Supervised Template Loss
            tmpl_probs = F.softmax(tmpl_pred_tt, dim=1)
            template_loss = self.params['template_coeff'] * self.loss_fn1(
                tmpl_probs, tmpl_gt_tt)
            # Supervised Object Loss
            object_mask_target = obj_mask_gt_tt.permute((1, 0, 2))
            obj_probs = F.softmax(obj_pred_tt, dim=2)
            object_mask_loss = self.params['object_coeff'] * self.loss_fn1(
                obj_probs, object_mask_target)
            # Build the object mask
            o1_mask, o2_mask = [0] * self.batch_size, [0] * self.batch_size
            for d, st in enumerate(dec_steps):
                if st > 1:
                    o1_mask[d] = 1
                    o2_mask[d] = 1
                elif st == 1:
                    o1_mask[d] = 1
            o1_mask = torch.FloatTensor(o1_mask).cuda()
            o2_mask = torch.FloatTensor(o2_mask).cuda()
            # Policy Gradient Loss
            policy_obj_loss = torch.FloatTensor([0]).cuda()
            cnt = 0
            for i in range(self.batch_size):
                if dec_steps[i] >= 1:
                    cnt += 1
                    batch_pred = obj_pred_tt[0, i, graph_mask_tt[i]]
                    action_log_probs_obj = F.log_softmax(batch_pred, dim=0)
                    dec_obj_idx = dec_obj_tt[0, i].item()
                    graph_mask_list = graph_mask_tt[i].nonzero().squeeze().cpu(
                    ).numpy().flatten().tolist()
                    idx = graph_mask_list.index(dec_obj_idx)
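                    # dec_obj_idx is a full-vocabulary index; .index() converts it to
                    # its position within the masked logits so the log-prob of the
                    # chosen object is selected.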
                    log_prob_obj = action_log_probs_obj[idx]
                    policy_obj_loss += -log_prob_obj * adv[i].detach()
            if cnt > 0:
                policy_obj_loss /= cnt
            tb.logkv_mean('PolicyObjLoss', policy_obj_loss.item())
            log_probs_obj = F.log_softmax(obj_pred_tt, dim=2)
            log_probs_tmpl = F.log_softmax(tmpl_pred_tt, dim=1)
            action_log_probs_tmpl = log_probs_tmpl.gather(
                1, dec_tmpl_tt).squeeze()
            policy_tmpl_loss = (-action_log_probs_tmpl *
                                adv.detach().squeeze()).mean()
            tb.logkv_mean('PolicyTemplateLoss', policy_tmpl_loss.item())
            policy_loss = policy_tmpl_loss + policy_obj_loss
            value_loss = self.params['value_coeff'] * self.loss_fn3(value, ret)
            tmpl_entropy = -(tmpl_probs * log_probs_tmpl).mean()
            tb.logkv_mean('TemplateEntropy', tmpl_entropy.item())
            object_entropy = -(obj_probs * log_probs_obj).mean()
            tb.logkv_mean('ObjectEntropy', object_entropy.item())
            # Minimizing entropy loss will lead to increased entropy
            entropy_loss = self.params['entropy_coeff'] * -(tmpl_entropy +
                                                            object_entropy)
            loss += template_loss + object_mask_loss + value_loss + entropy_loss + policy_loss
        tb.logkv('Loss', loss.item())
        tb.logkv('TemplateLoss', template_loss.item())
        tb.logkv('ObjectLoss', object_mask_loss.item())
        tb.logkv('PolicyLoss', policy_loss.item())
        tb.logkv('ValueLoss', value_loss.item())
        tb.logkv('EntropyLoss', entropy_loss.item())
        tb.dumpkvs()
        loss.backward()
        # Gradient norm before clipping (sum of per-parameter L2 norms)
        grad_norm = 0
        for p in list(
                filter(lambda p: p.grad is not None, self.model.parameters())):
            grad_norm += p.grad.data.norm(2).item()
        tb.logkv('UnclippedGradNorm', grad_norm)
        nn.utils.clip_grad_norm_(self.model.parameters(), self.params['clip'])
        # Gradient norm after clipping (same per-parameter sum)
        grad_norm = 0
        for p in list(
                filter(lambda p: p.grad is not None, self.model.parameters())):
            grad_norm += p.grad.data.norm(2).item()
        tb.logkv('ClippedGradNorm', grad_norm)
        self.optimizer.step()
        self.optimizer.zero_grad()
        return loss

    def decode_actions(self, decoded_templates, decoded_objects):
        '''
        Returns string representations of the given template actions.
        :param decoded_templates: Tensor of template indices.
        :type decoded_templates: Torch tensor of size (Batch_size x 1).
        :param decoded_objects: Tensor of o1, o2 object indices.
        :type decoded_objects: Torch tensor of size (2 x Batch_size x 1).
        '''
        decoded_actions = []
        for i in range(self.batch_size):
            decoded_template = decoded_templates[i].item()
            decoded_object1 = decoded_objects[0][i].item()
            decoded_object2 = decoded_objects[1][i].item()
            decoded_action = self.tmpl_to_str(decoded_template,
                                              decoded_object1, decoded_object2)
            decoded_actions.append(decoded_action)
        return decoded_actions

    def tmpl_to_str(self, template_idx, o1_id, o2_id):
        """ Returns a string representation of a template action. """
        template_str = self.template_generator.templates[template_idx]
        holes = template_str.count('OBJ')
        assert holes <= 2
        if holes <= 0:
            return template_str
        elif holes == 1:
            return template_str.replace('OBJ', self.vocab_act[o1_id])
        else:
            return template_str.replace('OBJ', self.vocab_act[o1_id], 1)\
                               .replace('OBJ', self.vocab_act[o2_id], 1)
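
    # Illustration of the OBJ filling above (hypothetical templates and vocabulary):
    #   templates = ['look', 'take OBJ', 'put OBJ in OBJ'], vocab_act = {7: 'apple', 12: 'box'}
    #   tmpl_to_str(2, 7, 12) -> 'put apple in box'
    #   tmpl_to_str(1, 7, 12) -> 'take apple'   (o2_id ignored when there is one hole)
    #   tmpl_to_str(0, 7, 12) -> 'look'         (no holes)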
Example n. 14
0
def main():
    es = [make_env(i, args.board_size) for i in range(args.num_processes)]
    envs = VecEnv([es[i] for i in range(args.num_processes)])

    spatial_obs_space = es[0].observation_space.spaces['board'].shape
    non_spatial_space = (1, 50)
    action_space = len(es[0].actions)

    # MODELS #
    if args.resume:
        ac_agent = torch.load("models/" + args.model_name)   # Load model
    else:
        ac_agent = PrunedHybrid(spatial_obs_space[0], action_space, args.board_size)

    optimizer = optim.RMSprop(ac_agent.parameters(), args.learning_rate)

    # Creating the memory to store the steps taken
    if args.board_size == 1:
        action_space = 242
    elif args.board_size == 3:
        action_space = 492
    elif args.board_size == 5:
        action_space = 908
    else:
        raise NotImplementedError("Not able to handle board size", args.board_size)

    memory = Memory(args.num_steps, args.num_processes, spatial_obs_space, non_spatial_space, action_space)

    obs = envs.reset()
    spatial_obs, non_spatial_obs = update_obs(obs)

    memory.spatial_obs[0].copy_(torch.from_numpy(spatial_obs).float())
    memory.non_spatial_obs[0].copy_(torch.from_numpy(non_spatial_obs).float())

    if args.resume and args.log:
        log_file = "logs/" + args.log_filename
        with open(log_file) as log:
            last_line = log.readlines()[-1]
            fields = last_line.split(", ")
            resume_updates = float(fields[0])
            resume_episodes = float(fields[1])
            resume_steps = float(fields[3])
    else:
        resume_updates = 0
        resume_episodes = 0
        resume_steps = 0

    renderer = Renderer()

    rewards = 0
    episodes = 0

    for update in range(args.num_updates):

        for step in range(args.num_steps):

            available_actions = envs.actions()
            active_players = envs.active_players()
            own_players = envs.own_players()

            values, actions_policy = ac_agent.act(
                Variable(memory.spatial_obs[step]),
                Variable(memory.non_spatial_obs[step]), available_actions)

            if args.board_size == 1:
                actions, x_positions, y_positions = utils.map_actions_1v1(actions_policy)
            elif args.board_size == 3:
                actions, x_positions, y_positions = utils.map_actions_3v3_new_approach(actions_policy, active_players, own_players)
            elif args.board_size == 5:
                actions, x_positions, y_positions = utils.map_actions_5v5_pruned(actions_policy, active_players, own_players)
            else:
                raise NotImplementedError("Not able to handle board size", args.board_size)

            action_objects = []

            for action, position_x, position_y in zip(actions, x_positions, y_positions):

                action_object = {
                    'action-type': action,
                    'x': position_x,
                    'y': position_y
                    }
                action_objects.append(action_object)

            obs, reward, done, info, events = envs.step(action_objects)

            if args.render:
                for i in range(args.num_processes):
                    renderer.render(obs[i], i)

            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            rewards += reward.sum().item()

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            dones = masks.squeeze()
            episodes += args.num_processes - dones.sum().item()

            # Update the observations returned by the environment
            spatial_obs, non_spatial_obs = update_obs(obs)

            # insert the step taken into memory
            memory.insert(step, torch.from_numpy(spatial_obs).float(), torch.from_numpy(non_spatial_obs).float(),
                          torch.tensor(actions_policy), torch.tensor(values), reward, masks, available_actions)

        next_value = ac_agent(Variable(memory.spatial_obs[-1]), Variable(memory.non_spatial_obs[-1]))[0].data

        # Compute returns
        memory.compute_returns(next_value, args.gamma)

        spatial = Variable(memory.spatial_obs[:-1])  # shape [20,  4, 26,  7, 14]
        spatial = spatial.view(-1, *spatial_obs_space)  # shape [80, 26,  7, 14]
        non_spatial = Variable(memory.non_spatial_obs[:-1])  # shape [20,  4,  1, 50]
        non_spatial = non_spatial.view(-1, 50)  # shape [80, 50]

        actions = Variable(torch.LongTensor(memory.actions.view(-1, 1)))
        actions_mask = Variable(memory.available_actions[:-1])

        # Evaluate the actions taken
        action_log_probs, values, dist_entropy = ac_agent.evaluate_actions(spatial,
                                                                           non_spatial,
                                                                           actions, actions_mask)

        values = values.view(args.num_steps, args.num_processes, 1)
        action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1)

        advantages = Variable(memory.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()

        # Compute loss
        action_loss = -(Variable(advantages.data) * action_log_probs).mean()
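        # Wrapping advantages.data detaches them from the graph, so the policy term
        # does not backpropagate into the critic; the critic is trained by value_loss.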

        optimizer.zero_grad()

        total_loss = (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef)
        total_loss.backward()

        nn.utils.clip_grad_norm_(ac_agent.parameters(), args.max_grad_norm)

        optimizer.step()

        memory.non_spatial_obs[0].copy_(memory.non_spatial_obs[-1])
        memory.spatial_obs[0].copy_(memory.spatial_obs[-1])

        # Logging
        if (update + 1) % args.log_interval == 0 and args.log:
            log_file_name = "logs/" + args.log_filename

            # Updates (accumulate only those done since the previous log entry)
            resume_updates += args.log_interval
            # Episodes
            resume_episodes += episodes
            # Steps (num_processes * num_steps env steps per update)
            resume_steps += args.num_processes * args.num_steps * args.log_interval
            # Rewards
            reward = rewards

            mean_reward_pr_episode = reward / max(episodes, 1)  # avoid dividing by zero

            log = "Updates {}, Episodes {}, Episodes this update {}, Total Timesteps {}, Reward {}, Mean Reward pr. Episode {:.2f}"\
                .format(resume_updates, resume_episodes, episodes, resume_steps, reward, mean_reward_pr_episode)

            log_to_file = "{}, {}, {}, {}, {}, {}\n" \
                .format(resume_updates, resume_episodes, episodes, resume_steps, reward, mean_reward_pr_episode)

            print(log)

            # Save to files
            with open(log_file_name, "a") as myfile:
                myfile.write(log_to_file)

            # Saving the agent
            torch.save(ac_agent, "models/" + args.model_name)

            rewards = 0
            episodes = 0
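
# `Memory.compute_returns` is called above but not defined in this snippet. A sketch
# of the standard A2C bootstrapped-return computation it is assumed to perform
# (hypothetical helper, for illustration only; not necessarily the author's code):
import torch  # redundant if the module already imports torch

def compute_returns_sketch(rewards, masks, next_value, gamma):
    """rewards/masks: tensors of shape (num_steps, num_processes, 1)."""
    returns = torch.zeros(rewards.size(0) + 1, *rewards.size()[1:])
    returns[-1] = next_value  # bootstrap from the critic's estimate of the last state
    for step in reversed(range(rewards.size(0))):
        # masks are 0.0 at episode boundaries, cutting the return there
        returns[step] = rewards[step] + gamma * returns[step + 1] * masks[step]
    return returns  # num_steps + 1 rows, matching the memory.returns[:-1] indexing above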
Example n. 15
0
    help='vizdoom configuration file path (default: ./scenarios/basic.cfg)')
parser.add_argument(
    '--load-dir',
    default='./trained_models/',
    help='directory to save agent logs (default: ./trained_models/)')
parser.add_argument('--log-dir',
                    default='/tmp/doom/',
                    help='directory to save agent logs (default: /tmp/doom)')
args = parser.parse_args()

try:
    os.makedirs(args.log_dir)
except OSError:
    pass

envs = VecEnv([make_visual_env(args.config_path)])

actor_critic = torch.load(os.path.join(args.load_dir, args.env_name + ".pt"))
actor_critic.eval()

obs_shape = envs.observation_space_shape
obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])
current_obs = torch.zeros(1, *obs_shape)


def update_current_obs(obs):
    shape_dim0 = envs.observation_space_shape[0]
    obs = torch.from_numpy(obs).float()
    if args.num_stack > 1:
        current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
    current_obs[:, -shape_dim0:] = obs
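
# Frame-stacking illustration: with num_stack = 4, current_obs holds the last four
# observations along the channel axis; each call shifts the window by one frame,
# e.g. [f0, f1, f2, f3] becomes [f1, f2, f3, f4] once the newest frame f4 arrives.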
Example n. 16
0
    def start_interaction(self, env_fns, dynamics, nlump=2):
        self.loss_names, self._losses = zip(*list(self.to_report.items()))
        self.global_step = tf.Variable(0, trainable=False)
        params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        if MPI.COMM_WORLD.Get_size() > 1:
            if self.agent_num is None:
                trainer = MpiAdamOptimizer(learning_rate=self.ph_lr,
                                           comm=MPI.COMM_WORLD)

        else:
            if self.agent_num is None:
                if self.optim == 'adam':
                    trainer = tf.train.AdamOptimizer(learning_rate=self.ph_lr)
                elif self.optim == 'sgd':
                    print("using sgd")
                    print("________________________")
                    if self.decay:
                        self.decay_lr = tf.train.exponential_decay(
                            self.ph_lr,
                            self.global_step,
                            2500,
                            .96,
                            staircase=True)
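                        # With staircase=True the learning rate steps down as
                        # lr_t = ph_lr * 0.96 ** (global_step // 2500).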
                        trainer = tf.train.GradientDescentOptimizer(
                            learning_rate=self.decay_lr)
                    else:
                        trainer = tf.train.GradientDescentOptimizer(
                            learning_rate=self.ph_lr)
                elif self.optim == 'momentum':
                    print('using momentum')
                    print('________________________')
                    trainer = tf.train.MomentumOptimizer(
                        learning_rate=self.ph_lr, momentum=0.9)
        if self.agent_num is None:
            gradsandvars = trainer.compute_gradients(self.total_loss, params)
            l2_norm = lambda t: tf.sqrt(tf.reduce_sum(tf.pow(t, 2)))
            if self.log_grads:
                for grad, var in gradsandvars:
                    tf.summary.histogram(var.name + '/gradient', l2_norm(grad))
                    tf.summary.histogram(var.name + '/value', l2_norm(var))
                    grad_mean = tf.reduce_mean(tf.abs(grad))
                    tf.summary.scalar(var.name + '/grad_mean', grad_mean)
                if self.decay:
                    tf.summary.scalar('decay_lr', self.decay_lr)
                self._summary = tf.summary.merge_all()
                tf.add_to_collection("summary_op", self._summary)
            if self.grad_clip > 0:
                grads, gradvars = zip(*gradsandvars)
                grads, _ = tf.clip_by_global_norm(grads, self.grad_clip)
                gradsandvars = list(zip(grads, gradvars))

            self._train = trainer.apply_gradients(gradsandvars,
                                                  global_step=self.global_step)
            self._updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            self._train = tf.group(self._train, self._updates)
            tf.add_to_collection("train_op", self._train)
        else:
            self._train = tf.get_collection("train_op")[0]
            if self.log_grads:
                self._summary = tf.get_collection("summary_op")[0]

        if MPI.COMM_WORLD.Get_rank() == 0:
            getsess().run(
                tf.variables_initializer(
                    tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)))
        bcast_tf_vars_from_root(
            getsess(), tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))

        self.all_visited_rooms = []
        self.all_scores = []
        self.nenvs = nenvs = len(env_fns)
        self.nlump = nlump
        self.lump_stride = nenvs // self.nlump
        self.envs = [
            VecEnv(env_fns[l * self.lump_stride:(l + 1) * self.lump_stride],
                   spaces=[self.env_ob_space, self.ac_space])
            for l in range(self.nlump)
        ]

        self.rollout = Rollout(ob_space=self.ob_space,
                               ac_space=self.ac_space,
                               nenvs=nenvs,
                               nsteps_per_seg=self.nsteps_per_seg,
                               nsegs_per_env=self.nsegs_per_env,
                               nlumps=self.nlump,
                               envs=self.envs,
                               policy=self.stochpol,
                               int_rew_coeff=self.int_coeff,
                               ext_rew_coeff=self.ext_coeff,
                               record_rollouts=self.use_recorder,
                               dynamics=dynamics,
                               exp_name=self.exp_name,
                               env_name=self.env_name,
                               video_log_freq=self.video_log_freq,
                               model_save_freq=self.model_save_freq,
                               use_apples=self.use_apples,
                               multi_envs=self.multi_envs,
                               lstm=self.lstm,
                               lstm1_size=self.lstm1_size,
                               lstm2_size=self.lstm2_size,
                               depth_pred=self.depth_pred,
                               early_stop=self.early_stop,
                               aux_input=self.aux_input)

        self.buf_advs = np.zeros((nenvs, self.rollout.nsteps), np.float32)
        self.buf_rets = np.zeros((nenvs, self.rollout.nsteps), np.float32)

        if self.normrew:
            self.rff = RewardForwardFilter(self.gamma)
            self.rff_rms = RunningMeanStd()
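            # Reward normalization: rff accumulates a discounted running sum of rewards
            # and rff_rms tracks its statistics (assumed standard curiosity-style scaling).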

        self.step_count = 0
        self.t_last_update = time.time()
        self.t_start = time.time()

    def start_interaction(self, env_fns, dynamics, nlump=2):
        self.loss_names, self._losses = zip(*list(self.to_report.items()))

        params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        if MPI.COMM_WORLD.Get_size() > 1:
            trainer = MpiAdamOptimizer(learning_rate=self.ph_lr,
                                       comm=MPI.COMM_WORLD)
        else:
            trainer = tf.train.AdamOptimizer(learning_rate=self.ph_lr)

        def ClipIfNotNone(grad):
            return tf.clip_by_value(grad, -25.0,
                                    25.0) if grad is not None else grad
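        # ClipIfNotNone clips each gradient element-wise to [-25, 25]; note this is
        # different from global-norm clipping (tf.clip_by_global_norm), which rescales
        # all gradients jointly by a single factor.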

        gradsandvars = trainer.compute_gradients(self.total_loss, params)
        gradsandvars = [(ClipIfNotNone(g), v) for g, v in gradsandvars]

        self._train = trainer.apply_gradients(gradsandvars)

        if MPI.COMM_WORLD.Get_rank() == 0:
            getsess().run(
                tf.variables_initializer(
                    tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)))
        bcast_tf_vars_from_root(
            getsess(), tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))

        self.all_visited_rooms = []
        self.all_scores = []
        self.nenvs = nenvs = len(env_fns)
        print('-------NENVS-------', self.nenvs)
        self.nlump = nlump
        print('----------NLUMPS-------', self.nlump)
        self.lump_stride = nenvs // self.nlump
        print('-------LSTRIDE----', self.lump_stride)
        print('--------OBS SPACE ---------', self.ob_space)
        print('-------------AC SPACE-----', self.ac_space)
        print('-----BEFORE VEC ENV------')
        self.envs = [
            VecEnv(env_fns[l * self.lump_stride:(l + 1) * self.lump_stride],
                   spaces=[self.ob_space, self.ac_space])
            for l in range(self.nlump)
        ]
        print('-----AFTER VEC ENV------')
        self.rollout = Rollout(ob_space=self.ob_space,
                               ac_space=self.ac_space,
                               nenvs=nenvs,
                               nsteps_per_seg=self.nsteps_per_seg,
                               nsegs_per_env=self.nsegs_per_env,
                               nlumps=self.nlump,
                               envs=self.envs,
                               policy=self.stochpol,
                               int_rew_coeff=self.int_coeff,
                               ext_rew_coeff=self.ext_coeff,
                               record_rollouts=self.use_recorder,
                               dynamics=dynamics,
                               exp_name=self.exp_name,
                               env_name=self.env_name,
                               to_eval=self.to_eval)

        self.buf_advs = np.zeros((nenvs, self.rollout.nsteps), np.float32)
        self.buf_rets = np.zeros((nenvs, self.rollout.nsteps), np.float32)

        if self.normrew:
            self.rff = RewardForwardFilter(self.gamma)
            self.rff_rms = RunningMeanStd()

        self.step_count = 0
        self.t_last_update = time.time()
        self.t_start = time.time()
        self.saver = tf.train.Saver(max_to_keep=5)
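
# `RewardForwardFilter` and `RunningMeanStd` are used above but not defined in this
# snippet. In curiosity/RND-style codebases they usually implement reward scaling by
# the running std of a discounted reward sum -- a minimal sketch under that
# assumption (hypothetical, for illustration only):
import numpy as np

class RewardForwardFilterSketch:
    """Keeps a per-env discounted running sum of rewards."""
    def __init__(self, gamma):
        self.gamma = gamma
        self.rewems = None  # running discounted return, one entry per env

    def update(self, rews):
        rews = np.asarray(rews, dtype=np.float32)
        if self.rewems is None:
            self.rewems = rews.copy()
        else:
            self.rewems = self.rewems * self.gamma + rews
        return self.rewems

# Typical use: feed update() output into a running mean/std tracker, then divide the
# raw rewards by the square root of that running variance so they have roughly unit scale.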