def callback(self, lcl, glb):
    self.iter += 1
    if self.iter == 1:
        self.sess = lcl['sess']
        self.callback_setup_saver()
        self.logging.writer_val.add_graph(lcl['sess'].graph)
        self.callback_val_vis(lcl,
                              glb,
                              num_rollouts=self.other_kwargs['num_valid'],
                              plot=True)
        return
    t = lcl['t']
    print_freq = lcl['print_freq']
    if t > self.trainer_kwargs['learning_starts']:
        if t % print_freq == 0:
            logger.error('Num Steps: %d' % (t))
            self.callback_logging(lcl, glb)
            self.callback_val_vis(
                lcl,
                glb,
                num_rollouts=self.other_kwargs['num_valid'],
                plot=True)
            logger.error('')
        if t % (print_freq * 15) == 0:
            self.callback_snapshot(lcl, glb)
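For reference, a minimal standalone sketch of the callback contract assumed above: a baselines-style learn() that passes its locals()/globals() to the callback each timestep and stops early if it returns True. The body here is illustrative only.

# Hedged sketch, assuming an OpenAI-baselines-style deepq.learn(...) that calls
# callback(locals(), globals()) once per timestep.
def simple_callback(lcl, glb):
    t = lcl['t']                    # current timestep exposed by learn()
    print_freq = lcl['print_freq']  # logging frequency exposed by learn()
    if print_freq is not None and t % print_freq == 0:
        print('Num Steps: %d' % t)
    return False                    # returning True would ask learn() to stop early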
Example #2
def crowdai_submit(seed, noise_type, layer_norm, evaluation, **kwargs):
    if 'restore_model_name' not in kwargs:
        logger.error(
            'You must specify the --restore-model-name in order to submit')
        sys.exit()
    remote_base = "http://grader.crowdai.org:1729"
    crowdai_token = kwargs['crowdai_token']
    crowdai_client = Client(remote_base)
    kwargs['crowdai_client'] = crowdai_client
    evaluate(seed, noise_type, layer_norm, evaluation=True, **kwargs)
Example #3
def get_tf_reward(env):
    while hasattr(env, "wrapped_env") or hasattr(env, "env") or hasattr(
            env, "get_tf_reward"):
        if hasattr(env, "get_tf_reward"):
            return env.get_tf_reward()
        elif hasattr(env, "wrapped_env"):
            env = env.wrapped_env
        else:
            env = env.env
    logger.error("env should have the attribution get_tf_reward()")
Example #4
def build():
    lrank, _lsize = mpi_util.get_local_rank_size(MPI.COMM_WORLD)
    if lrank == 0:
        dirname = os.path.dirname(__file__)
        if len(dirname):
            make_cmd = "QT_SELECT=5 make -C %s" % dirname
        else:
            make_cmd = "QT_SELECT=5 make"

        r = os.system(make_cmd)
        if r != 0:
            logger.error('coinrun: make failed')
            sys.exit(1)
    MPI.COMM_WORLD.barrier()
    def callback_val_vis(self, lcl, glb, num_rollouts, plot=False):
        act = lcl['act']
        global_step = lcl['t']
        with plt.style.context("fivethirtyeight"):
            plt.rcParams["axes.grid"] = True
            env = self.env_val
            env.reset_rng()
            obsss, actionss, rewardss = [], [], []
            ms = []
            for i in range(num_rollouts):
                obs, done = env.reset(), False
                obss, actions, rewards = [obs], [], []
                while not done:
                    action = act(obs[None], False)[0]
                    obs, rew, done, _ = env.step(action)
                    obss.append(obs)
                    actions.append(action)
                    rewards.append(rew)
                obss.pop()  #last obs is unnecessary
                obsss.append(obss)
                actionss.append(actions)
                rewardss.append(rewards)
                m = env.get_metrics()
                ms.append(m)

            metric_names, metric_vals = env.collect_metrics(ms)
            metric_summary_init = tf.summary.Summary()
            metric_summary_end = tf.summary.Summary()
            for k, v in zip(metric_names, metric_vals):
                add_value_to_summary(metric_summary_init,
                                     'metrics/{:s}'.format(k),
                                     v,
                                     log=True,
                                     tag_str='metrics/{:s}: '.format(k))
                add_value_to_summary(metric_summary_end,
                                     'metrics/{:s}'.format(k),
                                     v,
                                     log=False,
                                     tag_str='metrics/{:s}: '.format(k))

            self.logging.writer_val.add_summary(metric_summary_init,
                                                global_step)
            logger.error('')
Example #6
def set_seed(env, seed):
    random.seed(seed)
    np.random.seed(seed)
    try:
        import tensorflow as tf
        tf.set_random_seed(seed)
    except Exception as e:
        print(e)

    while hasattr(env, "wrapped_env") or hasattr(env, "env") or hasattr(
            env, "seed"):
        if hasattr(env, "seed"):
            temp_seed = env.seed(seed)
            if temp_seed is not None and temp_seed != []:
                logger.info("Seed: %d. Set seed successfully" % temp_seed[0])
                return
        if hasattr(env, "wrapped_env"):
            env = env.wrapped_env
        else:
            env = env.env
    logger.error("env should have the attribution seed()")
Example #7
def evaluate_one_episode(env, agent, nb_eval_steps, render):
    if nb_eval_steps <= 0:
        logger.error('evaluate_one_episode nb_eval_steps must be > 0')
    reward = 0.
    qs = []
    obs = env.reset()
    for step in range(nb_eval_steps):
        action, q = agent.pi(obs, apply_noise=False, compute_Q=True)
        obs, r, done, info = env.step(action)
        if render:
            env.render()
        reward += r
        qs.append(q)
        if done:
            #obs = env.reset()
            break  # the original baseline code didn't have this break statement, so would average multiple evaluation episodes
        elif step >= nb_eval_steps:
            logger.warn('evaluate_one_episode step', step,
                        'exceeded nb_eval_steps', nb_eval_steps,
                        'but done is False')
            #obs = env.reset()
            break
    return reward, np.mean(qs), step + 1
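A hedged usage sketch for evaluate_one_episode above, averaging over several evaluation episodes; the n_episodes parameter and the evaluate wrapper name are illustrative, and env/agent are whatever the caller already has.

import numpy as np


def evaluate(env, agent, nb_eval_steps=1000, n_episodes=5, render=False):
    # Run n_episodes evaluations and report the mean reward, mean Q, and mean length.
    rewards, qs, lengths = [], [], []
    for _ in range(n_episodes):
        reward, mean_q, steps = evaluate_one_episode(env, agent, nb_eval_steps, render)
        rewards.append(reward)
        qs.append(mean_q)
        lengths.append(steps)
    return np.mean(rewards), np.mean(qs), np.mean(lengths)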
    def find_character_in_frame(self, frame):
        mask = cv2.inRange(frame, self.lower_color, self.upper_color)
        output = cv2.bitwise_and(frame, frame, mask=mask)

        pix_x, pix_y, _ = np.where(output > 0)
        if pix_x.size != 0:
            prev_pix_x = pix_x
            pix_x = pix_x[np.where(pix_x > 19)]
            pix_y = pix_y[-pix_x.size:]

            # If the array has an even length, the median may not be an element of
            # the array, because it is the average of the two middle values.
            try:
                # Very rarely a nan will be received here
                median_x = int(np.median(pix_x))
                while median_x not in pix_x:
                    median_x += 1

                median_y = int(pix_y[np.where(pix_x == median_x)[0][0]])
            except Exception as e:
                logger.error("Exception: {}".format(e))
                logger.error("Pixel x: {}".format(pix_x))
                logger.error("Pixel y: {}".format(pix_y))
                logger.error("Previous pixel x: {}".format(prev_pix_x))
                roi = np.zeros([self.ego_h, self.ego_w, 3], dtype=np.uint8)
                return roi

        else:
            median_x = output.shape[0] // 2
            median_y = output.shape[1] // 2

        low_x = median_x - self.ego_h
        high_x = median_x + self.ego_h
        low_y = median_y - self.ego_w
        high_y = median_y + self.ego_w

        low_x = low_x if low_x > 0 else 0
        high_x = high_x if high_x < frame.shape[0] else frame.shape[0]
        low_y = low_y if low_y > 0 else 0
        high_y = high_y if high_y < frame.shape[1] else frame.shape[1]

        roi = frame[low_x:high_x, low_y:high_y]
        return roi
    def callback_snapshot(self, lcl, glb):
        model_file_name = os.path.join(self.logdir, 'snapshots', 'model')
        self.logging.saver.save(lcl['sess'],
                                model_file_name,
                                global_step=lcl['num_episodes'])
        logger.error('Saving model to: ', model_file_name)
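For the snapshot saved above, a hedged sketch of the matching restore step, assuming a TF1 tf.train.Saver like self.logging.saver and the same 'snapshots' directory; restore_latest_snapshot is an illustrative name.

import os

import tensorflow as tf


def restore_latest_snapshot(sess, saver, logdir):
    # Look up the newest checkpoint written by callback_snapshot and load it.
    ckpt = tf.train.latest_checkpoint(os.path.join(logdir, 'snapshots'))
    if ckpt is not None:
        saver.restore(sess, ckpt)
    return ckpt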
    def update(self):

        #Some logic gathering best ret, rooms etc using MPI.
        temp = sum(MPI.COMM_WORLD.allgather(self.local_rooms), [])
        temp = sorted(list(set(temp)))
        self.rooms = temp

        temp = sum(MPI.COMM_WORLD.allgather(self.scores), [])
        temp = sorted(list(set(temp)))
        self.scores = temp

        temp = sum(MPI.COMM_WORLD.allgather([self.local_best_ret]), [])
        self.best_ret = max(temp)

        eprews = MPI.COMM_WORLD.allgather(
            np.mean(list(self.I.statlists["eprew"])))
        local_best_rets = MPI.COMM_WORLD.allgather(self.local_best_ret)
        n_rooms = sum(MPI.COMM_WORLD.allgather([len(self.local_rooms)]), [])

        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.info(f"Rooms visited {self.rooms}")
            logger.info(f"Best return {self.best_ret}")
            logger.info(f"Best local return {sorted(local_best_rets)}")
            logger.info(f"eprews {sorted(eprews)}")
            logger.info(f"n_rooms {sorted(n_rooms)}")
            logger.info(f"Extrinsic coefficient {self.ext_coeff}")
            logger.info(f"Gamma {self.gamma}")
            logger.info(f"Gamma ext {self.gamma_ext}")
            logger.info(f"All scores {sorted(self.scores)}")

        #Normalize intrinsic rewards.
        rffs_int = np.array(
            [self.I.rff_int.update(rew) for rew in self.I.buf_rews_int.T])
        self.I.rff_rms_int.update(rffs_int.ravel())
        rews_int = self.I.buf_rews_int / np.sqrt(self.I.rff_rms_int.var)
        self.mean_int_rew = np.mean(rews_int)
        self.max_int_rew = np.max(rews_int)

        #Don't normalize extrinsic rewards.
        rews_ext = self.I.buf_rews_ext

        rewmean = self.I.buf_rews_int.mean()
        rewstd = self.I.buf_rews_int.std()
        rewmax = np.max(self.I.buf_rews_int)

        #Calculate intrinsic returns and advantages via GAE (a standalone NumPy sketch of this recursion follows this example).
        lastgaelam = 0
        for t in range(self.nsteps - 1, -1, -1):  # nsteps-1 ... 0
            if self.use_news:
                nextnew = self.I.buf_news[:, t + 1] if t + 1 < self.nsteps else self.I.buf_new_last
            else:
                nextnew = 0.0  #No dones for intrinsic reward.
            nextvals = self.I.buf_vpreds_int[:, t + 1] if t + 1 < self.nsteps else self.I.buf_vpred_int_last
            nextnotnew = 1 - nextnew
            delta = rews_int[:, t] + self.gamma * nextvals * nextnotnew - self.I.buf_vpreds_int[:, t]
            self.I.buf_advs_int[:, t] = lastgaelam = delta + self.gamma * self.lam * nextnotnew * lastgaelam
        rets_int = self.I.buf_advs_int + self.I.buf_vpreds_int

        #Calculate extrinsic returns and advantages.
        lastgaelam = 0
        for t in range(self.nsteps - 1, -1, -1):  # nsteps-1 ... 0
            #Use dones for extrinsic reward.
            nextnew = self.I.buf_news[:, t + 1] if t + 1 < self.nsteps else self.I.buf_new_last
            nextvals = self.I.buf_vpreds_ext[:, t + 1] if t + 1 < self.nsteps else self.I.buf_vpred_ext_last
            nextnotnew = 1 - nextnew
            delta = rews_ext[:, t] + self.gamma_ext * nextvals * nextnotnew - self.I.buf_vpreds_ext[:, t]
            self.I.buf_advs_ext[:, t] = lastgaelam = delta + self.gamma_ext * self.lam * nextnotnew * lastgaelam
        rets_ext = self.I.buf_advs_ext + self.I.buf_vpreds_ext

        #Combine the extrinsic and intrinsic advantages.
        self.I.buf_advs = self.int_coeff * self.I.buf_advs_int + self.ext_coeff * self.I.buf_advs_ext

        #Collects info for reporting.
        info = dict(
            advmean=self.I.buf_advs.mean(),
            advstd=self.I.buf_advs.std(),
            retintmean=rets_int.mean(),  # previously retmean
            retintstd=rets_int.std(),  # previously retstd
            retextmean=rets_ext.mean(),  # previously not there
            retextstd=rets_ext.std(),  # previously not there
            rewintmean_unnorm=rewmean,  # previously rewmean
            rewintmax_unnorm=rewmax,  # previously not there
            rewintmean_norm=self.mean_int_rew,  # previously rewintmean
            rewintmax_norm=self.max_int_rew,  # previously rewintmax
            rewintstd_unnorm=rewstd,  # previously rewstd
            vpredintmean=self.I.buf_vpreds_int.mean(),  # previously vpredmean
            vpredintstd=self.I.buf_vpreds_int.std(),  # previously vrpedstd
            vpredextmean=self.I.buf_vpreds_ext.mean(),  # previously not there
            vpredextstd=self.I.buf_vpreds_ext.std(),  # previously not there
            ev_int=np.clip(
                explained_variance(self.I.buf_vpreds_int.ravel(),
                                   rets_int.ravel()), -1, None),
            ev_ext=np.clip(
                explained_variance(self.I.buf_vpreds_ext.ravel(),
                                   rets_ext.ravel()), -1, None),
            rooms=SemicolonList(self.rooms),
            n_rooms=len(self.rooms),
            best_ret=self.best_ret,
            reset_counter=self.I.reset_counter)

        info['mem_available'] = psutil.virtual_memory().available

        to_record = {
            'acs': self.I.buf_acs,
            'rews_int': self.I.buf_rews_int,
            'rews_int_norm': rews_int,
            'rews_ext': self.I.buf_rews_ext,
            'vpred_int': self.I.buf_vpreds_int,
            'vpred_ext': self.I.buf_vpreds_ext,
            'adv_int': self.I.buf_advs_int,
            'adv_ext': self.I.buf_advs_ext,
            'ent': self.I.buf_ent,
            'ret_int': rets_int,
            'ret_ext': rets_ext,
        }

        if self.I.venvs[0].record_obs:
            if None in self.I.buf_obs:
                to_record['obs'] = self.I.buf_obs[None]
            else:
                to_record['obs'] = self.I.buf_obs['normal']

        self.recorder.record(bufs=to_record, infos=self.I.buf_epinfos)

        #Create feeddict for optimization.
        envsperbatch = self.I.nenvs // self.nminibatches
        ph_buf = [
            (self.stochpol.ph_ac, self.I.buf_acs),
            (self.ph_ret_int, rets_int),
            (self.ph_ret_ext, rets_ext),
            (self.ph_oldnlp, self.I.buf_nlps),
            (self.ph_adv, self.I.buf_advs),
        ]
        if self.I.mem_state is not NO_STATES:
            ph_buf.extend([
                (self.stochpol.ph_istate, self.I.seg_init_mem_state),
                (self.stochpol.ph_new, self.I.buf_news),
            ])

        #verbose = True
        verbose = False
        if verbose and self.is_log_leader:
            samples = np.prod(self.I.buf_advs.shape)
            logger.info(
                "buffer shape %s, samples_per_mpi=%i, mini_per_mpi=%i, samples=%i, mini=%i "
                % (str(self.I.buf_advs.shape), samples, samples //
                   self.nminibatches, samples * self.comm_train_size,
                   samples * self.comm_train_size // self.nminibatches))
            logger.info(" " * 6 + fmt_row(13, self.loss_names))

        to_record_attention = None
        attention_output = None
        if os.environ.get('EXPERIMENT_LVL') in ('attention', 'ego'):
            try:
                #attention_output = tf.get_default_graph().get_tensor_by_name("ppo/pol/augmented2/attention_output_combined:0")
                #attention_output = tf.get_default_graph().get_tensor_by_name("ppo/pol/augmented2/attention_output_combined/kernel:0")
                attention_output = tf.get_default_graph().get_tensor_by_name(
                    "ppo/pol/augmented2/attention_output_combined/Conv2D:0")
            except Exception as e:
                logger.error("Exception in attention_output: {}".format(e))
                attention_output = None

        epoch = 0
        start = 0
        #Optimizes on current data for several epochs.
        while epoch < self.nepochs:
            end = start + envsperbatch
            mbenvinds = slice(start, end, None)

            fd = {ph: buf[mbenvinds] for (ph, buf) in ph_buf}
            fd.update({self.ph_lr: self.lr, self.ph_cliprange: self.cliprange})

            if None in self.stochpol.ph_ob:
                fd[self.stochpol.ph_ob[None]] = np.concatenate([
                    self.I.buf_obs[None][mbenvinds],
                    self.I.buf_ob_last[None][mbenvinds, None]
                ], 1)
                assert list(fd[self.stochpol.ph_ob[None]].shape) == [self.I.nenvs//self.nminibatches, self.nsteps + 1] + list(self.ob_space.shape), \
                [fd[self.stochpol.ph_ob[None]].shape, [self.I.nenvs//self.nminibatches, self.nsteps + 1] + list(self.ob_space.shape)]

            else:
                fd[self.stochpol.ph_ob['normal']] = np.concatenate([
                    self.I.buf_obs['normal'][mbenvinds],
                    self.I.buf_ob_last['normal'][mbenvinds, None]
                ], 1)
                fd[self.stochpol.ph_ob['ego']] = np.concatenate([
                    self.I.buf_obs['ego'][mbenvinds],
                    self.I.buf_ob_last['ego'][mbenvinds, None]
                ], 1)

                assert list(fd[self.stochpol.ph_ob['normal']].shape) == [self.I.nenvs//self.nminibatches, self.nsteps + 1] + list(self.ob_space.spaces['normal'].shape), \
                [fd[self.stochpol.ph_ob['normal']].shape, [self.I.nenvs//self.nminibatches, self.nsteps + 1] + list(self.ob_space.spaces['normal'].shape)]
                assert list(fd[self.stochpol.ph_ob['ego']].shape) == [self.I.nenvs//self.nminibatches, self.nsteps + 1] + list(self.ob_space.spaces['ego'].shape), \
                [fd[self.stochpol.ph_ob['ego']].shape, [self.I.nenvs//self.nminibatches, self.nsteps + 1] + list(self.ob_space.spaces['ego'].shape)]

            fd.update({
                self.stochpol.ph_mean: self.stochpol.ob_rms.mean,
                self.stochpol.ph_std: self.stochpol.ob_rms.var**0.5
            })

            if attention_output is not None:
                _train_losses = [attention_output, self._train]
            else:
                _train_losses = [self._train]

            ret = tf.get_default_session().run(self._losses + _train_losses,
                                               feed_dict=fd)[:-1]

            if attention_output is not None:
                attn_output = ret[-1]
                ret = ret[:-1]
                if None in self.I.buf_obs:
                    outshape = list(
                        self.I.buf_obs[None][mbenvinds].shape[:2]) + list(
                            attn_output.shape[1:])
                else:
                    # does not matter if it's normal or ego, the first 2 axes are the same
                    outshape = list(
                        self.I.buf_obs['normal'][mbenvinds].shape[:2]) + list(
                            attn_output.shape[1:])
                attn_output = np.reshape(attn_output, outshape)
                attn_output = attn_output[:, :, :, :, :64]

            if not self.testing:
                lossdict = dict(zip(self.loss_names, ret))
            else:
                lossdict = {}
            #Synchronize the lossdict across mpi processes, otherwise weights may be rolled back on one process but not another.
            _maxkl = lossdict.pop('maxkl')
            lossdict = dict_gather(self.comm_train, lossdict, op='mean')
            maxmaxkl = dict_gather(self.comm_train, {"maxkl": _maxkl},
                                   op='max')
            lossdict["maxkl"] = maxmaxkl["maxkl"]
            if verbose and self.is_log_leader:
                logger.info(
                    "%i:%03i %s" %
                    (epoch, start,
                     fmt_row(13, [lossdict[n] for n in self.loss_names])))
            start += envsperbatch
            if start == self.I.nenvs:
                epoch += 1
                start = 0

                if attention_output is not None:
                    if to_record_attention is None:
                        to_record_attention = attn_output
                    else:
                        to_record_attention = np.concatenate(
                            [to_record_attention, attn_output])

        # if to_record_attention is not None:
        #     if None in self.I.buf_obs:
        #         to_record['obs'] = self.I.buf_obs[None]
        #     else:
        #         to_record['obs'] = self.I.buf_obs['normal']

        #     to_record['attention'] = to_record_attention

        to_record_attention = None

        if self.is_train_leader:
            self.I.stats["n_updates"] += 1
            info.update([('opt_' + n, lossdict[n]) for n in self.loss_names])
            tnow = time.time()
            info['tps'] = self.nsteps * self.I.nenvs / (tnow - self.I.t_last_update)
            info['time_elapsed'] = time.time() - self.t0
            self.I.t_last_update = tnow
        self.stochpol.update_normalization(  # Necessary for continuous control tasks with odd obs ranges, only implemented in mlp policy,
            ob=self.I.buf_obs  # NOTE: not shared via MPI
        )
        return info
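The standalone sketch referenced in the GAE loops above: a NumPy-only version of the same advantage/return recursion, assuming a single reward stream of shape [nenvs, nsteps] plus bootstrap values for the step after the last one. The function and argument names are illustrative.

import numpy as np


def gae_advantages(rews, vpreds, vpred_last, news, new_last, gamma, lam, use_news=True):
    # rews, vpreds, news: [nenvs, nsteps]; vpred_last, new_last: [nenvs].
    nenvs, nsteps = rews.shape
    advs = np.zeros_like(rews, dtype=np.float64)
    lastgaelam = np.zeros(nenvs)
    for t in range(nsteps - 1, -1, -1):  # nsteps-1 ... 0
        nextnew = news[:, t + 1] if t + 1 < nsteps else new_last
        if not use_news:
            nextnew = 0.0  # e.g. the intrinsic stream ignores episode ends
        nextvals = vpreds[:, t + 1] if t + 1 < nsteps else vpred_last
        nextnotnew = 1 - nextnew
        delta = rews[:, t] + gamma * nextvals * nextnotnew - vpreds[:, t]
        advs[:, t] = lastgaelam = delta + gamma * lam * nextnotnew * lastgaelam
    rets = advs + vpreds
    return advs, rets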
Example #11
def learn(env_list, policy_fn, *,
          timesteps_per_actorbatch,  # timesteps per actor per update
          clip_param, entcoeff,  # clipping parameter epsilon, entropy coeff
          optim_epochs, optim_stepsize, optim_batchsize,  # optimization hypers
          gamma, lam,  # advantage estimation
          max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0,  # time constraint
          callback=None,  # you can do anything in the callback, since it takes locals(), globals()
          adam_epsilon=1e-5,
          schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
          end_timesteps,
          newround
          ):

    env = env_list.popleft()
    # Open a file to record the accumulated rewards
    rewFile = open("reward/%d.txt" % (env.seed), "ab")
    resptimeFile = open("respTime/%d.txt" % (env.seed), "ab")
    pktnumFile = open("pktNum/%d.txt" % (env.seed), "ab")

    # Setup losses and stuff
    # ----------------------------------------
    vf_ob_space = env.vf_observation_space
    # ac_ob_space = env.ac_observation_space
    ac_space = env.action_space
    pi = policy_fn("pi1", vf_ob_space, ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", vf_ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(name="atarg", dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(name="ret", dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    vf_ob = U.get_placeholder_cached(name="vf_ob")
    nn_in = U.get_placeholder_cached(name="nn_in")  # placeholder for nn input
    ac = pi.pdtype.sample_placeholder([None])

    # kloldnew = oldpi.pd.kl(pi.pd)
    # ent = pi.pd.entropy()
    pb_old_holder = tf.placeholder(name="pd_old", dtype=tf.float32, shape=[None, ac_space.n])
    pb_new_holder = tf.placeholder(name="pd_new", dtype=tf.float32, shape=[None, ac_space.n])
    oldpd = CategoricalPd(pb_old_holder)
    pd = CategoricalPd(pb_new_holder)
    kloldnew = oldpd.kl(pd)
    ent = pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    # ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    ratio = tf.placeholder(dtype=tf.float32, shape=[None])
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg  #
    pol_surr = - tf.reduce_mean(tf.minimum(surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    vf_var_list = [v for v in var_list if v.name.split("/")[1].startswith("vf")]
    pol_var_list = [v for v in var_list if v.name.split("/")[1].startswith("pol")]

    vf_grad = U.function([vf_ob, ret], U.flatgrad(vf_loss, vf_var_list))  # gradient of value function
    pol_nn_grad = U.function([nn_in], U.flatgrad(pi.nn_out, pol_var_list))
    vf_adam = MpiAdam(vf_var_list, epsilon=adam_epsilon)
    pol_adam = MpiAdam(pol_var_list, epsilon=adam_epsilon)
    clip_para = U.function([lrmult], [clip_param])

    assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv)
                                                    for (oldv, newv) in
                                                    zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([vf_ob, atarg, ret, lrmult, ratio, pb_new_holder, pb_old_holder], losses)

    U.initialize()
    vf_adam.sync()
    pol_adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True)

    end_timestep = end_timesteps.popleft()
    new = newround.popleft()
    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=10)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=10)  # rolling buffer for episode rewards
    env_so_far = 1

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0,
                max_seconds > 0]) == 1, "Only one time constraint permitted"

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            rewFile.close()
            resptimeFile.close()
            pktnumFile.close()
            para = {}
            for vf in range(len(vf_var_list)):
                # para[vf_var_list[vf].name] = vf_var_list[vf].eval()
                para[vf] = vf_var_list[vf].eval()
            for pol in range(len(pol_var_list)):
                # para[pol_var_list[pol].name] = pol_var_list[pol].eval()
                para[pol + len(vf_var_list)] = pol_var_list[pol].eval()
            f = open("network/%d-%d.txt" % (env.seed, timesteps_so_far), "wb")
            pickle.dump(para, f)
            f.close()
            print("============================= policy is stored =================================")
            break
        elif end_timestep and timesteps_so_far >= end_timestep:
            env = env_list.popleft()
            seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True)
            end_timestep = end_timesteps.popleft()
            new = newround.popleft()
            env_so_far += 1
            if True:
                para = {}
                for vf in range(len(vf_var_list)):
                    # para[vf_var_list[vf].name] = vf_var_list[vf].eval()
                    para[vf] = vf_var_list[vf].eval()
                for pol in range(len(pol_var_list)):
                    # para[pol_var_list[pol].name] = pol_var_list[pol].eval()
                    para[pol + len(vf_var_list)] = pol_var_list[pol].eval()
                f = open("network/%d-%d.txt" % (env.seed, timesteps_so_far), "wb")
                pickle.dump(para, f)
                f.close()
            print("======================== new environment (%s network settings left) ===========================" % len(env_list))
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break
        elif timesteps_so_far == 0:
            para = {}
            for vf in range(len(vf_var_list)):
                # para[vf_var_list[vf].name] = vf_var_list[vf].eval()
                para[vf] = vf_var_list[vf].eval()
            for pol in range(len(pol_var_list)):
                # para[pol_var_list[pol].name] = pol_var_list[pol].eval()
                para[pol + len(vf_var_list)] = pol_var_list[pol].eval()
            f = open("network/%d-%d.txt" % (env.seed, timesteps_so_far), "wb")
            pickle.dump(para, f)
            f.close()

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i, Environment %i ************" % (iters_so_far, env_so_far))

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # for vf in range(len(vf_var_list)):
        #     print(vf_var_list[vf].name, vf_var_list[vf].eval())
        # for pol in range(len(pol_var_list)):
        #     print(pol_var_list[pol].name, pol_var_list[pol].eval())

        record_reward(rewFile, sum(seg["rew"]))
        record_reward(resptimeFile, sum(seg["resptime"]))
        record_reward(pktnumFile, sum(seg["pktnum"]))
        print("total rewards for Iteration %s: %s" % (iters_so_far, sum(seg["rew"])))
        print("average response time: %s, num of pkts: %s" % (sum(seg["resptime"])/sum(seg["pktnum"]), sum(seg["pktnum"])))
        prob = collections.Counter(seg["ac"])  # a dict where elements are stored as dictionary keys and their counts are stored as dictionary values.
        for key in prob:
            prob[key] = prob[key]/len(seg["ac"])
        print("percentage of choosing each controller: %s" % (prob))

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        vf_ob, ac_ob, ac, atarg, tdlamret = seg["vf_ob"], seg['ac_ob'], seg["ac"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before udpate
        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(vf_ob=vf_ob, ac_ob=ac_ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or vf_ob.shape[0]

        # if hasattr(pi, "vf_ob_rms"): pi.vf_ob_rms.update(vf_ob)  # update running mean/std for policy
        # if hasattr(pi, "nn_in_rms"):
        #     temp = ac_ob.reshape(-1,ac_ob.shape[2])
        #     pi.nn_in_rms.update(temp)

        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = []  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                # calculate the value function gradient
                vf_g = vf_grad(batch["vf_ob"], batch["vtarg"])
                vf_adam.update(vf_g, optim_stepsize * cur_lrmult)

                # calculate the policy gradient
                pol_g = []
                ratios = []
                pbs_new_batch = []
                pbs_old_batch = []
                e = clip_para(cur_lrmult)[0]
                for sample_id in range(optim_batchsize):
                    sample_ac_ob = batch["ac_ob"][sample_id]
                    sample_ac = batch["ac"][sample_id]
                    probs_new = pi.calculate_ac_prob(sample_ac_ob)
                    prob_new = probs_new[sample_ac]
                    probs_old = oldpi.calculate_ac_prob(sample_ac_ob)
                    prob_old = probs_old[sample_ac]
                    if prob_old == 0:
                        logger.error("pi_old = 0 in %s th iteration %s th epoch %s th sample..." % (iters_so_far, _, sample_id))
                    r = prob_new / prob_old
                    ratios.append(r)
                    pbs_new_batch.append(probs_new)
                    pbs_old_batch.append(probs_old)
                    if (r > 1.0 + e and batch["atarg"][sample_id] > 0) or (r < 1.0 - e and batch["atarg"][sample_id] < 0) or r == 0:
                        dnn_dtheta = pol_nn_grad(sample_ac_ob[0].reshape(1, -1))
                        pol_g.append(0.*dnn_dtheta)
                    else:
                        nn = pi.calculate_ac_value(sample_ac_ob)
                        denominator = np.power(sum(nn), 2)
                        sorted_ind = np.argsort(nn)  # sort the array in ascending order
                        if len(probs_new) == 2:
                            if sample_ac == 0:
                                numerator1 = nn[1]*pol_nn_grad(sample_ac_ob[0].reshape(1,-1))
                                numerator2 = nn[0] * pol_nn_grad(sample_ac_ob[1].reshape(1, -1))
                                dpi_dtheta = -(numerator1-numerator2)/denominator
                            else:
                                numerator1 = nn[1]*pol_nn_grad(sample_ac_ob[0].reshape(1,-1))
                                numerator2 = nn[0]*pol_nn_grad(sample_ac_ob[1].reshape(1,-1))
                                dpi_dtheta = -(numerator2 - numerator1)/denominator

                            # numerator1 = nn[sorted_ind[0]]*pol_nn_grad(sample_ac_ob[sorted_ind[1]].reshape(1,-1))
                            # numerator2 = nn[sorted_ind[1]]*pol_nn_grad(sample_ac_ob[sorted_ind[0]].reshape(1,-1))
                            # dpi_dtheta = (numerator1-numerator2)/denominator

                        elif len(probs_new) == 3:
                            if sample_ac == sorted_ind[0]:
                                # the controller with the lowest probability can still be chosen because its probability is not zero
                                dnn_dtheta = pol_nn_grad(sample_ac_ob[0].reshape(1, -1))
                                pol_g.append(0. * dnn_dtheta)
                            else:
                                numerator1 = sum(nn) * (pol_nn_grad(sample_ac_ob[sample_ac].reshape(1,-1)) + 0.5 * pol_nn_grad(
                                    sample_ac_ob[sorted_ind[0]].reshape(1, -1)))
                                numerator2 = (nn[sample_ac] + 0.5 * nn[sorted_ind[0]]) * pol_nn_grad(sample_ac_ob)
                                dpi_dtheta = -(numerator1 - numerator2) / denominator
                        else:
                            if sample_ac == sorted_ind[-1] or sample_ac == sorted_ind[-2]:
                                numerator1 = sum(nn) * (pol_nn_grad(sample_ac_ob[sample_ac].reshape(1, -1)) + 0.5 * pol_nn_grad(sample_ac_ob[sorted_ind[0:-2]]))
                                numerator2 = (nn[sample_ac]+0.5*sum(nn[sorted_ind[0:-2]])) * pol_nn_grad(sample_ac_ob)
                                dpi_dtheta = -(numerator1 - numerator2) / denominator
                            else:
                                dnn_dtheta = pol_nn_grad(sample_ac_ob[0].reshape(1, -1))
                                pol_g.append(0. * dnn_dtheta)
                        pol_g.append(batch["atarg"][sample_id] * dpi_dtheta / prob_old)

                pol_g_mean = np.mean(np.array(pol_g), axis=0)
                pol_adam.update(pol_g_mean, optim_stepsize * cur_lrmult)

                newlosses = compute_losses(batch["vf_ob"], batch["atarg"], batch["vtarg"],
                                           cur_lrmult, np.array(ratios), np.array(pbs_new_batch), np.array(pbs_old_batch))

                # adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        # losses = []
        # for batch in d.iterate_once(optim_batchsize):
        #     newlosses = compute_losses(batch["vf_ob"], batch["ac_ob"], batch["ac"], batch["atarg"], batch["vtarg"],
        #                                cur_lrmult)
        #     losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        if len(lenbuffer) == 0:
            logger.record_tabular("EpLenMean", 0)
            logger.record_tabular("EpRewMean", 0)
        else:
            logger.record_tabular("EpLenMean", np.mean(lenbuffer))
            logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()
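A hedged NumPy sketch of the clipped surrogate that the example above builds symbolically (pol_surr from surr1/surr2), useful for sanity-checking the graph on small arrays; ppo_clip_surrogate is an illustrative name.

import numpy as np


def ppo_clip_surrogate(ratio, atarg, clip_param):
    # ratio = pi_new(a|s) / pi_old(a|s); atarg = standardized advantage estimates.
    surr1 = ratio * atarg
    surr2 = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
    return -np.mean(np.minimum(surr1, surr2))  # pessimistic surrogate, L^CLIP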
Example #12
        def callback(locals, globals):
            if that.method != "ddpg":
                if load_policy is not None and locals[iter_name] == 0:
                    # noinspection PyBroadException
                    try:
                        utils.load_state(load_policy)
                        if MPI.COMM_WORLD.Get_rank() == 0:
                            logger.info("Loaded policy network weights from %s." % load_policy)
                            # save TensorFlow summary (contains at least the graph definition)
                    except:
                        logger.error("Failed to load policy network weights from %s." % load_policy)
                if MPI.COMM_WORLD.Get_rank() == 0 and locals[iter_name] == 0:
                    _ = tf.summary.FileWriter(folder, tf.get_default_graph())
            if MPI.COMM_WORLD.Get_rank() == 0 and locals[iter_name] % save_every == 0:
                print('Saving video and checkpoint for policy at iteration %i...' %
                      locals[iter_name])
                ob = env.reset()
                images = []
                rewards = []
                max_reward = 1.  # if any reward > 1, we have to rescale
                lower_part = video_height // 5
                for i in range(episode_length):
                    if that.method == "ddpg":
                        ac, _ = locals['agent'].pi(ob, apply_noise=False, compute_Q=False)
                    elif that.method == "sql":
                        ac, _ = locals['policy'].get_action(ob)
                    elif isinstance(locals['pi'], GaussianMlpPolicy):
                        ac, _, _ = locals['pi'].act(np.concatenate((ob, ob)))
                    else:
                        ac, _ = locals['pi'].act(False, ob)
                    ob, rew, new, _ = env.step(ac)
                    images.append(render_frames(env))
                    if plot_rewards:
                        rewards.append(rew)
                        max_reward = max(rew, max_reward)
                    if new:
                        break

                orange = np.array([255, 163, 0])
                red = np.array([255, 0, 0])
                video = []
                width_factor = 1. / episode_length * video_width
                for i, imgs in enumerate(images):
                    for img in imgs:
                        img[-lower_part, :10] = orange
                        img[-lower_part, -10:] = orange
                        if episode_length < video_width:
                            p_rew_x = 0
                            for j, r in enumerate(rewards[:i]):
                                rew_x = int(j * width_factor)
                                if r < 0:
                                    img[-1:, p_rew_x:rew_x] = red
                                else:
                                    rew_y = int(r / max_reward * lower_part)
                                    img[-rew_y - 1:, p_rew_x:rew_x] = orange
                                p_rew_x = rew_x
                        else:
                            for j, r in enumerate(rewards[:i]):
                                rew_x = int(j * width_factor)
                                if r < 0:
                                    img[-1:, rew_x] = red
                                else:
                                    rew_y = int(r / max_reward * lower_part)
                                    img[-rew_y - 1:, rew_x] = orange
                    video.append(np.hstack(imgs))

                imageio.mimsave(
                    os.path.join(folder, "videos", "%s_%s_iteration_%i.mp4" %
                                 (that.environment, that.method, locals[iter_name])),
                    video,
                    fps=60)
                env.reset()

                if that.method != "ddpg":
                    utils.save_state(os.path.join(that.folder, "checkpoints", "%s_%i" %
                                                 (that.environment, locals[iter_name])))
Example #13
def main(_):
    config = tf.ConfigProto()
    config.device_count['GPU'] = 1
    config.gpu_options.allow_growth = True
    config.intra_op_parallelism_threads = 1
    config.inter_op_parallelism_threads = 1

    config_name = FLAGS.config_name
    env_str, trainer_str, other_str = config_name.split('.')
    other_kwargs = get_other_args(other_str)

    env_kwargs = SocialNetworkGraphEnv.get_env_args(env_str)

    if env_kwargs['method_name'] == 'dqnV0':
        trainer_kwargs, trainer_name = get_dqn_v0_args(trainer_str), 'dqnV0'
    elif env_kwargs['method_name'] == 'randomV0':
        trainer_kwargs, trainer_name = get_random_v0_args(
            trainer_str), 'randomV0'
    elif env_kwargs['method_name'] == 'exhaustiveV0':
        trainer_kwargs, trainer_name = get_exhaustive_v0_args(
            trainer_str), 'exhaustiveV0'
    elif env_kwargs['method_name'] == 'greedyV0':
        trainer_kwargs, trainer_name = get_greedy_v0_args(
            trainer_str), 'greedyV0'
    else:
        assert False, 'unknown method_name: %s' % env_kwargs['method_name']

    logdir = FLAGS.logdir_prefix + FLAGS.config_name + FLAGS.logdir_suffix
    logger.configure(logdir)
    logger.error('env_kwargs: ', env_kwargs)
    logger.error('other_kwargs: ', other_kwargs)
    logger.error('%s_kwargs: ' % (trainer_name), trainer_kwargs)

    tf.set_random_seed(other_kwargs['seed'])
    random.seed(other_kwargs['seed'])
    np.random.seed(other_kwargs['seed'])

    if env_kwargs['method_name'] == 'dqnV0':
        dqnTrainer = DQNTrainer(
            env_name=other_kwargs['social_network_graph_env'],
            env_kwargs=env_kwargs,
            trainer_kwargs=trainer_kwargs,
            other_kwargs=other_kwargs,
            logdir=logdir)
        dqnTrainer.train(config=config)
    elif env_kwargs['method_name'] == 'randomV0':
        randomTrainer = RandomTrainer(
            env_name=other_kwargs['social_network_graph_env'],
            env_kwargs=env_kwargs,
            trainer_kwargs=trainer_kwargs,
            other_kwargs=other_kwargs,
            logdir=logdir)
        randomTrainer.train()
    elif env_kwargs['method_name'] == 'exhaustiveV0':
        exhaustiveTrainer = ExhaustiveTrainer(
            env_name=other_kwargs['social_network_graph_env'],
            env_kwargs=env_kwargs,
            trainer_kwargs=trainer_kwargs,
            other_kwargs=other_kwargs,
            logdir=logdir)

        exhaustiveTrainer.train()
    elif env_kwargs['method_name'] == 'greedyV0':
        greedyTrainer = GreedyTrainer(
            env_name=other_kwargs['social_network_graph_env'],
            env_kwargs=env_kwargs,
            trainer_kwargs=trainer_kwargs,
            other_kwargs=other_kwargs,
            logdir=logdir)
        greedyTrainer.train()
    else:
        assert False, 'unknown method_name: %s' % env_kwargs['method_name']