Example no. 1
    def train(self, num_epochs=None):
        num_epochs = self.num_epochs if num_epochs is None else num_epochs

        if num_epochs > 0:
            # Train one epoch at a time so the validation accuracy can be
            # logged after every epoch; keep the best accuracy seen so far.
            max_acc = 0
            for i in range(num_epochs):
                self.model.fit(
                    self.dataset_train,
                    epochs=1,
                    steps_per_epoch=self.train_steps_per_epoch,
                    callbacks=self.callbacks
                )

                valid_info = self.model.evaluate(self.dataset_valid, steps=self.valid_steps_per_epoch)

                # evaluate() returns [loss, accuracy]; report accuracy as a percentage.
                valid_loss, valid_acc = valid_info[0], valid_info[1] * 100

                max_acc = max(max_acc, valid_acc)
                logger.info(jm(epoch=i, validation_loss=valid_loss, validation_acc=float(valid_acc)))
            logger.info(jm(type='result', acc=float(max_acc)))
            return max_acc
        elif num_epochs == 0:
            # No training requested: evaluate the model as-is on the validation set.
            valid_info = self.model.evaluate(self.dataset_valid, steps=self.valid_steps_per_epoch)

            valid_loss, valid_acc = valid_info[0], valid_info[1] * 100

            logger.info(jm(epoch=0, validation_loss=valid_loss, validation_acc=float(valid_acc)))
            logger.info(jm(type='result', acc=float(valid_acc)))
            return valid_acc
        else:
            raise RuntimeError(f'Number of epochs should be >= 0: {num_epochs}')
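The method above indexes into the list returned by model.evaluate; for a tf.keras model compiled with a single metric, evaluate returns [loss, metric]. A minimal sketch of that behavior with a toy classifier (the model, data, and shapes are illustrative only, not part of the original class):

import tensorflow as tf

# Toy model, only to illustrate the shape of evaluate()'s return value.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(2, activation='softmax', input_shape=(4,))
])
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

x = tf.random.uniform((8, 4))
y = tf.zeros((8,), dtype=tf.int32)

valid_info = model.evaluate(x, y, verbose=0)  # -> [loss, accuracy]
valid_loss, valid_acc = valid_info[0], valid_info[1] * 100
print(valid_loss, valid_acc)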
    def train(self, num_epochs=None):
        num_epochs = self.num_epochs if num_epochs is None else num_epochs

        if num_epochs > 0:
            max_rmetric = -math.inf
            for i in range(num_epochs):
                self.model.fit(self.dataset_train,
                               epochs=1,
                               steps_per_epoch=self.train_steps_per_epoch,
                               callbacks=self.callbacks)

                y_orig, y_pred = self.predict()

                try:
                    # Evaluate the reward metric on the unnormalized predictions.
                    unnormalize_rmetric = self.sess.run(
                        self.reward_metric_op, {
                            self.y_true_ph: y_orig,
                            self.y_pred_ph: y_pred
                        })
                    # Reduce a per-sample metric to a single scalar.
                    if len(np.shape(unnormalize_rmetric)) > 1:
                        unnormalize_rmetric = np.mean(unnormalize_rmetric)
                except ValueError:
                    # A failed evaluation is logged and penalized with the
                    # smallest representable float32 value.
                    logger.error(traceback.format_exc())
                    unnormalize_rmetric = np.finfo('float32').min

                max_rmetric = max(max_rmetric, unnormalize_rmetric)
                logger.info(jm(epoch=i, rmetric=float(unnormalize_rmetric)))

            logger.info(jm(type='result', rmetric=float(max_rmetric)))
            return max_rmetric

        elif num_epochs == 0:
            # No training requested: score the model as-is.
            y_orig, y_pred = self.predict()

            try:
                unnormalize_rmetric = self.sess.run(self.reward_metric_op, {
                    self.y_true_ph: y_orig,
                    self.y_pred_ph: y_pred
                })
                if len(np.shape(unnormalize_rmetric)) > 1:
                    unnormalize_rmetric = np.mean(unnormalize_rmetric)
            except ValueError:
                logger.error(traceback.format_exc())
                unnormalize_rmetric = np.finfo('float32').min

            logger.info(jm(epoch=0, rmetric=float(unnormalize_rmetric)))
            logger.info(jm(type='result', rmetric=float(unnormalize_rmetric)))
            return unnormalize_rmetric
        else:
            raise RuntimeError(
                f'Number of epochs should be >= 0: {num_epochs}')
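Both methods above rely on module-level names that the snippets do not show (math, np, traceback, logger, jm). A minimal sketch of the preamble they assume; jm is written here as a hypothetical JSON-message formatter and the real helper may differ:

import json
import logging
import math
import traceback

import numpy as np

logger = logging.getLogger(__name__)


def jm(**kwargs):
    # Hypothetical stand-in for the jm helper used in the snippets:
    # serialize the keyword arguments as a single JSON log message.
    return json.dumps(kwargs)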
Example no. 3
    def train(self, num_epochs=None):
        num_epochs = self.num_epochs if num_epochs is None else num_epochs

        if num_epochs > 0:
            min_mse = math.inf
            for i in range(num_epochs):
                self.model.fit(self.dataset_train,
                               epochs=1,
                               steps_per_epoch=self.train_steps_per_epoch,
                               callbacks=self.callbacks)

                y_orig, y_pred = self.predict()

                try:
                    # MSE on the unnormalized targets; a failed evaluation is
                    # penalized with the largest representable float32 value.
                    unnormalize_mse = mean_squared_error(y_orig, y_pred)
                except ValueError:
                    logger.error(traceback.format_exc())
                    unnormalize_mse = np.finfo('float32').max

                # self.train_history[f'{self.metrics_name[0]}_valid'] = unnormalize_mse

                min_mse = min(min_mse, unnormalize_mse)
                logger.info(jm(epoch=i, validation_mse=float(unnormalize_mse)))

            logger.info(jm(type='result', mse=float(min_mse)))
            return min_mse
        elif num_epochs == 0:
            y_orig, y_pred = self.predict()

            try:
                unnormalize_mse = mean_squared_error(y_orig, y_pred)
            except ValueError:
                logger.error(traceback.format_exc())
                unnormalize_mse = np.finfo('float32').max

            logger.info(jm(epoch=0, validation_mse=float(unnormalize_mse)))
            logger.info(jm(type='result', mse=float(unnormalize_mse)))
            return unnormalize_mse
        else:
            raise RuntimeError(
                f'Number of epochs should be >= 0: {num_epochs}')
    def train(self, num_epochs=None):
        num_epochs = self.num_epochs if num_epochs is None else num_epochs

        # Track the best (lowest) validation MSE seen across the epochs.
        min_mse = math.inf
        for i in range(num_epochs):
            self.model.fit(self.dataset_train,
                           epochs=1,
                           steps_per_epoch=self.train_steps_per_epoch,
                           callbacks=self.callbacks)

            y_orig, y_pred = self.predict()

            unnormalize_mse = mean_squared_error(y_orig, y_pred)

            self.train_history[
                f'{self.metrics_name[0]}_valid'] = unnormalize_mse

            min_mse = min(min_mse, unnormalize_mse)
            logger.info(jm(epoch=i, validation_mse=float(unnormalize_mse)))

        logger.info(jm(type='result', mse=float(min_mse)))
        return min_mse
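The regression variants above score a model with mean_squared_error; the call signature matches the scikit-learn metric, which is assumed here. A small worked example under that assumption:

import numpy as np
from sklearn.metrics import mean_squared_error  # assumed source of the helper

y_orig = np.array([1.0, 2.0, 3.0])
y_pred = np.array([1.1, 1.9, 3.2])

# Mean of the squared residuals: (0.1**2 + 0.1**2 + 0.2**2) / 3 = 0.02
print(mean_squared_error(y_orig, y_pred))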
    def train(self, num_epochs=None):
        num_epochs = self.num_epochs if num_epochs is None else num_epochs

        # Track the best validation accuracy (in percent) seen across the epochs.
        max_acc = 0
        for i in range(num_epochs):
            self.model.fit(self.dataset_train,
                           epochs=1,
                           steps_per_epoch=self.train_steps_per_epoch,
                           callbacks=self.callbacks)

            valid_info = self.model.evaluate(self.dataset_valid,
                                             steps=self.valid_steps_per_epoch)

            valid_loss, valid_acc = valid_info[0], valid_info[1] * 100

            max_acc = max(max_acc, valid_acc)
            logger.info(
                jm(epoch=i,
                   validation_loss=valid_loss,
                   validation_acc=float(valid_acc)))
        logger.info(jm(type='result', acc=float(max_acc)))
        return max_acc
Example no. 6
def train(num_episodes, seed, space, evaluator, num_episodes_per_batch):

    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:  # rank zero simulates the use of a parameter server
        pass
    else:
        # Derive a distinct seed for each worker from its MPI rank.
        workerseed = seed + 10000 * rank if seed is not None else None
        set_global_seeds(workerseed)

        # MAKE ENV_NAS
        structure = space['create_structure']['func'](
            **space['create_structure']['kwargs'])

        # One timestep per node of the structure; one batch per group of episodes.
        num_nodes = structure.num_nodes
        timesteps_per_actorbatch = num_nodes * num_episodes_per_batch
        num_timesteps = timesteps_per_actorbatch * num_episodes

        max_timesteps = num_timesteps

        env = NasEnv(space, evaluator, structure)

        seg_gen = traj_segment_generator(env, timesteps_per_actorbatch)

        timesteps_so_far = 0
        iters_so_far = 0

        cond = sum([max_timesteps > 0])
        assert cond == 1, f"Only one time constraint permitted: cond={cond}, max_timesteps={max_timesteps}"

        while True:
            if max_timesteps and timesteps_so_far >= max_timesteps:
                break

            logger.log("********** Iteration %i ************" % iters_so_far)

            seg = seg_gen.__next__()
            dh_logger.info(
                jm(type='seg', rank=MPI.COMM_WORLD.Get_rank(), **seg))
            # Each yielded segment covers timesteps_per_actorbatch timesteps;
            # without this increment the loop would never reach max_timesteps.
            timesteps_so_far += timesteps_per_actorbatch
            iters_so_far += 1

        env.close()
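The timestep budget in the worker branch is plain arithmetic on the search-space size. A quick worked example with made-up numbers (the real values depend on the generated structure and the run configuration):

# Hypothetical values, for illustration only.
num_nodes = 10               # nodes in the generated structure
num_episodes_per_batch = 8   # episodes gathered per batch
num_episodes = 100           # total episodes to run

timesteps_per_actorbatch = num_nodes * num_episodes_per_batch  # 80
num_timesteps = timesteps_per_actorbatch * num_episodes        # 8000
print(timesteps_per_actorbatch, num_timesteps)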
    def step_wait(self):
        # The observation for each environment is its most recent action.
        obs = [
            np.array([float(action_seq[-1])])
            for action_seq in self.action_buffers
        ]

        if len(self.action_buffers[0]) < self.num_actions_per_env:
            # Episodes are not finished yet: no reward and no evaluation.
            rews = [0 for _ in self.action_buffers]
            dones = [False for _ in self.action_buffers]
            infos = [{} for _ in self.action_buffers]
        else:
            # Waiting for results from balsam: await_evals returns a generator
            # immediately, and iterating it blocks until the rewards are ready.
            results = self.evaluator.await_evals(self.eval_uids)
            rews = [rew for cfg, rew in results]

            self.stats['batch_computation'] = time.time() - \
                self.stats['batch_computation']
            self.stats['num_cache_used'] = self.evaluator.stats[
                'num_cache_used']
            self.stats['rank'] = MPI.COMM_WORLD.Get_rank(
            ) if MPI is not None else 0

            dones = [True for _ in rews]
            # One info dict per environment, following the VecEnv convention.
            infos = [{
                'episode': {
                    'r': r,
                    'l': self.num_actions_per_env
                }
            } for r in rews]  # TODO

            self.stats['rewards'] = rews
            self.stats['arch_seq'] = self.action_buffers

            dhlogger.info(jm(type='env_stats', **self.stats))

            self.reset()

        return np.stack(obs), np.array(rews), np.array(dones), infos
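step_wait returns the usual vectorized-environment tuple: stacked observations plus one reward, done flag, and info dict per environment. A hedged sketch of how a caller might consume it (the concrete values below are invented for illustration):

import numpy as np

# Placeholder values standing in for what step_wait() returns for 3 environments.
obs = np.stack([np.array([0.4]), np.array([0.7]), np.array([0.1])])
rews = np.array([0.91, 0.88, 0.95])
dones = np.array([True, True, True])
infos = [{'episode': {'r': float(r), 'l': 5}} for r in rews]

for env_idx, (done, info) in enumerate(zip(dones, infos)):
    if done:
        episode = info['episode']
        print(f"env {env_idx}: return={episode['r']:.2f} length={episode['l']}")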
Example no. 8
def learn(
    env,
    policy_fn,
    *,
    timesteps_per_actorbatch,  # timesteps per actor per update
    clip_param,  # clipping parameter epsilon
    entcoeff,  # entropy coefficient
    optim_epochs,
    optim_stepsize,
    optim_batchsize,  # optimization hyperparameters
    gamma,
    lam,  # advantage estimation
    max_timesteps=0,
    max_episodes=0,
    max_iters=0,
    max_seconds=0,  # time constraint
    callback=None,  # you can do anything in the callback, since it takes locals(), globals()
    adam_epsilon=1e-5,
    schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
    reward_rule=reward_for_final_timestep):

    rank = MPI.COMM_WORLD.Get_rank()

    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space,
                   ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                             1.0 + clip_param) * atarg  # clipped surrogate
    pol_surr = -tf.reduce_mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdamAsync(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()

    t1 = time.time()
    ##
    adam.sync()
    ##
    t2 = time.time()
    t = t2 - t1
    dh_logger.info(
        jm(type='adam.sync', rank=rank, duration=t, start_time=t1,
           end_time=t2))

    if rank == 0:  # ZERO is the parameter server
        while True:
            t1 = time.time()
            ## BEGIN - TIMING ##
            rank_worker_source = adam.master_update()
            ## END - TIMING ##
            t2 = time.time()
            t = t2 - t1
            dh_logger.info(
                jm(type='adam.master_update',
                   rank=rank,
                   duration=t,
                   rank_worker_source=rank_worker_source,
                   start_time=t1,
                   end_time=t2))
    else:
        # Prepare for rollouts
        # ----------------------------------------

        seg_gen = traj_segment_generator(pi,
                                         env,
                                         timesteps_per_actorbatch,
                                         stochastic=True,
                                         reward_affect_func=reward_rule)

        episodes_so_far = 0
        timesteps_so_far = 0
        iters_so_far = 0
        tstart = time.time()
        lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
        rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

        cond = sum([
            max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0
        ])
        assert cond == 1, f"Only one time constraint permitted: cond={cond}, max_iters={max_iters}, max_timesteps={max_timesteps}, max_episodes={max_episodes}, max_seconds={max_seconds}"

        while True:
            if callback: callback(locals(), globals())
            if max_timesteps and timesteps_so_far >= max_timesteps:
                break
            elif max_episodes and episodes_so_far >= max_episodes:
                break
            elif max_iters and iters_so_far >= max_iters:
                break
            elif max_seconds and time.time() - tstart >= max_seconds:
                break

            if schedule == 'constant':
                cur_lrmult = 1.0
            elif schedule == 'linear':
                cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps,
                                 0)
            else:
                raise NotImplementedError

            logger.log("********** Iteration %i ************" % iters_so_far)

            t1 = time.time()
            ## BEGIN - TIMING ##
            seg = seg_gen.__next__()
            ## END - TIMING ##
            t2 = time.time()
            t = t2 - t1
            dh_logger.info(
                jm(type='batch_computation',
                   rank=rank,
                   duration=t,
                   start_time=t1,
                   end_time=t2))
            dh_logger.info(jm(type='seg', rank=rank, **seg))

            add_vtarg_and_adv(seg, gamma, lam)

            # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
            ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
                "tdlamret"]
            vpredbefore = seg["vpred"]  # predicted value function before update
            atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage estimate
            d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                        shuffle=not pi.recurrent)
            # optim_batchsize = optim_batchsize or ob.shape[0]
            optim_batchsize = ob.shape[0]

            if hasattr(pi, "ob_rms"):
                pi.ob_rms.update(ob)  # update running mean/std for policy

            assign_old_eq_new()  # set old parameter values to new parameter values
            dh_logger.info(f"Rank={rank}: Optimizing...")

            # Here we do a bunch of optimization epochs over the data
            for _ in range(optim_epochs):
                losses = []  # list of tuples, each giving the loss for a minibatch
                for batch in d.iterate_once(optim_batchsize):
                    *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                                batch["atarg"], batch["vtarg"],
                                                cur_lrmult)

                    t1 = time.time()
                    ## BEGIN - TIMING ##
                    adam.worker_update(g, optim_stepsize * cur_lrmult)
                    ## END - TIMING ##
                    t2 = time.time()
                    t = t2 - t1
                    dh_logger.info(
                        jm(type='adam.worker_update',
                           rank=rank,
                           duration=t,
                           start_time=t1,
                           end_time=t2))

                    losses.append(newlosses)

            dh_logger.info(f"Rank={rank}: Evaluating losses...")
            losses = []
            for batch in d.iterate_once(optim_batchsize):
                newlosses = compute_losses(batch["ob"], batch["ac"],
                                           batch["atarg"], batch["vtarg"],
                                           cur_lrmult)
                losses.append(newlosses)
            meanlosses, _, _ = mpi_moments(losses, axis=0, use_mpi=False)

            lens = seg["ep_lens"]
            rews = seg["ep_rets"]

            episodes_so_far += len(lens)
            timesteps_so_far += sum(lens)
            iters_so_far += 1

        return pi
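For reference, pol_surr above is the negated clipped surrogate objective of PPO, with the probability ratio computed as ratio and the standardized advantage as atarg:

L^{CLIP}(\theta) = \hat{\mathbb{E}}_t\left[\min\left(r_t(\theta)\,\hat{A}_t,\ \operatorname{clip}\big(r_t(\theta),\,1-\epsilon,\,1+\epsilon\big)\,\hat{A}_t\right)\right],
\qquad r_t(\theta) = \frac{\pi_\theta(a_t \mid s_t)}{\pi_{\theta_{\text{old}}}(a_t \mid s_t)}

total_loss then adds the entropy penalty pol_entpen and the value-function loss vf_loss to this surrogate before the gradient is computed.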