Example #1
 def __init__(self, env, monitor_path, video=True, **usercfg):
     super(KarpathyCNN, self).__init__(**usercfg)
     self.env = wrappers.Monitor(env,
                                 monitor_path,
                                 force=True,
                                 video_callable=(None if video else False))
     self.nA = env.action_space.n
     self.monitor_path = monitor_path
     # Default configuration. Can be overwritten using keyword arguments.
     self.config.update(
         dict(
             # timesteps_per_batch=10000,
             # n_iter=100,
             n_hidden_units=200,
             learning_rate=1e-3,
             batch_size=10,  # Number of episodes after which to adapt gradients
             gamma=0.99,  # Discount past rewards by a percentage
             decay=0.99,  # Decay of RMSProp optimizer
             epsilon=1e-9,  # Epsilon of RMSProp optimizer
             draw_frequency=50  # Draw a plot every 50 episodes
         ))
     self.config.update(usercfg)
     self.build_network()
     if self.config["save_model"]:
         tf.add_to_collection("action", self.action)
         tf.add_to_collection("states", self.states)
         self.saver = FastSaver()
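A minimal usage sketch for the constructor above; the import path, environment id and the save_model keyword are assumptions rather than part of the snippet:
    # Hypothetical usage; module path and env id are assumptions.
    import gym
    from yarll.agents.karpathy_cnn import KarpathyCNN  # assumed module path

    env = gym.make("Pong-v0")
    agent = KarpathyCNN(env, "/tmp/pong_monitor", video=False,
                        learning_rate=5e-4,  # overrides the default 1e-3 above
                        save_model=False)    # read later via self.config["save_model"]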
Example #2
    def __init__(self, envs, monitor_path, **usercfg):
        super(KnowledgeTransfer, self).__init__(**usercfg)
        self.envs = envs
        self.n_tasks = len(envs)
        self.monitor_path = monitor_path
        self.nA = envs[0].action_space.n
        self.config.update(
            dict(
                timesteps_per_batch=10000,
                trajectories_per_batch=10,
                batch_update="timesteps",
                n_iter=100,
                switch_at_iter=None,
                gamma=0.99,  # Discount past rewards by a percentage
                decay=0.9,  # Decay of RMSProp optimizer
                epsilon=1e-9,  # Epsilon of RMSProp optimizer
                learning_rate=0.005,
                n_hidden_units=10,
                repeat_n_actions=1,
                n_sparse_units=10,
                feature_extraction=False))
        self.config.update(usercfg)

        self.build_networks()
        self.task_runners = [
            EnvRunner(envs[i], TaskPolicy(action, self), self.config)
            for i, action in enumerate(self.action_tensors)
        ]
        if self.config["save_model"]:
            for action_tensor in self.action_tensors:
                tf.add_to_collection("action", action_tensor)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()
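A sketch of how this multi-task constructor could be called, assuming several environments that share the same discrete action space (import path and env ids are assumptions):
    # Hypothetical usage; module path and env ids are assumptions.
    import gym
    from yarll.agents.knowledge_transfer import KnowledgeTransfer  # assumed module path

    envs = [gym.make("CartPole-v0") for _ in range(3)]  # same action space for every task
    agent = KnowledgeTransfer(envs, "/tmp/kt_monitor",
                              switch_at_iter=50,  # overrides the default of None
                              save_model=False)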
Example #3
    def __init__(self,
                 env_id: str,
                 task_id: int,
                 cluster: tf.train.ClusterDef,
                 monitor_path: str,
                 config: dict,
                 clip_gradients: bool = True,
                 video: bool = False,
                 seed: Optional[int] = None) -> None:
        super(A3CTask, self).__init__()
        self.task_id = task_id
        self.config = config
        self.clip_gradients = clip_gradients
        self.env = make(env_id)
        self.env.seed(seed)
        if task_id == 0:
            self.env = wrappers.Monitor(
                self.env,
                monitor_path,
                force=True,
                video_callable=(None if video else False)
            )

        # Only used (and overwritten) by agents that use an RNN
        self.initial_features = None

        worker_device = "/job:worker/task:{}/cpu:0".format(task_id)
        # Global network
        shared_device = tf.train.replica_device_setter(
            ps_tasks=1,
            worker_device=worker_device,
            cluster=cluster)
        with tf.device(shared_device):
            with tf.variable_scope("global"):
                self.global_network = self.build_networks()
                self.global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)
                self._global_step = tf.get_variable(
                    "global_step",
                    [],
                    tf.int32,
                    initializer=tf.constant_initializer(0, dtype=tf.int32),
                    trainable=False)

        # Local network
        with tf.device(worker_device):
            with tf.variable_scope("local"):
                self.local_network = self.build_networks()
                self.states = self.local_network.states
                self.actions_taken = self.local_network.actions_taken
                self.advantage = tf.placeholder(tf.float32, [None], name="advantage")
                self.ret = tf.placeholder(tf.float32, [None], name="return")
                self.actor_loss, self.critic_loss, self.loss = self.make_loss()
                self.local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)
                self.sync_net = create_sync_net_op(self.global_vars, self.local_vars)
                self.n_steps = tf.shape(self.local_network.states)[0]
                inc_step = self._global_step.assign_add(self.n_steps)

        device = shared_device if self.config["shared_optimizer"] else worker_device
        with tf.device(device):
            apply_optim_op = self.make_trainer()
            self.train_op = tf.group(apply_optim_op, inc_step)

            loss_summaries = self.create_summary_losses()
            self.reward = tf.placeholder("float", name="reward")
            tf.summary.scalar("Reward", self.reward)
            self.episode_length = tf.placeholder("float", name="episode_length")
            tf.summary.scalar("Episode_length", self.episode_length)
            self.summary_op = tf.summary.merge(loss_summaries)

        variables_to_save = [v for v in tf.global_variables() if not v.name.startswith("local")]
        init_op = tf.variables_initializer(variables_to_save)
        init_all_op = tf.global_variables_initializer()
        saver = FastSaver(variables_to_save)
        # Write the summary of each task in a different directory
        self.writer = tf.summary.FileWriter(os.path.join(monitor_path, "task{}".format(task_id)))

        self.runner = RunnerThread(self.env, self, int(self.config["n_local_steps"]), task_id == 0 and video)

        self.server = tf.train.Server(
            cluster,
            job_name="worker",
            task_index=task_id,
            config=tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=2)
        )

        def init_fn(scaffold, sess):
            sess.run(init_all_op)

        self.report_uninit_op = tf.report_uninitialized_variables(variables_to_save)

        self.scaffold = tf.train.Scaffold(
            init_op=init_op,
            init_fn=init_fn,
            ready_for_local_init_op=self.report_uninit_op,
            saver=saver,
            ready_op=self.report_uninit_op
        )

        self.config_proto = tf.ConfigProto(device_filters=["/job:ps", "/job:worker/task:{}/cpu:0".format(task_id)])

        self.session = None
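The A3C task above needs a tf.train.ClusterDef and a config dict that provides at least the shared_optimizer and n_local_steps keys it reads; a sketch of wiring one worker, with the cluster layout and env id as assumptions:
    # Hypothetical wiring of a single worker task; addresses and env id are assumptions.
    import tensorflow as tf

    cluster = tf.train.ClusterSpec({
        "ps": ["localhost:2222"],
        "worker": ["localhost:2223", "localhost:2224"],
    }).as_cluster_def()
    task = A3CTask("CartPole-v0", task_id=0, cluster=cluster,
                   monitor_path="/tmp/a3c_monitor",
                   config={"shared_optimizer": True, "n_local_steps": 20},
                   video=False, seed=42)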
Example #4
    def __init__(self,
                 env,
                 monitor_path: str,
                 monitor: bool = False,
                 video: bool = True,
                 **usercfg) -> None:
        super(REINFORCE, self).__init__(**usercfg)
        self.env = env
        if monitor:
            self.env = wrappers.Monitor(
                self.env,
                monitor_path,
                force=True,
                video_callable=(None if video else False))
        self.monitor_path = monitor_path
        # Default configuration. Can be overwritten using keyword arguments.
        self.config.update(
            dict(
                batch_update="timesteps",
                timesteps_per_batch=1000,
                n_iter=100,
                gamma=0.99,  # Discount past rewards by a percentage
                learning_rate=0.05,
                entropy_coef=1e-3,
                n_hidden_layers=2,
                n_hidden_units=20,
                repeat_n_actions=1,
                save_model=False))
        self.config.update(usercfg)

        self.states = tf.placeholder(tf.float32, [None] +
                                     list(self.env.observation_space.shape),
                                     name="states")  # Observation
        self.actions_taken = tf.placeholder(
            tf.float32, name="actions_taken")  # Discrete action
        self.advantage = tf.placeholder(tf.float32,
                                        name="advantage")  # Advantage
        self.build_network()
        self.make_trainer()

        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()
        summary_loss = tf.summary.scalar("model/loss", self.summary_loss)
        summaries = [summary_loss]
        if hasattr(self, "entropy"):
            summary_entropy = tf.summary.scalar("model/entropy", self.entropy)
            summaries += [summary_entropy]
        self.summary_op = tf.summary.merge(summaries)

        self.init_op = tf.global_variables_initializer()
        # Launch the graph.
        num_cpu = multiprocessing.cpu_count()
        tf_config = tf.ConfigProto(allow_soft_placement=True,
                                   inter_op_parallelism_threads=num_cpu,
                                   intra_op_parallelism_threads=num_cpu)
        self.session = tf.Session(config=tf_config)
        self.writer = tf.summary.FileWriter(
            os.path.join(self.monitor_path, "task0"), self.session.graph)

        self.env_runner = EnvRunner(self.env,
                                    self,
                                    usercfg,
                                    summary_writer=self.writer)
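An instantiation sketch for the REINFORCE constructor above; keyword arguments pass through **usercfg and override the defaults in self.config (import path and env id are assumptions):
    # Hypothetical usage; module path and env id are assumptions.
    import gym
    from yarll.agents.reinforce import REINFORCE  # assumed module path

    agent = REINFORCE(gym.make("CartPole-v0"), "/tmp/reinforce_monitor",
                      monitor=False,
                      n_hidden_units=64,   # overrides the default of 20
                      learning_rate=0.01)  # overrides the default of 0.05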
Example #5
    def __init__(self, env, monitor_path: str, video: bool = True, **usercfg) -> None:
        super(A2C, self).__init__(**usercfg)
        self.monitor_path = monitor_path

        self.env = wrappers.Monitor(
            env,
            monitor_path,
            force=True,
            video_callable=(None if video else False))

        self.config.update(dict(
            n_iter=100,
            gamma=0.99,
            learning_rate=0.001,
            n_hidden_units=20,
            n_hidden_layers=1,
            gradient_clip_value=0.5,
            n_local_steps=20,
            vf_coef=0.5,
            entropy_coef=0.01,
            loss_reducer="mean",
            save_model=False
        ))
        self.config.update(usercfg)
        # Only used (and overwritten) by agents that use an RNN
        self.initial_features = None
        self.ac_net = None  # Overwritten by build_networks
        self.build_networks()

        self.action = self.ac_net.action
        self.states = self.ac_net.states
        self.actions_taken = self.ac_net.actions_taken
        self.advantage = tf.placeholder(tf.float32, [None], name="advantage")
        self.ret = tf.placeholder(tf.float32, [None], name="return")

        self.actor_loss, self.critic_loss, self.loss = self.make_loss()

        self.vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)

        self._global_step = tf.get_variable(
            "global_step",
            [],
            tf.int32,
            initializer=tf.constant_initializer(0, dtype=tf.int32),
            trainable=False)

        self.optimizer = tf.train.AdamOptimizer(
            self.config["learning_rate"], name="optim")
        grads = tf.gradients(self.loss, self.vars)
        grads, _ = tf.clip_by_global_norm(
            grads, self.config["gradient_clip_value"])

        # Apply gradients to the weights of the master network
        apply_grads = self.optimizer.apply_gradients(zip(grads, self.vars))

        self.n_steps = tf.shape(self.states)[0]
        inc_step = self._global_step.assign_add(self.n_steps)
        self.train_op = tf.group(apply_grads, inc_step)

        self.init_op = tf.global_variables_initializer()
        # Launch the graph.
        num_cpu = multiprocessing.cpu_count()
        tf_config = tf.ConfigProto(
            allow_soft_placement=True,
            inter_op_parallelism_threads=num_cpu,
            intra_op_parallelism_threads=num_cpu)
        self.session = tf.Session(config=tf_config)
        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()
        n_steps = tf.to_float(self.n_steps)
        actor_loss_summary = tf.summary.scalar("model/actor_loss", tf.squeeze(self.actor_loss / n_steps))
        critic_loss_summary = tf.summary.scalar("model/critic_loss", tf.squeeze(self.critic_loss / n_steps))
        loss_summary = tf.summary.scalar("model/loss", tf.squeeze(self.loss / n_steps))
        self.loss_summary_op = tf.summary.merge(
            [actor_loss_summary, critic_loss_summary, loss_summary])
        self.writer = tf.summary.FileWriter(os.path.join(
            self.monitor_path, "summaries"), self.session.graph)
        self.env_runner = EnvRunner(self.env, self, usercfg, summary_writer=self.writer)
        return
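A sketch of constructing the A2C agent above; unlike REINFORCE it always wraps the environment in a Monitor, so only monitor_path and video need to be chosen (import path and env id are assumptions):
    # Hypothetical usage; module path and env id are assumptions.
    import gym
    from yarll.agents.a2c import A2C  # assumed module path

    agent = A2C(gym.make("CartPole-v0"), "/tmp/a2c_monitor", video=False,
                n_local_steps=5,      # overrides the default of 20
                entropy_coef=0.001)   # overrides the default of 0.01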
Example #6
    def __init__(self, env, monitor_path, **usercfg):
        super(DPPO, self).__init__()
        self.env = env
        self.env_name: str = env.spec.id
        self.monitor_path: str = monitor_path

        self.comm = MPI.COMM_SELF

        self.config.update(
            dict(
                n_workers=3,
                n_hidden_units=20,
                n_hidden_layers=2,
                gamma=0.99,
                gae_lambda=0.95,
                learning_rate=2.5e-4,
                n_iter=10000,
                n_epochs=4,
                n_local_steps=128,
                gradient_clip_value=0.5,
                vf_coef=0.5,
                entropy_coef=0.01,
                cso_epsilon=0.1,  # Clipped surrogate objective epsilon
                learn_method="batches",
                batch_size=64,
                save_model=False))
        self.config.update(usercfg)

        self.task_type = None  # To be filled in by subclasses

        self.n_updates: int = 0

        with tf.variable_scope("new_network"):
            self.new_network = self.build_networks()
            if self.RNN:
                self.initial_features = self.new_network.state_init
            else:
                self.initial_features = None
            self.new_network_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES,
                tf.get_variable_scope().name)
        self._global_step = tf.get_variable(
            "global_step", [],
            tf.int32,
            initializer=tf.constant_initializer(0, dtype=tf.int32),
            trainable=False)
        self.action = self.new_network.action
        self.value = self.new_network.value
        self.states = self.new_network.states
        self.actions_taken = self.new_network.actions_taken
        self.advantage = tf.placeholder(tf.float32, [None], name="advantage")
        self.ret = tf.placeholder(tf.float32, [None], name="return")

        with tf.variable_scope("old_network"):
            self.old_network = self.build_networks()
            self.old_network_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES,
                tf.get_variable_scope().name)

        self.set_old_to_new = tf.group(*[
            v1.assign(v2)
            for v1, v2 in zip(self.old_network_vars, self.new_network_vars)
        ])

        # Reduces by taking the mean instead of summing
        self.actor_loss = -tf.reduce_mean(
            self.make_actor_loss(self.old_network, self.new_network,
                                 self.advantage))
        self.critic_loss = tf.reduce_mean(tf.square(self.value - self.ret))
        self.mean_entropy = tf.reduce_mean(self.new_network.entropy)
        self.loss = self.actor_loss + self.config["vf_coef"] * self.critic_loss + \
            self.config["entropy_coef"] * self.mean_entropy

        grads = tf.gradients(self.loss, self.new_network_vars)

        self.n_steps = tf.shape(self.states)[0]
        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()
        summary_actor_loss = tf.summary.scalar("model/Actor_loss",
                                               self.actor_loss)
        summary_critic_loss = tf.summary.scalar("model/Critic_loss",
                                                self.critic_loss)
        summary_loss = tf.summary.scalar("model/Loss", self.loss)
        summary_entropy = tf.summary.scalar("model/Entropy",
                                            -self.mean_entropy)
        summary_grad_norm = tf.summary.scalar("model/grad_global_norm",
                                              tf.global_norm(grads))
        summary_var_norm = tf.summary.scalar(
            "model/var_global_norm", tf.global_norm(self.new_network_vars))
        self.model_summary_op = tf.summary.merge([
            summary_actor_loss, summary_critic_loss, summary_loss,
            summary_entropy, summary_grad_norm, summary_var_norm
        ])
        self.writer = tf.summary.FileWriter(
            os.path.join(self.monitor_path, "master"))

        # grads before clipping were passed to the summary, now clip and apply them
        if self.config["gradient_clip_value"] is not None:
            grads, _ = tf.clip_by_global_norm(
                grads, self.config["gradient_clip_value"])

        with tf.variable_scope("optimizer"):
            self.optimizer = tf.train.AdamOptimizer(
                self.config["learning_rate"], name="optim")
            apply_grads = self.optimizer.apply_gradients(
                zip(grads, self.new_network_vars))

            inc_step = self._global_step.assign_add(self.n_steps)
            self.train_op = tf.group(apply_grads, inc_step)
        optimizer_variables = [
            var for var in tf.global_variables()
            if var.name.startswith("optimizer")
        ]
        self.init_op = tf.variables_initializer(self.new_network_vars +
                                                optimizer_variables +
                                                [self._global_step])
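DPPO above relies on attributes such as RNN and build_networks that a concrete subclass is expected to provide; under that assumption, a call would look roughly like this (import path and env id are assumptions):
    # Hypothetical usage; in practice a subclass supplying RNN/build_networks is instantiated.
    import gym
    from yarll.agents.dppo import DPPO  # assumed module path

    agent = DPPO(gym.make("CartPole-v0"), "/tmp/dppo_monitor",
                 n_workers=4,             # overrides the default of 3
                 learn_method="batches",
                 n_local_steps=128)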
Example #7
File: ppo.py Project: rkc007/yarll
    def __init__(self,
                 env,
                 monitor_path: str,
                 monitor: bool = False,
                 video: bool = False,
                 **usercfg) -> None:
        super(PPO, self).__init__(**usercfg)
        self.monitor_path: str = monitor_path
        self.env = env
        if monitor:
            self.env = wrappers.Monitor(
                self.env,
                monitor_path,
                force=True,
                video_callable=(None if video else False))

        self.config.update(
            dict(
                n_hidden_units=20,
                n_hidden_layers=2,
                gamma=0.99,
                gae_lambda=0.95,
                learning_rate=0.001,
                n_epochs=10,
                n_iter=10000,
                batch_size=64,  # Timesteps per training batch
                n_local_steps=256,
                normalize_states=False,
                gradient_clip_value=None,
                adam_epsilon=1e-5,
                vf_coef=0.5,
                entropy_coef=0.01,
                cso_epsilon=0.2,  # Clipped surrogate objective epsilon
                save_model=False))
        self.config.update(usercfg)

        with tf.variable_scope("old_network"):
            self.old_network = self.build_networks()
            self.old_network_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES,
                tf.get_variable_scope().name)

        with tf.variable_scope("new_network"):
            self.new_network = self.build_networks()
            if self.RNN:
                self.initial_features = self.new_network.state_init
            else:
                self.initial_features = None
            self.new_network_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES,
                tf.get_variable_scope().name)
        self.action = self.new_network.action
        self.value = self.new_network.value
        self.states = self.new_network.states
        self.actions_taken = self.new_network.actions_taken
        self.advantage = tf.placeholder(tf.float32, [None], name="advantage")
        self.ret = tf.placeholder(tf.float32, [None], name="return")

        self.set_old_to_new = tf.group(*[
            v1.assign(v2)
            for v1, v2 in zip(self.old_network_vars, self.new_network_vars)
        ])

        self.actor_loss = -tf.reduce_mean(
            self.make_actor_loss(self.old_network, self.new_network,
                                 self.advantage))
        self.critic_loss = tf.reduce_mean(tf.square(self.value - self.ret))
        self.mean_entropy = tf.reduce_mean(self.new_network.entropy)
        self.loss = self.actor_loss + self.config["vf_coef"] * self.critic_loss + \
            self.config["entropy_coef"] * self.mean_entropy

        grads = tf.gradients(self.loss, self.new_network_vars)

        self._global_step = tf.get_variable(
            "global_step", [],
            tf.int32,
            initializer=tf.constant_initializer(0, dtype=tf.int32),
            trainable=False)

        self.n_steps = tf.shape(self.states)[0]
        num_cpu = multiprocessing.cpu_count()
        tf_config = tf.ConfigProto(allow_soft_placement=True,
                                   inter_op_parallelism_threads=num_cpu,
                                   intra_op_parallelism_threads=num_cpu)
        self.session = tf.Session(config=tf_config)
        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()

        summary_actor_loss = tf.summary.scalar("model/Actor_loss",
                                               self.actor_loss)
        summary_critic_loss = tf.summary.scalar("model/Critic_loss",
                                                self.critic_loss)
        summary_loss = tf.summary.scalar("model/Loss", self.loss)

        adv_mean, adv_std = tf.nn.moments(self.advantage, axes=[0])
        summary_adv_mean = tf.summary.scalar("model/advantage/mean", adv_mean)
        summary_adv_std = tf.summary.scalar("model/advantage/std",
                                            tf.sqrt(adv_std))

        # TODO: get from ppo_loss function
        # ratio_mean, ratio_std = tf.nn.moments(ratio, axes=[0])
        # summary_ratio_mean = tf.summary.scalar("model/ratio/mean", ratio_mean)
        # summary_ratio_std = tf.summary.scalar("model/ratio/std", ratio_std)

        summary_new_log_prob_mean = tf.summary.scalar(
            "model/new_log_prob/mean",
            tf.reduce_mean(self.new_network.action_log_prob))
        summary_old_log_prob_mean = tf.summary.scalar(
            "model/old_log_prob/mean",
            tf.reduce_mean(self.old_network.action_log_prob))

        ret_mean, ret_std = tf.nn.moments(self.ret, axes=[0])
        summary_ret_mean = tf.summary.scalar("model/return/mean", ret_mean)
        summary_ret_std = tf.summary.scalar("model/return/std",
                                            tf.sqrt(ret_std))
        summary_entropy = tf.summary.scalar("model/entropy",
                                            -self.mean_entropy)
        summary_grad_norm = tf.summary.scalar("model/grad_global_norm",
                                              tf.global_norm(grads))
        summary_var_norm = tf.summary.scalar(
            "model/var_global_norm", tf.global_norm(self.new_network_vars))
        summaries: List[tf.Tensor] = []
        # Weight summaries: not turned on right now because they take too much space
        # TODO: use config to make this optional
        #for v in tf.trainable_variables():
        #    if "new_network" in v.name:
        #        summaries.append(tf.summary.histogram(v.name, v))
        summaries += self._specific_summaries()
        summaries += [
            summary_actor_loss,
            summary_critic_loss,
            summary_loss,
            summary_adv_mean,
            summary_adv_std,
            # summary_ratio_mean, summary_ratio_std,
            summary_new_log_prob_mean,
            summary_old_log_prob_mean,
            summary_ret_mean,
            summary_ret_std,
            summary_entropy,
            summary_grad_norm,
            summary_var_norm
        ]
        self.model_summary_op = tf.summary.merge(summaries)
        self.writer = tf.summary.FileWriter(
            os.path.join(self.monitor_path, "summaries"), self.session.graph)
        self.env_runner = EnvRunner(
            self.env,
            self,
            usercfg,
            normalize_states=self.config["normalize_states"],
            summary_writer=self.writer)

        # grads before clipping were passed to the summary, now clip and apply them
        if self.config["gradient_clip_value"] is not None:
            grads, _ = tf.clip_by_global_norm(
                grads, self.config["gradient_clip_value"])
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate=self.config["learning_rate"],
            epsilon=self.config["adam_epsilon"],
            name="optim")
        apply_grads = self.optimizer.apply_gradients(
            zip(grads, self.new_network_vars))

        inc_step = self._global_step.assign_add(self.n_steps)
        self.train_op = tf.group(apply_grads, inc_step)

        self.init_op = tf.global_variables_initializer()
        return
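An instantiation sketch for the PPO constructor above; the build_networks implementation is assumed to come from a concrete subclass, and the import path and env id are assumptions as well:
    # Hypothetical usage; module path and env id are assumptions.
    import gym
    from yarll.agents.ppo import PPO  # assumed module path (the File line above names ppo.py)

    agent = PPO(gym.make("CartPole-v0"), "/tmp/ppo_monitor",
                normalize_states=True,    # overrides the default of False
                gradient_clip_value=0.5,  # default is None (no clipping)
                cso_epsilon=0.1)          # overrides the default of 0.2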
Example #8
    def __init__(self,
                 envs,
                 monitor_path,
                 learning_method="REINFORCE",
                 **usercfg):
        super(AsyncKnowledgeTransfer, self).__init__(**usercfg)
        self.envs = envs
        self.learning_method = learning_method
        self.monitor_path = monitor_path
        self.config.update(
            dict(
                timesteps_per_batch=10000,
                trajectories_per_batch=10,
                batch_update="timesteps",
                n_iter=200,
                switch_at_iter=None,  # None to deactivate, otherwise the iteration at which to switch
                gamma=0.99,  # Discount past rewards by a percentage
                decay=0.9,  # Decay of RMSProp optimizer
                epsilon=1e-9,  # Epsilon of RMSProp optimizer
                learning_rate=0.005,
                n_hidden_units=10,
                repeat_n_actions=1,
                n_task_variations=3,
                n_sparse_units=10,
                feature_extraction=False))
        self.config.update(usercfg)

        self.stop_requested = False

        self.session = tf.Session(config=tf.ConfigProto(
            log_device_placement=False, allow_soft_placement=True))

        self.global_step = tf.get_variable("global_step", [],
                                           tf.int32,
                                           initializer=tf.constant_initializer(
                                               0, dtype=tf.int32),
                                           trainable=False)
        self.build_networks()

        self.loss = tf.placeholder("float", name="loss")
        summary_loss = tf.summary.scalar("Loss", self.loss)
        self.reward = tf.placeholder("float", name="reward")
        summary_rewards = tf.summary.scalar("Reward", self.reward)
        self.episode_length = tf.placeholder("float", name="episode_length")
        summary_episode_lengths = tf.summary.scalar("Episode_length",
                                                    self.episode_length)
        self.summary_op = tf.summary.merge(
            [summary_loss, summary_rewards, summary_episode_lengths])

        self.jobs = []
        for i, env in enumerate(self.envs):
            self.jobs.append(
                self.make_thread(
                    env,
                    i,
                    self.config["switch_at_iter"]
                    if self.config["switch_at_iter"] is not None
                    and i != len(self.envs) - 1 else self.config["n_iter"],
                    start_at_iter=(0 if self.config["switch_at_iter"] is None
                                   or i != len(self.envs) - 1 else
                                   self.config["switch_at_iter"])))

        for i, job in enumerate(self.jobs):
            only_sparse = (self.config["switch_at_iter"] is not None
                           and i == len(self.jobs) - 1)
            grads = tf.gradients(
                job.loss, (self.shared_vars if not (only_sparse) else []) +
                [job.sparse_representation])
            job.apply_grad = job.optimizer.apply_gradients(
                zip(grads, (self.shared_vars if not (only_sparse) else []) +
                    [job.sparse_representation]),
                global_step=self.global_step)

        self.session.run(tf.global_variables_initializer())

        if self.config["save_model"]:
            for job in self.jobs:
                tf.add_to_collection("action", job.action)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()
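A sketch of calling the asynchronous variant above, assuming a list of related environments (import path and env ids are assumptions):
    # Hypothetical usage; module path and env ids are assumptions.
    import gym
    from yarll.agents.async_knowledge_transfer import AsyncKnowledgeTransfer  # assumed module path

    envs = [gym.make("CartPole-v0") for _ in range(3)]
    agent = AsyncKnowledgeTransfer(envs, "/tmp/akt_monitor",
                                   learning_method="REINFORCE",
                                   switch_at_iter=100,  # the default None deactivates switching
                                   save_model=False)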