def __init__(self, env, args):
    # self.video_dir = os.path.join(summary_writer.get_logdir(), "../videos")
    # self.video_dir = os.path.abspath(args.video_dir)
    self.args = args
    self.env = env
    self.summary_writer = None

    # define environment
    ob_space = env.observation_space.shape
    ac_space = env.action_space.n

    worker_device = "/job:worker/task:{}/cpu:0".format(args.task)
    with tf.device(tf.train.replica_device_setter(1, worker_device=worker_device)):
        with tf.variable_scope("global"):
            self.network = LSTMPolicy(ob_space, ac_space)
            self.global_step = tf.get_variable(
                "global_step", [], tf.int32,
                initializer=tf.constant_initializer(0, dtype=tf.int32),
                trainable=False)

    with tf.device(worker_device):
        with tf.variable_scope("local"):
            self.policy = pi = LSTMPolicy(ob_space, ac_space)
            pi.global_step = self.global_step

    # copy weights from the parameter server to the local model
    self.sync = tf.group(*[v1.assign(v2)
                           for v1, v2 in zip(pi.var_list, self.network.var_list)])
def main(env, snapshot, visualise):
    env = create_env(env, client_id=0, remotes=1)
    with tf.variable_scope("global"):
        policy = LSTMPolicy(env.observation_space.shape, env.action_space.n)

    last_state = env.reset()
    last_features = policy.get_initial_features()
    length = 0
    rewards = 0

    # Variables whose names start with "local" are not stored in the checkpoint.
    variables_to_save = [
        v for v in tf.global_variables() if not v.name.startswith("local")
    ]
    saver = tf.train.Saver(variables_to_save)

    with tf.Session() as sess:
        # Restore variables from disk, e.g. "train/model.ckpt-361814".
        saver.restore(sess, snapshot)
        while True:
            fetched = policy.act(last_state, *last_features)
            action, value_, features = fetched[0], fetched[1], fetched[2:]
            action_n = action.argmax()
            state, reward, terminal, info = env.step(action_n)
            if visualise:
                env.render()  # visualise the environment during testing
            print('length: %d, rewards: %f' % (length, rewards))
            length += 1
            rewards += reward
            last_state = state
            last_features = features
            if terminal:
                print("Episode finished. Sum of rewards: %d. Length: %d" % (rewards, length))
                length = 0
                rewards = 0
                break
def __init__(self, env, task, visualise): self.env = env self.task = task worker_device = "/job:worker/task:{}/cpu:0".format(task) with tf.device( tf.train.replica_device_setter(1, worker_device=worker_device)): with tf.variable_scope("global"): self.network = LSTMPolicy(env.observation_space.shape, env.action_space.n) self.global_step = tf.get_variable( "global_step", [], tf.int32, initializer=tf.constant_initializer(0, dtype=tf.int32), trainable=False) with tf.device(worker_device): with tf.variable_scope("local"): self.local_network = pi = LSTMPolicy( env.observation_space.shape, env.action_space.n) pi.global_step = self.global_step
def _build_net(self): self.network = pi = LSTMPolicy(self.obs_shape, self.numaction, self.designHead) with tf.variable_scope("predictor"): self.ap_network = predictor = StateActionPredictor( self.obs_shape, self.numaction, self.designHead) self.ac = tf.placeholder(tf.float32, [None, self.numaction], name="ac") self.adv = tf.placeholder(tf.float32, [None], name="adv") self.r = tf.placeholder(tf.float32, [None], name="r") log_prob_tf = tf.nn.log_softmax(pi.logits) prob_tf = tf.nn.softmax(pi.logits) pi_loss = -tf.reduce_mean( tf.reduce_sum(log_prob_tf * self.ac, 1) * self.adv) vf_loss = 0.5 * tf.reduce_mean(tf.square(pi.vf - self.r)) entropy = -tf.reduce_mean(tf.reduce_sum(prob_tf * log_prob_tf, 1)) self.loss = pi_loss + 0.5 * vf_loss - entropy * constants[ 'ENTROPY_BETA'] # compute gradients grads = tf.gradients(self.loss, pi.var_list) # computing predictor loss self.predloss = constants['PREDICTION_LR_SCALE'] * ( predictor.invloss * (1 - constants['FORWARD_LOSS_WT']) + predictor.forwardloss * constants['FORWARD_LOSS_WT']) predgrads = tf.gradients(self.predloss, predictor.var_list) # clip gradients grads, _ = tf.clip_by_global_norm(grads, constants['GRAD_NORM_CLIP']) grads_and_vars = list(zip(grads, self.network.var_list)) predgrads, _ = tf.clip_by_global_norm(predgrads, constants['GRAD_NORM_CLIP']) pred_grads_and_vars = list(zip(predgrads, self.ap_network.var_list)) grads_and_vars = grads_and_vars + pred_grads_and_vars opt = tf.train.AdamOptimizer(constants['LEARNING_RATE']) self.train_op = tf.group(opt.apply_gradients(grads_and_vars))
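The predictor loss above blends the inverse- and forward-model objectives of the ICM-style StateActionPredictor. As a hedged, standalone NumPy sketch of that weighting and of the curiosity bonus usually derived from the forward-model error (the constant values and the PREDICTION_BETA scale here are illustrative, not taken from this codebase):

import numpy as np

# Hypothetical stand-ins for the entries of the `constants` dict used above.
FORWARD_LOSS_WT = 0.2
PREDICTION_LR_SCALE = 10.0
PREDICTION_BETA = 0.01  # scale of the curiosity bonus (assumption)

def predictor_loss(inv_loss, forward_loss):
    """Blend inverse- and forward-model losses the same way as self.predloss above."""
    return PREDICTION_LR_SCALE * (
        inv_loss * (1 - FORWARD_LOSS_WT) + forward_loss * FORWARD_LOSS_WT)

def curiosity_bonus(phi_next_pred, phi_next):
    """Intrinsic reward ~ half the squared forward-model error in feature space."""
    return PREDICTION_BETA * 0.5 * np.sum((phi_next_pred - phi_next) ** 2)

if __name__ == "__main__":
    phi_next = np.array([0.1, 0.4, -0.2])
    phi_next_pred = np.array([0.0, 0.5, -0.1])
    print(predictor_loss(inv_loss=1.2, forward_loss=0.3))
    print(curiosity_bonus(phi_next_pred, phi_next))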
def __init__(self, env, task): """ An implementation of the A3C algorithm that is reasonably well-tuned for the VNC environments. Below, we will have a modest amount of complexity due to the way TensorFlow handles data parallelism. But overall, we'll define the model, specify its inputs, and describe how the policy gradients step should be computed. """ self.env = env self.task = task worker_device = "/job:worker/task:{}/cpu:0".format(task) with tf.device( tf.train.replica_device_setter(1, worker_device=worker_device)): with tf.variable_scope("global"): self.network = LSTMPolicy(env.observation_space.shape, env.action_space.n) self.global_step = tf.get_variable( "global_step", [], tf.int32, initializer=tf.constant_initializer(0, dtype=tf.int32), trainable=False) with tf.device(worker_device): with tf.variable_scope("local"): self.local_network = pi = LSTMPolicy( env.observation_space.shape, env.action_space.n) pi.global_step = self.global_step self.ac = tf.placeholder(tf.float32, [None, env.action_space.n], name="ac") self.adv = tf.placeholder(tf.float32, [None], name="adv") self.r = tf.placeholder(tf.float32, [None], name="r") log_prob_tf = tf.nn.log_softmax(pi.logits) prob_tf = tf.nn.softmax(pi.logits) # the "policy gradients" loss: its derivative is precisely the policy gradient # notice that self.ac is a placeholder that is provided externally. # adv will contain the advantages, as calculated in process_rollout pi_loss = -tf.reduce_sum( tf.reduce_sum(log_prob_tf * self.ac, [1]) * self.adv) # loss of value function vf_loss = 0.5 * tf.reduce_sum(tf.square(pi.vf - self.r)) entropy = -tf.reduce_sum(prob_tf * log_prob_tf) bs = tf.to_float(tf.shape(pi.x)[0]) self.loss = pi_loss + 0.5 * vf_loss - entropy * 0.01 # 20 represents the number of "local steps": the number of timesteps # we run the policy before we update the parameters. # The larger local steps is, the lower is the variance in our policy gradients estimate # on the one hand; but on the other hand, we get less frequent parameter updates, which # slows down learning. In this code, we found that making local steps be much # smaller than 20 makes the algorithm more difficult to tune and to get to work. 
self.runner = RunnerThread(env, pi, 20) grads = tf.gradients(self.loss, pi.var_list) if use_tf12_api: tf.summary.scalar("model/policy_loss", pi_loss / bs) tf.summary.scalar("model/value_loss", vf_loss / bs) tf.summary.scalar("model/entropy", entropy / bs) tf.summary.image("model/state", pi.x) tf.summary.scalar("model/grad_global_norm", tf.global_norm(grads)) tf.summary.scalar("model/var_global_norm", tf.global_norm(pi.var_list)) self.summary_op = tf.summary.merge_all() else: tf.scalar_summary("model/policy_loss", pi_loss / bs) tf.scalar_summary("model/value_loss", vf_loss / bs) tf.scalar_summary("model/entropy", entropy / bs) tf.image_summary("model/state", pi.x) tf.scalar_summary("model/grad_global_norm", tf.global_norm(grads)) tf.scalar_summary("model/var_global_norm", tf.global_norm(pi.var_list)) self.summary_op = tf.merge_all_summaries() grads, _ = tf.clip_by_global_norm(grads, 40.0) # copy weights from the parameter server to the local model self.sync = tf.group(*[ v1.assign(v2) for v1, v2 in zip(pi.var_list, self.network.var_list) ]) grads_and_vars = list(zip(grads, self.network.var_list)) inc_step = self.global_step.assign_add(tf.shape(pi.x)[0]) # each worker has a different set of adam optimizer parameters opt = tf.train.AdamOptimizer(1e-4) self.train_op = tf.group(opt.apply_gradients(grads_and_vars), inc_step) self.summary_writer = None self.local_steps = 0
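The `adv` and `r` placeholders above are filled by a `process_rollout` helper that is not part of this excerpt. A hedged sketch of what that helper typically computes in universe-starter-agent-style code (discounted n-step returns plus generalized advantage estimation); the function and argument names here are illustrative:

import numpy as np

def discount(x, gamma):
    """Discounted cumulative sum, accumulated from the end of the rollout."""
    out = np.zeros(len(x), dtype=np.float64)
    running = 0.0
    for i in reversed(range(len(x))):
        running = x[i] + gamma * running
        out[i] = running
    return out

def process_rollout(rewards, values, bootstrap_value, gamma=0.99, lambda_=1.0):
    """Return (batch_r, batch_adv) suitable for the `r` and `adv` placeholders."""
    rewards = np.asarray(rewards, dtype=np.float64)
    values = np.asarray(list(values) + [bootstrap_value], dtype=np.float64)
    rewards_plus_v = np.append(rewards, bootstrap_value)
    batch_r = discount(rewards_plus_v, gamma)[:-1]         # n-step returns
    delta_t = rewards + gamma * values[1:] - values[:-1]   # TD residuals
    batch_adv = discount(delta_t, gamma * lambda_)         # GAE(gamma, lambda)
    return batch_r, batch_adv

if __name__ == "__main__":
    r, adv = process_rollout([0.0, 0.0, 1.0], [0.5, 0.6, 0.7], bootstrap_value=0.0)
    print(r, adv)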
def __init__(self, gameState, task): """ An implementation of the A3C algorithm that is reasonably well-tuned for the VNC environments. Below, we will have a modest amount of complexity due to the way TensorFlow handles data parallelism. But overall, we'll define the model, specify its inputs, and describe how the policy gradients step should be computed. """ self.task = task self.gameState = gameState predictor = None numaction = ACTION_SIZE worker_device = "/job:worker/task:{}/cpu:0".format(task) with tf.device( tf.train.replica_device_setter(1, worker_device=worker_device)): with tf.variable_scope("global"): self.network = LSTMPolicy(OBSERVATION_SHAPE, numaction) self.global_step = tf.get_variable( "global_step", [], tf.int32, initializer=tf.constant_initializer(0, dtype=tf.int32), trainable=False) with tf.device(worker_device): with tf.variable_scope("local"): self.local_network = pi = LSTMPolicy(OBSERVATION_SHAPE, numaction) pi.global_step = self.global_step # Computing a3c loss: https://arxiv.org/abs/1506.02438 self.ac = tf.placeholder(tf.float32, [None, numaction], name="ac") self.adv = tf.placeholder(tf.float32, [None], name="adv") self.r = tf.placeholder(tf.float32, [None], name="r") log_prob_tf = tf.nn.log_softmax(pi.logits) prob_tf = tf.nn.softmax(pi.logits) # 1) the "policy gradients" loss: its derivative is precisely the policy gradient # notice that self.ac is a placeholder that is provided externally. # adv will contain the advantages, as calculated in process_rollout pi_loss = -tf.reduce_mean( tf.reduce_sum(log_prob_tf * self.ac, 1) * self.adv) # Eq (19) # 2) loss of value function: l2_loss = (x-y)^2/2 vf_loss = 0.5 * tf.reduce_mean( tf.square(pi.vf - self.r)) # Eq (28) # 3) entropy to ensure randomness entropy = -tf.reduce_mean(tf.reduce_sum(prob_tf * log_prob_tf, 1)) # final a3c loss: lr of critic is half of actor self.loss = pi_loss + 0.5 * vf_loss - entropy * constants[ 'ENTROPY_BETA'] # compute gradients grads = tf.gradients( self.loss * 20.0, pi.var_list ) # batchsize=20. Factored out to make hyperparams not depend on it. 
        self.runner = RunnerThread(gameState, pi,
                                   constants['ROLLOUT_MAXLEN'], predictor)

        # storing summaries
        bs = tf.to_float(tf.shape(pi.x)[0])
        if use_tf12_api:
            tf.summary.scalar("model/policy_loss", pi_loss)
            tf.summary.scalar("model/value_loss", vf_loss)
            tf.summary.scalar("model/entropy", entropy)
            tf.summary.image("model/state", pi.x)  # max_outputs=10
            tf.summary.scalar("model/grad_global_norm", tf.global_norm(grads))
            tf.summary.scalar("model/var_global_norm",
                              tf.global_norm(pi.var_list))
            self.summary_op = tf.summary.merge_all()
        else:
            tf.scalar_summary("model/policy_loss", pi_loss)
            tf.scalar_summary("model/value_loss", vf_loss)
            tf.scalar_summary("model/entropy", entropy)
            tf.image_summary("model/state", pi.x)
            tf.scalar_summary("model/grad_global_norm", tf.global_norm(grads))
            tf.scalar_summary("model/var_global_norm",
                              tf.global_norm(pi.var_list))
            self.summary_op = tf.merge_all_summaries()

        # clip gradients
        grads, _ = tf.clip_by_global_norm(grads, constants['GRAD_NORM_CLIP'])
        grads_and_vars = list(zip(grads, self.network.var_list))

        # update global step by batch size
        inc_step = self.global_step.assign_add(tf.shape(pi.x)[0])

        # each worker has a different set of optimizer parameters
        # TODO: make optimizer global shared, if needed
        print("Optimizer: RMSProp with lr: %f" % (constants['LEARNING_RATE']))
        print("Input observation shape: ", OBSERVATION_SHAPE)
        opt = tf.train.RMSPropOptimizer(constants['LEARNING_RATE'])
        self.train_op = tf.group(opt.apply_gradients(grads_and_vars), inc_step)

        # copy weights from the parameter server to the local model
        sync_var_list = [
            v1.assign(v2)
            for v1, v2 in zip(pi.var_list, self.network.var_list)
        ]
        self.sync = tf.group(*sync_var_list)

        # initialize extras
        self.summary_writer = None
        self.local_steps = 0
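Spelled out, the loss assembled in the snippet above (and in the other variants in this file) is the standard A3C objective; the Eq. (19)/(28) references in the comments point to the generalized advantage estimation paper (arXiv:1506.02438). With \hat{A}_t the advantage fed through `adv`, R_t the return fed through `r`, and \beta_{\text{ent}} = ENTROPY_BETA:

\text{pi\_loss} = -\,\mathbb{E}_t\!\left[\log \pi_\theta(a_t \mid s_t)\,\hat{A}_t\right],\qquad
\text{vf\_loss} = \tfrac{1}{2}\,\mathbb{E}_t\!\left[\big(V_\theta(s_t) - R_t\big)^2\right],\qquad
\text{entropy} = \mathbb{E}_t\!\Big[-\textstyle\sum_a \pi_\theta(a \mid s_t)\log \pi_\theta(a \mid s_t)\Big],

\mathcal{L} \;=\; \text{pi\_loss} \;+\; \tfrac{1}{2}\,\text{vf\_loss} \;-\; \beta_{\text{ent}}\,\text{entropy}.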
def __init__(self, env, task, visualise, unsupType, envWrap=False, designHead='universe', noReward=False): """ An implementation of the A3C algorithm that is reasonably well-tuned for the VNC environments. Below, we will have a modest amount of complexity due to the way TensorFlow handles data parallelism. But overall, we'll define the model, specify its inputs, and describe how the policy gradients step should be computed. """ self.task = task self.unsup = unsupType is not None self.envWrap = envWrap self.env = env predictor = None numaction = env.action_space.n worker_device = "/job:worker/task:{}/cpu:0".format(task) with tf.device( tf.train.replica_device_setter(1, worker_device=worker_device)): with tf.variable_scope("global"): self.network = LSTMPolicy(env.observation_space.shape, numaction, designHead) self.global_step = tf.get_variable( "global_step", [], tf.int32, initializer=tf.constant_initializer(0, dtype=tf.int32), trainable=False) if self.unsup: with tf.variable_scope("predictor"): if 'state' in unsupType: self.ap_network = StatePredictor( env.observation_space.shape, numaction, designHead, unsupType) else: self.ap_network = StateActionPredictor( env.observation_space.shape, numaction, designHead) with tf.device(worker_device): with tf.variable_scope("local"): self.local_network = pi = LSTMPolicy( env.observation_space.shape, numaction, designHead) pi.global_step = self.global_step if self.unsup: with tf.variable_scope("predictor"): if 'state' in unsupType: self.local_ap_network = predictor = StatePredictor( env.observation_space.shape, numaction, designHead, unsupType) else: self.local_ap_network = predictor = StateActionPredictor( env.observation_space.shape, numaction, designHead) # Computing a3c loss: https://arxiv.org/abs/1506.02438 self.ac = tf.placeholder(tf.float32, [None, numaction], name="ac") self.adv = tf.placeholder(tf.float32, [None], name="adv") self.r = tf.placeholder(tf.float32, [None], name="r") log_prob_tf = tf.nn.log_softmax(pi.logits) prob_tf = tf.nn.softmax(pi.logits) # 1) the "policy gradients" loss: its derivative is precisely the policy gradient # notice that self.ac is a placeholder that is provided externally. # adv will contain the advantages, as calculated in process_rollout pi_loss = -tf.reduce_mean( tf.reduce_sum(log_prob_tf * self.ac, 1) * self.adv) # Eq (19) # 2) loss of value function: l2_loss = (x-y)^2/2 vf_loss = 0.5 * tf.reduce_mean( tf.square(pi.vf - self.r)) # Eq (28) # 3) entropy to ensure randomness entropy = -tf.reduce_mean(tf.reduce_sum(prob_tf * log_prob_tf, 1)) # final a3c loss: lr of critic is half of actor self.loss = pi_loss + 0.5 * vf_loss - entropy * constants[ 'ENTROPY_BETA'] # compute gradients grads = tf.gradients( self.loss * 20.0, pi.var_list ) # batchsize=20. Factored out to make hyperparams not depend on it. # computing predictor loss if self.unsup: if 'state' in unsupType: self.predloss = constants[ 'PREDICTION_LR_SCALE'] * predictor.forwardloss else: self.predloss = constants['PREDICTION_LR_SCALE'] * ( predictor.invloss * (1 - constants['FORWARD_LOSS_WT']) + predictor.forwardloss * constants['FORWARD_LOSS_WT']) predgrads = tf.gradients( self.predloss * 20.0, predictor.var_list ) # batchsize=20. Factored out to make hyperparams not depend on it. 
# do not backprop to policy if constants['POLICY_NO_BACKPROP_STEPS'] > 0: grads = [ tf.scalar_mul( tf.to_float( tf.greater( self.global_step, constants['POLICY_NO_BACKPROP_STEPS'])), grads_i) for grads_i in grads ] self.runner = RunnerThread(env, pi, constants['ROLLOUT_MAXLEN'], visualise, predictor, envWrap, noReward) # storing summaries bs = tf.to_float(tf.shape(pi.x)[0]) if use_tf12_api: tf.summary.scalar("model/policy_loss", pi_loss) tf.summary.scalar("model/value_loss", vf_loss) tf.summary.scalar("model/entropy", entropy) tf.summary.image("model/state", pi.x) # max_outputs=10 tf.summary.scalar("model/grad_global_norm", tf.global_norm(grads)) tf.summary.scalar("model/var_global_norm", tf.global_norm(pi.var_list)) if self.unsup: tf.summary.scalar("model/predloss", self.predloss) if 'action' in unsupType: tf.summary.scalar("model/inv_loss", predictor.invloss) tf.summary.scalar("model/forward_loss", predictor.forwardloss) tf.summary.scalar("model/predgrad_global_norm", tf.global_norm(predgrads)) tf.summary.scalar("model/predvar_global_norm", tf.global_norm(predictor.var_list)) self.summary_op = tf.summary.merge_all() else: tf.scalar_summary("model/policy_loss", pi_loss) tf.scalar_summary("model/value_loss", vf_loss) tf.scalar_summary("model/entropy", entropy) tf.image_summary("model/state", pi.x) tf.scalar_summary("model/grad_global_norm", tf.global_norm(grads)) tf.scalar_summary("model/var_global_norm", tf.global_norm(pi.var_list)) if self.unsup: tf.scalar_summary("model/predloss", self.predloss) if 'action' in unsupType: tf.scalar_summary("model/inv_loss", predictor.invloss) tf.scalar_summary("model/forward_loss", predictor.forwardloss) tf.scalar_summary("model/predgrad_global_norm", tf.global_norm(predgrads)) tf.scalar_summary("model/predvar_global_norm", tf.global_norm(predictor.var_list)) self.summary_op = tf.merge_all_summaries() # clip gradients grads, _ = tf.clip_by_global_norm(grads, constants['GRAD_NORM_CLIP']) grads_and_vars = list(zip(grads, self.network.var_list)) if self.unsup: predgrads, _ = tf.clip_by_global_norm( predgrads, constants['GRAD_NORM_CLIP']) pred_grads_and_vars = list( zip(predgrads, self.ap_network.var_list)) grads_and_vars = grads_and_vars + pred_grads_and_vars # update global step by batch size inc_step = self.global_step.assign_add(tf.shape(pi.x)[0]) # each worker has a different set of adam optimizer parameters # TODO: make optimizer global shared, if needed print("Optimizer: ADAM with lr: %f" % (constants['LEARNING_RATE'])) print("Input observation shape: ", env.observation_space.shape) opt = tf.train.AdamOptimizer(constants['LEARNING_RATE']) self.train_op = tf.group(opt.apply_gradients(grads_and_vars), inc_step) # copy weights from the parameter server to the local model sync_var_list = [ v1.assign(v2) for v1, v2 in zip(pi.var_list, self.network.var_list) ] if self.unsup: sync_var_list += [ v1.assign(v2) for v1, v2 in zip(predictor.var_list, self.ap_network.var_list) ] self.sync = tf.group(*sync_var_list) # initialize extras self.summary_writer = None self.local_steps = 0
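The gradient gating at the top of the previous snippet freezes the policy for the first POLICY_NO_BACKPROP_STEPS global steps by multiplying every gradient with a 0/1 indicator. A minimal NumPy restatement of that trick (the threshold value is illustrative):

import numpy as np

POLICY_NO_BACKPROP_STEPS = 1000  # illustrative value

def gate_policy_grads(grads, global_step):
    """Zero out policy gradients until global_step exceeds the burn-in threshold,
    mirroring the tf.scalar_mul(tf.to_float(tf.greater(...))) construction above."""
    gate = float(global_step > POLICY_NO_BACKPROP_STEPS)  # 0.0 or 1.0
    return [gate * g for g in grads]

if __name__ == "__main__":
    grads = [np.ones((2, 2)), np.ones(3)]
    print(gate_policy_grads(grads, global_step=500)[0])   # zeros: policy frozen
    print(gate_policy_grads(grads, global_step=5000)[0])  # unchanged: policy learns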
def __init__(self, envs, workerid, target_task): """ An implementation of the A3C algorithm that is reasonably well-tuned for the VNC environments. Below, we will have a modest amount of complexity due to the way TensorFlow handles data parallelism. But overall, we'll define the model, specify its inputs, and describe how the policy gradients step should be computed. """ self.env = envs self.num_tasks = num_tasks = 2 # only suitable when number of tasks equal to 2. self.target_task = target_task self.aux_tasks_id = int(1 - target_task) self.workerid = workerid self.network = [None] * self.num_tasks self.local_logitProjnet = [None] * self.num_tasks self.local_network = [None] * self.num_tasks self.global_step = [None] * self.num_tasks self.logitProjnet = [None] * self.num_tasks self.T1 = [100000000, 3000000] # [400, 5000] #[4000000, 6000000] self.T2 = [100000000, 4000000] # [400, 5000] #[4000000, 6000000] pi = [None] * self.num_tasks worker_device = "/job:worker/task:{}/cpu:0".format(workerid) with tf.device( tf.train.replica_device_setter(1, worker_device=worker_device)): with tf.variable_scope("global" + str(0)): # Pong self.network[0] = LSTMPolicy(envs[0].observation_space.shape, envs[0].action_space.n) self.global_step[0] = tf.get_variable( "global_step", [], tf.int32, initializer=tf.zeros_initializer, trainable=False) with tf.variable_scope("global" + str(1)): #bowling self.network[1] = LSTMPolicy(envs[1].observation_space.shape, envs[1].action_space.n) self.global_step[1] = tf.get_variable( "global_step", [], tf.int32, initializer=tf.zeros_initializer, trainable=False) with tf.variable_scope( "globallogits0"): # network for projection logits. self.logitProjnet[0] = logitsProj(envs[0].action_space.n) with tf.variable_scope( "globallogits1"): # network for projection logits. 
self.logitProjnet[1] = logitsProj(envs[1].action_space.n) with tf.device(worker_device): with tf.variable_scope("local" + str(0)): self.local_network[0] = pi[0] = LSTMPolicy( envs[0].observation_space.shape, envs[0].action_space.n) pi[0].global_step = self.global_step[0] with tf.variable_scope("local" + str(1)): self.local_network[1] = pi[1] = LSTMPolicy( envs[1].observation_space.shape, envs[1].action_space.n) pi[1].global_step = self.global_step[1] with tf.variable_scope("local" + "logits0"): self.local_logitProjnet[0] = logitsProj(envs[0].action_space.n) with tf.variable_scope("local" + "logits1"): self.local_logitProjnet[1] = logitsProj(envs[1].action_space.n) self.ac = [None] * num_tasks self.adv = [None] * num_tasks self.r = [None] * num_tasks log_prob_tf = [None] * num_tasks prob_tf = [None] * num_tasks pi_loss = [None] * num_tasks vf_loss = [None] * num_tasks entropy = [None] * num_tasks bs = [None] * num_tasks self.loss = [None] * num_tasks self.runner = [None] * num_tasks grads = [None] * num_tasks self.summary_op = [[None, None] for i in np.arange(num_tasks)] self.sync = [None] * num_tasks grads_and_vars = [None] * num_tasks self.inc_step = [None] * num_tasks opt = [None] * num_tasks self.train_op = [None] * num_tasks self.target_logits = [None] * num_tasks soft_p_temperature = [None] * num_tasks soft_t_temperature = [None] * num_tasks self.KD_trainop = [None] * num_tasks kl_loss = [None] * num_tasks grads_kd = [None] * num_tasks grads_and_vars_kd = [None] * num_tasks optkd = [None] * num_tasks self.sync_logits = [None] * num_tasks self.logits_stu = [None] * num_tasks soft_student_logits = [None] * num_tasks soft_teacher_logits = [None] * num_tasks self.proj_loss = [None] * num_tasks grad_logproj = [None] * num_tasks grads_and_vars_logproj = [None] * num_tasks optlgproj = [None] * num_tasks self.lgproj_trainop = [None] * num_tasks self.summary_op_proj = [None] * num_tasks for ii in np.arange(num_tasks): # start to build loss for target network self.ac[ii] = tf.placeholder(tf.float32, [None, envs[ii].action_space.n], name="ac" + str(ii)) self.adv[ii] = tf.placeholder(tf.float32, [None], name="adv" + str(ii)) self.r[ii] = tf.placeholder(tf.float32, [None], name="r" + str(ii)) log_prob_tf[ii] = tf.nn.log_softmax(pi[ii].logits) prob_tf[ii] = tf.nn.softmax(pi[ii].logits) # the "policy gradients" loss: its derivative is precisely the policy gradient # notice that self.ac is a placeholder that is provided externally. # ac will contain the advantages, as calculated in process_rollout pi_loss[ii] = -tf.reduce_sum( tf.reduce_sum(log_prob_tf[ii] * self.ac[ii], [1]) * self.adv[ii]) # loss of value function vf_loss[ii] = 0.5 * tf.reduce_sum( tf.square(pi[ii].vf - self.r[ii])) entropy[ii] = -tf.reduce_sum(prob_tf[ii] * log_prob_tf[ii]) bs[ii] = tf.to_float(tf.shape(pi[ii].x)[0]) self.loss[ ii] = pi_loss[ii] + 0.5 * vf_loss[ii] - entropy[ii] * 0.01 # 20 represents the number of "local steps": the number of timesteps # we run the policy before we update the parameters. # The larger local steps is, the lower is the variance in our policy gradients estimate # on the one hand; but on the other hand, we get less frequent parameter updates, which # slows down learning. In this code, we found that making local steps be much # smaller than 20 makes the algorithm more difficult to tune and to get to work. 
# name = "worker"+str(workerid)+"task"+str(ii) name = "task" + str(ii) self.runner[ii] = RunnerThread(envs[ii], pi[ii], 20, name) grads[ii] = tf.gradients(self.loss[ii], pi[ii].var_list) summaries1 = list() # summary when it's target tasks summaries1.append( tf.scalar_summary("model/policy_loss" + str(ii), pi_loss[ii] / bs[ii])) summaries1.append( tf.scalar_summary("model/value_loss" + str(ii), vf_loss[ii] / bs[ii])) summaries1.append( tf.scalar_summary("model/entropy" + str(ii), entropy[ii] / bs[ii])) summaries1.append( tf.image_summary("model/state" + str(ii), pi[ii].x)) summaries1.append( tf.scalar_summary("model/grad_global_norm" + str(ii), tf.global_norm(grads[ii]))) summaries1.append( tf.scalar_summary("model/var_global_norm" + str(ii), tf.global_norm(pi[ii].var_list))) summaries1.append( tf.histogram_summary("model/action_weight" + str(ii), prob_tf[ii])) summaries2 = list() # summary when it's aux tasks. summaries2.append( tf.histogram_summary("model/action_weight" + str(ii), prob_tf[ii])) summaries2.append( tf.scalar_summary("model/entropy" + str(ii), entropy[ii] / bs[ii])) self.summary_op[ii][0] = tf.merge_summary(summaries1) self.summary_op[ii][1] = tf.merge_summary(summaries2) grads[ii], _ = tf.clip_by_global_norm(grads[ii], 40.0) # self.sync = [None] * self.num_tasks zipvars_lp = zip(pi[ii].var_list, self.network[ii].var_list) self.sync[ii] = tf.group( *[v1.assign(v2) for v1, v2 in zipvars_lp]) grads_and_vars[ii] = list( zip(grads[ii], self.network[ii].var_list)) self.inc_step[ii] = self.global_step[ii].assign_add( tf.shape(pi[ii].x)[0]) # each worker has a different set of adam optimizer parameters opt[ii] = tf.train.AdamOptimizer(1e-4) self.train_op[ii] = tf.group( opt[ii].apply_gradients(grads_and_vars[ii]), self.inc_step[ii]) # knowledge distillation self.target_logits[ii] = tf.placeholder( tf.float32, [None, envs[ii].action_space.n], name="target_logits") # logits from teacher Tao = 1.0 # temperature used for distillation. 
soft_p_temperature[ii] = tf.nn.softmax( pi[ii].logits_fordistill) soft_t_temperature[ii] = tf.nn.softmax( tf.truediv(self.target_logits[ii], Tao)) kl_loss[ii] = tf.reduce_mean( tf.reduce_sum( soft_t_temperature[ii] * tf.log(1e-10 + tf.truediv( soft_t_temperature[ii], soft_p_temperature[ii])), 1)) grads_kd[ii] = tf.gradients(kl_loss[ii], pi[ii].var_list) grads_kd[ii], _ = tf.clip_by_global_norm(grads_kd[ii], 40.0) grads_and_vars_kd[ii] = list( zip(grads_kd[ii], self.network[ii].var_list)) optkd[ii] = tf.train.AdamOptimizer(1e-4) self.KD_trainop[ii] = optkd[ii].apply_gradients( grads_and_vars_kd[ii]) 'learning logits projection' zipvars_lp = zip(self.local_logitProjnet[ii].var_list, self.logitProjnet[ii].var_list) self.sync_logits[ii] = tf.group( *[v1.assign(v2) for v1, v2 in zipvars_lp]) # soft_student_logits = tf.nn.softmax(pi[target_task].logits) self.logits_stu[ii] = tf.placeholder( tf.float32, [None, envs[ii].action_space.n]) soft_student_logits[ii] = tf.nn.softmax(self.logits_stu[ii]) soft_teacher_logits[ii] = tf.nn.softmax( self.local_logitProjnet[ii].logits_out) self.proj_loss[ii] = proj_loss = tf.reduce_mean( tf.reduce_sum( soft_teacher_logits[ii] * tf.log(1e-10 + tf.truediv( soft_teacher_logits[ii], soft_student_logits[ii])), 1)) # target task --> student grad_logproj[ii] = tf.gradients( proj_loss, self.local_logitProjnet[ii].var_list) grad_logproj[ii], _ = tf.clip_by_global_norm( grad_logproj[ii], 40.0) grads_and_vars_logproj[ii] = list( zip(grad_logproj[ii], self.logitProjnet[ii].var_list)) optlgproj[ii] = tf.train.AdamOptimizer(1e-4) self.lgproj_trainop[ii] = optlgproj[ii].apply_gradients( grads_and_vars_logproj[ii]) self.summary_op_proj[ii] = tf.scalar_summary( "model/proj_loss" + str(ii), self.proj_loss[ii]) self.summary_writer = None self.local_steps = 0
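The distillation term built above is a KL divergence from the temperature-softened teacher distribution to the student distribution (only the teacher logits are divided by the temperature Tao). A small self-contained NumPy sketch of the same computation, with illustrative names:

import numpy as np

def softmax(logits, temperature=1.0):
    z = logits / temperature
    z = z - z.max(axis=-1, keepdims=True)  # numerical stability
    e = np.exp(z)
    return e / e.sum(axis=-1, keepdims=True)

def distillation_loss(teacher_logits, student_logits, temperature=1.0, eps=1e-10):
    """KL(teacher || student) between softened distributions, matching kl_loss above."""
    p_teacher = softmax(teacher_logits, temperature)
    p_student = softmax(student_logits)  # the student softmax above is not softened
    kl = np.sum(p_teacher * np.log(eps + p_teacher / p_student), axis=-1)
    return kl.mean()

if __name__ == "__main__":
    teacher = np.array([[2.0, 0.5, -1.0]])
    student = np.array([[1.5, 0.7, -0.5]])
    print(distillation_loss(teacher, student, temperature=1.0))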
def __init__(self, env, task, visualise, learning_rate, meta, remotes,
             num_trials, total_num_steps):
    """
    (original comment) An implementation of the A3C algorithm that is reasonably well-tuned for the VNC environments.
    Below, we will have a modest amount of complexity due to the way TensorFlow handles data parallelism.
    But overall, we'll define the model, specify its inputs, and describe how the policy gradients step should be computed.
    """
    self.env = env
    self.task = task
    self.remotes = remotes
    self.learning_rate = learning_rate
    self.num_trials = num_trials
    num_local_steps = 5  # t_max in the A3C paper: number of steps in the rollouts

    # True if the environment is a Bandit environment
    isBanditEnvironment = "Bandit" in env.env.spec.id
    if isBanditEnvironment:
        if 'Two' in env.env.spec.id:
            reward_range = 2
        elif 'Eleven' in env.env.spec.id:
            reward_range = 13

    worker_device = "/job:worker/task:{}/cpu:0".format(task)
    with tf.device(
            tf.train.replica_device_setter(1, worker_device=worker_device)):
        with tf.variable_scope("global"):
            if isBanditEnvironment:
                self.network = LSTMPolicyBandit(env.observation_space.shape,
                                                env.action_space.n,
                                                reward_range)
            else:
                self.network = LSTMPolicy(env.observation_space.shape,
                                          env.action_space.n)
            self.global_step = tf.get_variable(
                "global_step", [], tf.int32,
                initializer=tf.constant_initializer(0, dtype=tf.int32),
                trainable=False
            )  # creates a global counter initialised to zero, unless an existing training run is resumed

    with tf.device(worker_device):
        with tf.variable_scope("local"):
            if isBanditEnvironment:
                self.local_network = pi = LSTMPolicyBandit(
                    env.observation_space.shape, env.action_space.n,
                    reward_range)
            else:
                self.local_network = pi = LSTMPolicy(
                    env.observation_space.shape, env.action_space.n)
            pi.global_step = self.global_step

        # action: the list of action vectors at each step of the rollout;
        # fed by A3C's process function in trainer.process(sess) in worker.py
        self.ac = tf.placeholder(tf.float32, [None, env.action_space.n],
                                 name="ac")
        # advantage: the advantage at each step of the rollout; fed by the same process function
        self.adv = tf.placeholder(tf.float32, [None], name="adv")
        # return: the return obtained after visiting each step of the rollout; fed by the same process function
        self.return_ = tf.placeholder(tf.float32, [None], name="return_")

        log_prob_tf = tf.nn.log_softmax(pi.logits)  # log probability of each action, log(\pi(a|s))
        prob_tf = tf.nn.softmax(pi.logits)  # probability of each action, \pi(a|s)

        # (original comment) the "policy gradients" loss: its derivative is precisely the policy gradient.
        # Notice that self.ac is a placeholder that is provided externally.
        pi_loss = -tf.reduce_sum(
            tf.reduce_sum(log_prob_tf * self.ac, [1]) * self.adv)

        # loss of value function
        vf_loss = 0.5 * tf.reduce_sum(
            tf.square(pi.vf - self.return_)
        )  # why not take the sum of the squared values of self.adv directly?

        entropy = -tf.reduce_sum(prob_tf * log_prob_tf)
        # entropy coefficient, annealed linearly from 1 to 0 over total_num_steps
        beta_entropy = (float(1) / total_num_steps) * tf.cast(
            tf.constant(total_num_steps) - self.global_step, tf.float32)

        bs = tf.to_float(tf.shape(pi.x)[0])  # bs = batch size = number of steps in the rollout
        self.loss = pi_loss + (0.05 * vf_loss) - (
            beta_entropy * entropy
        )  # why another factor here, when 0.5 is already included in the definition of vf_loss?
        # (Original comment)
        # num_local_steps represents the number of timesteps we run the policy before we update the parameters.
        # The larger local steps is, the lower is the variance in our policy gradients estimate on the one hand;
        # but on the other hand, we get less frequent parameter updates, which slows down learning.
        # In this code, we found that making local steps be much smaller than 20 makes the algorithm more difficult to tune and to get to work.
        # (My comment):
        # The original A3C paper uses num_local_steps = 5 on Atari games, but it uses an action repeat of 4 (not present here),
        # so the network is updated every 20 frames, as in the original universe-starter-agent.
        self.runner = RunnerThread(
            env, pi, num_local_steps, visualise, meta, task, remotes,
            num_trials
        )  # instance of the RunnerThread class defined above; num_local_steps is the maximum number of steps in a partial rollout

        # computes the gradient of the loss function:
        grads = tf.gradients(self.loss, pi.var_list)

        # tensorboard:
        if use_tf12_api:
            tf.summary.scalar("model/policy_loss", pi_loss / bs)
            tf.summary.scalar("model/value_loss", vf_loss / bs)
            tf.summary.scalar("model/entropy", entropy / bs)
            tf.summary.scalar("model/grad_global_norm", tf.global_norm(grads))
            tf.summary.scalar("model/var_global_norm",
                              tf.global_norm(pi.var_list))
            if not isBanditEnvironment:
                tf.summary.image("model/state", pi.x)
            self.summary_op = tf.summary.merge_all()
        else:
            tf.scalar_summary("model/policy_loss", pi_loss / bs)
            tf.scalar_summary("model/value_loss", vf_loss / bs)
            tf.scalar_summary("model/entropy", entropy / bs)
            tf.scalar_summary("model/grad_global_norm", tf.global_norm(grads))
            tf.scalar_summary("model/var_global_norm",
                              tf.global_norm(pi.var_list))
            if not isBanditEnvironment:
                tf.image_summary("model/state", pi.x)
            self.summary_op = tf.merge_all_summaries()

        # Create a list of (gradient, variable) pairs to feed into the Adam Optimizer
        # (each variable will then be updated according to the paired gradient)
        grads, _ = tf.clip_by_global_norm(grads, 40.0)
        grads_and_vars = list(zip(grads, self.network.var_list))

        # copy weights from the parameter server to the local model
        self.sync = tf.group(*[
            v1.assign(v2)
            for v1, v2 in zip(pi.var_list, self.network.var_list)
        ])  # replaces the values in pi.var_list with those from self.network.var_list (executed in the "process" function)

        # updates the global counter: adds (and assigns) tf.shape(pi.x)[0] to the value of the variable
        # self.global_step (initialised to zero), and inc_step takes this updated value:
        inc_step = self.global_step.assign_add(
            tf.shape(pi.x)[0]
        )  # increments the global counter by the number of steps in the rollout (= batch size); called by the process function
        self.inc_step = inc_step  # so that we can call it directly from the inc_global_step method

        # each worker has a different set of adam optimizer parameters
        opt = tf.train.AdamOptimizer(
            self.learning_rate
        )  # the default learning rate is 1e-4; this value can be changed with the argument -lr <new_value>
        self.train_op = tf.group(
            opt.apply_gradients(grads_and_vars), inc_step
        )  # tf.group creates an op that groups multiple operations (here, two operations)
        self.summary_writer = None
        self.local_steps = 0
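For reference, the beta_entropy term defined in the previous snippet is a linear schedule on the entropy bonus; restated as a plain function (note that, like the TensorFlow expression, it goes negative if global_step ever exceeds total_num_steps):

def entropy_coefficient(global_step, total_num_steps):
    """Linear annealing of the entropy bonus: 1.0 at step 0, 0.0 at total_num_steps."""
    return (total_num_steps - global_step) / float(total_num_steps)

# e.g. with total_num_steps = 1000000:
#   entropy_coefficient(0, 1000000)       -> 1.0
#   entropy_coefficient(500000, 1000000)  -> 0.5
#   entropy_coefficient(1000000, 1000000) -> 0.0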
def __init__(self, env, task): self.env = env self.task = task worker_device = "/job:worker/task:{}/cpu:0".format(task) with tf.device( tf.train.replica_device_setter(1, worker_device=worker_device)): with tf.variable_scope("global"): self.network = LSTMPolicy(env.observation_space.shape, env.action_space.n) self.global_step = tf.get_variable( "global_step", [], tf.int32, initializer=tf.zeros_initializer, trainable=False) with tf.device(worker_device): with tf.variable_scope("local"): self.local_network = pi = LSTMPolicy( env.observation_space.shape, env.action_space.n) pi.global_step = self.global_step self.ac = tf.placeholder(tf.float32, [None, env.action_space.n], name="ac") self.adv = tf.placeholder(tf.float32, [None], name="adv") self.r = tf.placeholder(tf.float32, [None], name="r") log_prob_tf = tf.nn.log_softmax(pi.logits) prob_tf = tf.nn.softmax(pi.logits) pi_loss = -tf.reduce_sum( tf.reduce_sum(log_prob_tf * self.ac, [1]) * self.adv) vf_loss = 0.5 * tf.reduce_sum(tf.square(pi.vf - self.r)) entropy = -tf.reduce_sum(prob_tf * log_prob_tf) bs = tf.to_float(tf.shape(pi.x)[0]) self.loss = pi_loss + 0.5 * vf_loss - entropy * 0.01 self.runner = RunnerThread(env, pi, 20) grads = tf.gradients(self.loss, pi.var_list) tf.scalar_summary("model/policy_loss", pi_loss / bs) tf.scalar_summary("model/value_loss", vf_loss / bs) tf.scalar_summary("model/entropy", entropy / bs) tf.image_summary("model/state", pi.x) tf.scalar_summary("model/grad_global_norm", tf.global_norm(grads)) tf.scalar_summary("model/var_global_norm", tf.global_norm(pi.var_list)) self.summary_op = tf.merge_all_summaries() grads, _ = tf.clip_by_global_norm(grads, 40.0) self.sync = tf.group(*[ v1.assign(v2) for v1, v2 in zip(pi.var_list, self.network.var_list) ]) grads_and_vars = list(zip(grads, self.network.var_list)) inc_step = self.global_step.assign_add(tf.shape(pi.x)[0]) opt = tf.train.AdamOptimizer(1e-4) self.train_op = tf.group(opt.apply_gradients(grads_and_vars), inc_step) self.summary_writer = None self.local_steps = 0
def inference(args): indir = os.path.join(args.log_dir, 'train') outdir = os.path.join( args.log_dir, 'inference') if args.out_dir is None else args.out_dir with open(indir + "/checkpoint", "r") as f: first_line = f.readline().strip() print("first_line is : {}".format(first_line)) ckpt = first_line.split(' ')[-1].split('/')[-1][:-1] ckpt = ckpt.split('-')[-1] ckpt = indir + '/model.ckpt-' + ckpt print("ckpt: {}".format(ckpt)) # define environment env = create_icegame_env(outdir, args.env_id) num_actions = env.action_space.n with tf.device("/cpu:0"): # define policy network with tf.variable_scope("global"): policy = LSTMPolicy(env.observation_space.shape, num_actions) policy.global_step = tf.get_variable( "global_step", [], tf.int32, initializer=tf.constant_initializer(0, dtype=tf.int32), trainable=False) # Variable names that start with "local" are not saved in checkpoints. variables_to_restore = [ v for v in tf.global_variables() if not v.name.startswith("local") ] init_all_op = tf.global_variables_initializer() saver = FastSaver(variables_to_restore) # print trainable variables var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) logger.info('Trainable vars:') for v in var_list: logger.info(' {} {}'.format(v.name, v.get_shape())) logger.info("Restored the trained model.") # summary of rewards action_writers = [] summary_writer = tf.summary.FileWriter(outdir) for act_idx in range(num_actions): action_writers.append( tf.summary.FileWriter( os.path.join(outdir, "action_{}".format(act_idx)))) logger.info("Inference events directory: %s", outdir) config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) with tf.Session() as sess: logger.info("Initializing all parameters.") sess.run(init_all_op) logger.info("Restoring trainable global parameters.") saver.restore(sess, ckpt) logger.info("Restored model was trained for %.2fM global steps", sess.run(policy.global_step) / 1000000.) last_features = policy.get_initial_features() # reset lstm memory length = 0 rewards = 0 loopsizes = [] # All Episodes records for ep in range(args.num_episodes): """TODO: policy sampling strategy random, greedy and sampled policy. """ last_state = env.reset() # Episode records # running policy while True: fetched = policy.act_inference(last_state, *last_features) prob_action, action, value_, features = fetched[ 0], fetched[1], fetched[2], fetched[3:] #TODO: policy sampling strategy # Greedy stepAct = action.argmax() state, reward, terminal, info = env.step(stepAct) # update stats length += 1 rewards += reward last_state = state last_features = features """TODO: Resonable Statistics are necessary """ if info: loopsize = info["Loop Size"] looparea = info["Loop Area"] # store summary summary = tf.Summary() summary.value.add(tag='ep_{}/reward'.format(ep), simple_value=reward) summary.value.add(tag='ep_{}/netreward'.format(ep), simple_value=rewards) summary.value.add(tag='ep_{}/value'.format(ep), simple_value=float(value_[0])) if info: summary.value.add(tag='ep_{}/loop_size'.format(ep), simple_value=loopsize) summary.value.add(tag='ep_{}/loop_area'.format(ep), simple_value=looparea) loopsizes.append(loopsize) summary_writer.add_summary(summary, length) summary_writer.flush() summary = tf.Summary() for ac_id in range(num_actions): summary.value.add(tag='ep_{}/a_{}'.format(ep, ac_id), simple_value=float( prob_action[ac_id])) action_writers[ac_id].add_summary(summary, length) action_writers[ac_id].flush() """TODO: 1. 
                Need a more concrete idea for playing the game during inference.
                2. Save these values for post-processing.
                """
                if terminal:
                    # if length >= timestep_limit:
                    #     last_state, _, _, _ = env.reset()
                    last_features = policy.get_initial_features()  # reset lstm memory
                    print("Episode finished. Sum of rewards: %.2f. Length: %d." %
                          (rewards, length))
                    length = 0
                    rewards = 0
                    break

        logger.info('Finished %d true episodes.', args.num_episodes)

        # Count loop topology
        unique, counts = np.unique(loopsizes, return_counts=True)
        loopstatistics = dict(zip(unique, counts))
        print(loopstatistics)
        env.close()
def __init__(self, env, task, visualise, sensor_pb, vision_pb): """ An implementation of the A3C algorithm that is reasonably well-tuned for the VNC environments. Below, we will have a modest amount of complexity due to the way TensorFlow handles data parallelism. But overall, we'll define the model, specify its inputs, and describe how the policy gradients step should be computed. """ self.env = env self.task = task self.visualise = visualise obs_shape = env.observation_shape worker_device = '/job:worker/task:{}/cpu:0'.format(task) with tf.device( tf.train.replica_device_setter(1, worker_device=worker_device)): with tf.variable_scope('global'): self.network = LSTMPolicy(obs_shape, env.action_space) self.global_step = tf.get_variable( 'global_step', [], tf.int32, initializer=tf.constant_initializer(0, dtype=tf.int32), trainable=False) with tf.device(worker_device): with tf.variable_scope("local"): self.local_network = pi = LSTMPolicy(obs_shape, env.action_space) pi.global_step = self.global_step self.ac = tf.placeholder(tf.float32, [None, env.action_space], name="ac") self.adv = tf.placeholder(tf.float32, [None], name="adv") self.r = tf.placeholder(tf.float32, [None], name="r") with gfile.FastGFile(sensor_pb, 'rb') as f: graph_def = tf.GraphDef() graph_def.ParseFromString(f.read()) self.input_sns, self.output_sns = tf.import_graph_def( graph_def, return_elements=['lstm_1_input:0', 'output_node0:0']) with gfile.FastGFile(vision_pb, 'rb') as f: graph_def = tf.GraphDef() graph_def.ParseFromString(f.read()) self.input_img, self.output_img = tf.import_graph_def( graph_def, return_elements=['input_1:0', 'output_node0:0']) env.sensor_agent.input_tf = self.input_sns env.sensor_agent.output_tf = self.output_sns env.vision_agent.input_tf = self.input_img env.vision_agent.output_tf = self.output_img log_prob_tf = tf.nn.log_softmax(pi.logits) prob_tf = tf.nn.softmax(pi.logits) # the "policy gradients" loss: its derivative is precisely the # policy gradient notice that self.ac is a placeholder that is # provided externally. adv will contain the advantages, as # calculated in process_rollout pi_loss = -tf.reduce_sum( tf.reduce_sum(log_prob_tf * self.ac, [1]) * self.adv) # loss of value function vf_loss = 0.5 * tf.reduce_sum(tf.square(pi.vf - self.r)) entropy = -tf.reduce_sum(prob_tf * log_prob_tf) bs = tf.to_float(tf.shape(pi.x)[0]) self.loss = pi_loss + 0.5 * vf_loss - entropy * 0.01 grads = tf.gradients(self.loss, pi.var_list) tf.summary.scalar("model/policy_loss", pi_loss / bs) tf.summary.scalar("model/value_loss", vf_loss / bs) tf.summary.scalar("model/entropy", entropy / bs) tf.summary.scalar("model/grad_global_norm", tf.global_norm(grads)) tf.summary.scalar("model/var_global_norm", tf.global_norm(pi.var_list)) self.summary_op = tf.summary.merge_all() grads, _ = tf.clip_by_global_norm(grads, 40.0) # copy weights from the parameter server to the local model self.sync = tf.group(*[ v1.assign(v2) for v1, v2 in zip(pi.var_list, self.network.var_list) ]) grads_and_vars = list(zip(grads, self.network.var_list)) inc_step = self.global_step.assign_add(tf.shape(pi.x)[0]) # each worker has a different set of adam optimizer parameters opt = tf.train.AdamOptimizer(1e-4) self.train_op = tf.group(opt.apply_gradients(grads_and_vars), inc_step) self.summary_writer = None self.local_steps = 0
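The two frozen graphs imported above expose plain input/output tensors that the environment's sensor and vision agents hold on to. A hedged sketch of how such imported tensors are typically evaluated (the helper name and the obs_window/frame variables are illustrative):

def run_frozen_model(sess, input_tensor, output_tensor, batch):
    """Evaluate a frozen (GraphDef) model wired in via tf.import_graph_def."""
    return sess.run(output_tensor, feed_dict={input_tensor: batch})

# e.g. inside the environment, using the tensors assigned above:
#   sensor_pred = run_frozen_model(sess, env.sensor_agent.input_tf,
#                                  env.sensor_agent.output_tf, obs_window)
#   vision_pred = run_frozen_model(sess, env.vision_agent.input_tf,
#                                  env.vision_agent.output_tf, frame[None, ...])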
def __init__(self, env, env_id, task): """ An implementation of the A3C algorithm that is reasonably well-tuned for the VNC environments. Below, we will have a modest amount of complexity due to the way TensorFlow handles data parallelism. But overall, we'll define the model, specify its inputs, and describe how the policy gradients step should be computed. """ self.env = env self.task = task self.env_id = env_id from config import if_learning_v self.if_learning_v = if_learning_v from config import project, mode if (project is 'f') and (mode is 'on_line'): self.log_thread = True else: '''only log if the task is on zero and cluster is the main cluster''' if (self.task % config.num_workers_global == 0) and (config.cluster_current == config.cluster_main): self.log_thread = True else: self.log_thread = False worker_device = "/job:worker/task:{}/cpu:0".format(task) with tf.device( tf.train.replica_device_setter(1, worker_device=worker_device)): with tf.variable_scope("global"): self.network = LSTMPolicy(env.observation_space.shape, env.action_space.n, self.env_id) self.global_step = tf.get_variable( "global_step", [], tf.int32, initializer=tf.zeros_initializer(), trainable=False) with tf.device(worker_device): with tf.variable_scope("local"): self.local_network = pi = LSTMPolicy( env.observation_space.shape, env.action_space.n, self.env_id) pi.global_step = self.global_step # self.env_id = 'PongDeterministic-v3' self.ac = tf.placeholder(tf.float32, [None, env.action_space.n], name="ac") self.adv = tf.placeholder(tf.float32, [None], name="adv") self.r = tf.placeholder(tf.float32, [None], name="r") self.step_forward = tf.placeholder(tf.int32, [None], name="step_forward") if self.if_learning_v: self.v_lable = tf.placeholder(tf.float32, [None], name="v_lable") log_prob_tf = tf.nn.log_softmax(pi.logits) prob_tf = tf.nn.softmax(pi.logits) # the "policy gradients" loss: its derivative is precisely the policy gradients # notice that self.ac is a placeholder that is provided externally. # ac will contain the advantages, as calculated in process_rollout pi_loss = -tf.reduce_sum( tf.reduce_sum(log_prob_tf * self.ac, [1]) * self.adv) # loss of value function vf_loss = 0.5 * tf.reduce_sum(tf.square(pi.vf - self.r)) # -entropy loss entropy = -tf.reduce_sum(prob_tf * log_prob_tf) # v loss if self.if_learning_v: v_loss = 0.5 * tf.reduce_sum(tf.square(pi.v - self.v_lable)) bs = tf.to_float(tf.shape(pi.x)[0]) if self.if_learning_v: self.loss = pi_loss + 0.5 * vf_loss - entropy * 0.01 + 0.5 * v_loss else: self.loss = pi_loss + 0.5 * vf_loss - entropy * 0.01 # config.update_step represents the number of "local steps": the number of timesteps # we run the policy before we update the parameters. # The larger local steps is, the lower is the variance in our policy gradients estimate # on the one hand; but on the other hand, we get less frequent parameter updates, which # slows down learning. In this code, we found that making local steps be much # smaller than 20 makes the algorithm more difficult to tune and to get to work. 
self.runner = RunnerThread(env, env_id, pi, config.update_step, self.log_thread) grads = tf.gradients(self.loss, pi.var_list) tf.summary.scalar(self.env_id + "/model/policy_loss", pi_loss / bs) tf.summary.scalar(self.env_id + "/model/value_loss", vf_loss / bs) tf.summary.scalar(self.env_id + "/model/entropy", entropy / bs) tf.summary.scalar(self.env_id + "/model/grad_global_norm", tf.global_norm(grads)) tf.summary.scalar(self.env_id + "/model/var_global_norm", tf.global_norm(pi.var_list)) if self.if_learning_v: tf.summary.scalar(self.env_id + "/model/v_loss", v_loss / bs) self.summary_op = tf.summary.merge_all() grads, _ = tf.clip_by_global_norm(grads, 40.0) # copy weights from the parameter server to the local model self.sync = tf.group(*[ v1.assign(v2) for v1, v2 in zip(pi.var_list, self.network.var_list) ]) grads_and_vars = list(zip(grads, self.network.var_list)) inc_step = self.global_step.assign_add( tf.shape(self.step_forward)[0]) # each worker has a different set of adam optimizer parameters opt = tf.train.AdamOptimizer(1e-4) self.train_op = tf.group(opt.apply_gradients(grads_and_vars), inc_step) self.summary_writer = None self.local_steps = 0
def inference(args): indir = os.path.join(args.log_dir, 'train') outdir = os.path.join(args.log_dir, 'player') if args.out_dir is None else args.out_dir with open(indir + "/checkpoint", "r") as f: first_line = f.readline().strip() print ("first_line is : {}".format(first_line)) ckpt = first_line.split(' ')[-1].split('/')[-1][:-1] ckpt = ckpt.split('-')[-1] ckpt = indir + '/model.ckpt-' + ckpt print ("ckpt: {}".format(ckpt)) # define environment env = create_icegame_env(outdir, args.env_id) num_actions = env.action_space.n with tf.device("/cpu:0"): # define policy network with tf.variable_scope("global"): policy = LSTMPolicy(env.observation_space.shape, num_actions) policy.global_step = tf.get_variable("global_step", [], tf.int32, initializer=tf.constant_initializer(0, dtype=tf.int32), trainable=False) # Variable names that start with "local" are not saved in checkpoints. variables_to_restore = [v for v in tf.global_variables() if not v.name.startswith("local")] init_all_op = tf.global_variables_initializer() saver = FastSaver(variables_to_restore) # print trainable variables var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) logger.info('Trainable vars:') for v in var_list: logger.info(' {} {}'.format(v.name, v.get_shape())) logger.info("Restored the trained model.") # summary of rewards action_writers = [] summary_writer = tf.summary.FileWriter(outdir) for act_idx in range(num_actions): action_writers.append(tf.summary.FileWriter( os.path.join(outdir, "action_{}".format(act_idx)) )) logger.info("Inference events directory: %s", outdir) config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) with tf.Session() as sess: logger.info("Initializing all parameters.") sess.run(init_all_op) logger.info("Restoring trainable global parameters.") saver.restore(sess, ckpt) logger.info("Restored model was trained for %.2fM global steps", sess.run(policy.global_step)/1000000.) last_features = policy.get_initial_features() # reset lstm memory length = 0 rewards = 0 # For plotting plt.ion() fig = plt.figure(num=None, figsize=(8, 8), dpi=92, facecolor='w', edgecolor='k') gs1 = gridspec.GridSpec(3, 3) gs1.update(left=0.05, right=0.85, wspace=0.15) ax1 = plt.subplot(gs1[:-1, :]) ax2 = plt.subplot(gs1[-1, :-1]) ax3 = plt.subplot(gs1[-1, -1]) ax1.set_title("IceGame (Agent Lives: {}, UpTimes: {})".format(env.lives, env.sim.get_updated_counter())) ind = np.arange(num_actions) width = 0.20 #action_legends = ["Up", "Down", "Left", "Right", "NextUp", "NextDown", "Metropolis"] action_legends = [">", "v", "<", "^", "", "", "Metro"] for ep in range(args.num_episodes): """TODO: policy sampling strategy random, greedy and sampled policy. 
""" last_state = env.reset() steps_rewards=[] steps_values=[] # running policy while True: fetched = policy.act_inference(last_state, *last_features) prob_action, action, value_, features = fetched[0], fetched[1], fetched[2], fetched[3:] #TODO: policy sampling strategy # Greedy #print ("Prob of actions: {}".format(prob_action)) stepAct = action.argmax() state, reward, terminal, info = env.step(stepAct) # update stats length += 1 rewards += reward last_state = state last_features = features steps_rewards.append(rewards) steps_values.append(value_) if info: loopsize = info["Loop Size"] looparea = info["Loop Area"] """Animation for State and Actions """ ax2.clear() ax2.bar(ind, prob_action) ax2.set_xticks(ind + width / 2) ax2.set_xticklabels(action_legends) ax1.imshow(state[:,:,2], 'Reds', interpolation="None", vmin=-1, vmax=1) # with hist #ax1.imshow(state[:,:,7], 'Reds', interpolation="None", vmin=-1, vmax=1) ax1.set_title("IceGame: (Agent Lives: {}, UpTimes: {})".format(env.lives, env.sim.get_updated_counter())) ax3.clear() ax3.plot(steps_rewards, linewidth=2) ax3.plot(steps_values, linewidth=2) #plt.savefig("records/{}.png".format(length)) plt.pause(0.20) # store summary summary = tf.Summary() summary.value.add(tag='ep_{}/reward'.format(ep), simple_value=reward) summary.value.add(tag='ep_{}/netreward'.format(ep), simple_value=rewards) summary.value.add(tag='ep_{}/value'.format(ep), simple_value=float(value_[0])) if info: summary.value.add(tag='ep_{}/loop_size'.format(ep), simple_value=loopsize) summary.value.add(tag='ep_{}/loop_area'.format(ep), simple_value=looparea) summary_writer.add_summary(summary, length) summary_writer.flush() summary = tf.Summary() for ac_id in range(num_actions): summary.value.add(tag='ep_{}/a_{}'.format(ep, ac_id), simple_value=float(prob_action[ac_id])) action_writers[ac_id].add_summary(summary, length) action_writers[ac_id].flush() """TODO: 1. Need more concrete idea for playing the game when interfering. 2. Save these values for post processing. """ if terminal: #if length >= timestep_limit: # last_state, _, _, _ = env.reset() last_features = policy.get_initial_features() # reset lstm memory print("Episode finished. Sum of rewards: %.2f. Length: %d." % (rewards, length)) length = 0 rewards = 0 break logger.info('Finished %d true episodes.', args.num_episodes) plt.savefig("GameScene.png") logger.info("Save the last scene to GameScene.png") env.close()
def __init__(self, env, task, visualise, test=False): """ An implementation of the A3C algorithm that is reasonably well-tuned for the VNC environments. Below, we will have a modest amount of complexity due to the way TensorFlow handles data parallelism. But overall, we'll define the model, specify its inputs, and describe how the policy gradients step should be computed. """ self.env = env self.task = task self.meta_action_size = 32 worker_device = "/job:worker/task:{}/cpu:0".format(task) if test: worker_device = "/job:eval/task:{}/cpu:0".format(task) with tf.device( tf.train.replica_device_setter(1, worker_device=worker_device)): with tf.variable_scope("global"): self.network = LSTMPolicy(env.observation_space.shape, env.action_space.n, self.meta_action_size) self.global_step = tf.get_variable( "global_step", [], tf.int32, initializer=tf.constant_initializer(0, dtype=tf.int32), trainable=False) self.meta_network = MetaPolicy(env.observation_space.shape, self.meta_action_size) with tf.device(worker_device): with tf.variable_scope("local"): self.local_network = pi = LSTMPolicy( env.observation_space.shape, env.action_space.n, self.meta_action_size) self.local_meta_network = meta_pi = MetaPolicy( env.observation_space.shape, self.meta_action_size) pi.global_step = self.global_step self.ac = tf.placeholder(tf.float32, [None, env.action_space.n], name="ac") self.adv = tf.placeholder(tf.float32, [None], name="adv") self.r = tf.placeholder(tf.float32, [None], name="r") log_prob_tf = tf.nn.log_softmax(pi.logits) prob_tf = tf.nn.softmax(pi.logits) # the "policy gradients" loss: its derivative is precisely the policy gradient # notice that self.ac is a placeholder that is provided externally. # adv will contain the advantages, as calculated in process_rollout pi_loss = -tf.reduce_sum( tf.reduce_sum(log_prob_tf * self.ac, [1]) * self.adv) # loss of value function vf_loss = 0.5 * tf.reduce_sum(tf.square(pi.vf - self.r)) entropy = -tf.reduce_sum(prob_tf * log_prob_tf) bs = tf.to_float(tf.shape(pi.x)[0]) self.loss = pi_loss + 0.5 * vf_loss - entropy * 0.01 self.visualise = visualise grads = tf.gradients(self.loss, pi.var_list) actor_summary = [ tf.summary.scalar("model/policy_loss", pi_loss / bs), tf.summary.scalar("model/value_loss", vf_loss / bs), tf.summary.scalar("model/entropy", entropy / bs), tf.summary.image("model/state", pi.x), tf.summary.scalar("model/grad_global_norm", tf.global_norm(grads)), tf.summary.scalar("model/var_global_norm", tf.global_norm(pi.var_list)) ] self.summary_op = tf.summary.merge(actor_summary) grads, _ = tf.clip_by_global_norm(grads, 40.0) # This is sync ops which copy weights from shared space to the local. 
self.sync = tf.group(*([ v1.assign(v2) for v1, v2 in zip(pi.var_list, self.network.var_list) ])) grads_and_vars = list(zip(grads, self.network.var_list)) inc_step = self.global_step.assign_add(tf.shape(pi.x)[0]) # each worker has a different set of adam optimizer parameters opt = tf.train.AdamOptimizer(1e-4) self.train_op = tf.group(opt.apply_gradients(grads_and_vars), inc_step) self.summary_writer = None self.local_steps = 0 ################################### ########## META CONTROLLER ######## ################################### self.meta_ac = tf.placeholder(tf.float32, [None, self.meta_action_size], name="meta_ac") self.meta_adv = tf.placeholder(tf.float32, [None], name="meta_adv") self.meta_r = tf.placeholder(tf.float32, [None], name="meta_r") meta_log_prob_tf = tf.nn.log_softmax(meta_pi.logits) meta_prob_tf = tf.nn.softmax(meta_pi.logits) meta_pi_loss = -tf.reduce_sum( tf.reduce_sum(meta_log_prob_tf * self.meta_ac, [1]) * self.meta_adv) meta_vf_loss = 0.5 * tf.reduce_sum( tf.square(meta_pi.vf - self.meta_r)) # entropy meta_entropy = -tf.reduce_sum(meta_prob_tf * meta_log_prob_tf) meta_bs = tf.to_float(tf.shape(meta_pi.x)[0]) self.meta_loss = meta_pi_loss + 0.5 * meta_vf_loss - meta_entropy * 0.01 meta_grads = tf.gradients(self.meta_loss, meta_pi.var_list) meta_grads, _ = tf.clip_by_global_norm(meta_grads, 40.0) self.meta_sync = tf.group(*([ v1.assign(v2) for v1, v2 in zip(meta_pi.var_list, self.meta_network.var_list) ])) meta_grads_and_vars = list( zip(meta_grads, self.meta_network.var_list)) meta_opt = tf.train.AdamOptimizer(1e-4) self.meta_train_op = meta_opt.apply_gradients(meta_grads_and_vars) meta_summary = [ tf.summary.scalar("meta_model/policy_loss", meta_pi_loss / meta_bs), tf.summary.scalar("meta_model/value_loss", meta_vf_loss / meta_bs), tf.summary.scalar("meta_model/entropy", meta_entropy / meta_bs), tf.summary.scalar("meta_model/grad_global_norm", tf.global_norm(meta_grads)), tf.summary.scalar("meta_model/var_global_norm", tf.global_norm(meta_pi.var_list)) ] self.meta_summary_op = tf.summary.merge(meta_summary) self.beta = 0.75
def inference(args):
    """
    Restores only the LSTMPolicy architecture and runs inference with it.
    """
    # get the address of the checkpoint
    indir = os.path.join(args.log_dir, 'train')
    outdir = os.path.join(args.log_dir, 'inference') if args.out_dir is None else args.out_dir
    with open(indir + '/checkpoint', 'r') as f:
        first_line = f.readline().strip()
    ckpt = first_line.split(' ')[-1].split('/')[-1][:-1]
    ckpt = ckpt.split('-')[-1]
    ckpt = indir + '/model.ckpt-' + ckpt

    # define environment
    if args.record:
        env = create_env(args.env_id, client_id='0', remotes=None, envWrap=args.envWrap,
                         designHead=args.designHead, record=True, noop=args.noop,
                         acRepeat=args.acRepeat, outdir=outdir)
    else:
        env = create_env(args.env_id, client_id='0', remotes=None, envWrap=args.envWrap,
                         designHead=args.designHead, record=True, noop=args.noop,
                         acRepeat=args.acRepeat)
    numaction = env.action_space.n

    with tf.device("/cpu:0"):
        # define policy network
        with tf.variable_scope("global"):
            policy = LSTMPolicy(env.observation_space.shape, numaction, args.designHead)
            policy.global_step = tf.get_variable(
                "global_step", [], tf.int32,
                initializer=tf.constant_initializer(0, dtype=tf.int32),
                trainable=False)

        # Variable names that start with "local" are not saved in checkpoints.
        if use_tf12_api:
            variables_to_restore = [v for v in tf.global_variables() if not v.name.startswith("local")]
            init_all_op = tf.global_variables_initializer()
        else:
            variables_to_restore = [v for v in tf.all_variables() if not v.name.startswith("local")]
            init_all_op = tf.initialize_all_variables()
        saver = FastSaver(variables_to_restore)

        # print trainable variables
        var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)
        logger.info('Trainable vars:')
        for v in var_list:
            logger.info('  %s %s', v.name, v.get_shape())

        # summary of rewards
        action_writers = []
        if use_tf12_api:
            summary_writer = tf.summary.FileWriter(outdir)
            for ac_id in range(numaction):
                action_writers.append(tf.summary.FileWriter(os.path.join(outdir, 'action_{}'.format(ac_id))))
        else:
            summary_writer = tf.train.SummaryWriter(outdir)
            for ac_id in range(numaction):
                action_writers.append(tf.train.SummaryWriter(os.path.join(outdir, 'action_{}'.format(ac_id))))
        logger.info("Inference events directory: %s", outdir)

        config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        with tf.Session(config=config) as sess:
            logger.info("Initializing all parameters.")
            sess.run(init_all_op)
            logger.info("Restoring trainable global parameters.")
            saver.restore(sess, ckpt)
            logger.info("Restored model was trained for %.2fM global steps",
                        sess.run(policy.global_step) / 1000000.)

            # saving with meta graph:
            metaSaver = tf.train.Saver(variables_to_restore)
            metaSaver.save(sess, '/home/swagking0/noreward-rl/models/models_me/mario_me')

            last_state = env.reset()
            if args.render or args.record:
                env.render()
            last_features = policy.get_initial_features()  # reset lstm memory
            length = 0
            rewards = 0
            mario_distances = np.zeros((args.num_episodes,))
            for i in range(args.num_episodes):
                print("Starting episode %d" % (i + 1))
                if args.recordSignal:
                    from PIL import Image
                    signalCount = 1
                    utils.mkdir_p(outdir + '/recordedSignal/ep_%02d/' % i)
                    Image.fromarray((255 * last_state[..., -1]).astype('uint8')).save(
                        outdir + '/recordedSignal/ep_%02d/%06d.jpg' % (i, signalCount))

                if args.random:
                    print('I am random policy!')
                else:
                    if args.greedy:
                        print('I am greedy policy!')
                    else:
                        print('I am sampled policy!')
                while True:
                    # run policy
                    fetched = policy.act_inference(last_state, *last_features)
                    prob_action, action, value_, features = fetched[0], fetched[1], fetched[2], fetched[3:]

                    # run environment: sampled one-hot 'action' (not greedy)
                    if args.random:
                        stepAct = np.random.randint(0, numaction)  # random policy
                    else:
                        if args.greedy:
                            stepAct = prob_action.argmax()  # greedy policy
                        else:
                            stepAct = action.argmax()
                    # print(stepAct, prob_action.argmax(), prob_action)
                    state, reward, terminal, info = env.step(stepAct)

                    # update stats
                    length += 1
                    rewards += reward
                    last_state = state
                    last_features = features
                    if args.render or args.record:
                        env.render()
                    if args.recordSignal:
                        signalCount += 1
                        Image.fromarray((255 * last_state[..., -1]).astype('uint8')).save(
                            outdir + '/recordedSignal/ep_%02d/%06d.jpg' % (i, signalCount))

                    # store summary
                    summary = tf.Summary()
                    summary.value.add(tag='ep_{}/reward'.format(i), simple_value=reward)
                    summary.value.add(tag='ep_{}/netreward'.format(i), simple_value=rewards)
                    summary.value.add(tag='ep_{}/value'.format(i), simple_value=float(value_[0]))
                    if 'NoFrameskip-v' in args.env_id:  # atari
                        summary.value.add(tag='ep_{}/lives'.format(i),
                                          simple_value=env.unwrapped.ale.lives())
                    summary_writer.add_summary(summary, length)
                    summary_writer.flush()
                    summary = tf.Summary()
                    for ac_id in range(numaction):
                        summary.value.add(tag='action_prob', simple_value=float(prob_action[ac_id]))
                        action_writers[ac_id].add_summary(summary, length)
                        action_writers[ac_id].flush()

                    timestep_limit = env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps')
                    if timestep_limit is None:
                        timestep_limit = env.spec.timestep_limit
                    if terminal or length >= timestep_limit:
                        if length >= timestep_limit or not env.metadata.get('semantics.autoreset'):
                            last_state = env.reset()
                        last_features = policy.get_initial_features()  # reset lstm memory
                        print("Episode finished. Sum of rewards: %.2f. Length: %d." % (rewards, length))
                        if 'distance' in info:
                            print('Mario Distance Covered:', info['distance'])
                            mario_distances[i] = info['distance']
                        length = 0
                        rewards = 0
                        if args.render or args.record:
                            env.render()
                        if args.recordSignal:
                            signalCount += 1
                            Image.fromarray((255 * last_state[..., -1]).astype('uint8')).save(
                                outdir + '/recordedSignal/ep_%02d/%06d.jpg' % (i, signalCount))
                        break

            logger.info('Finished %d true episodes.', args.num_episodes)
            if 'distance' in info:
                print('Mario Distances:', mario_distances)
                np.save(outdir + '/distances.npy', mario_distances)
            env.close()
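# ----------------------------------------------------------------------------
# Usage sketch (hypothetical, not part of the original code): `inference`
# expects an argparse-style namespace.  The attribute names below are exactly
# the fields that `inference` reads; every concrete value is an assumption
# chosen for illustration, not a default of this repository.
# ----------------------------------------------------------------------------
def _demo_inference():
    from argparse import Namespace

    demo_args = Namespace(
        log_dir='tmp/mario',             # must contain train/checkpoint and the matching model.ckpt-* files
        out_dir=None,                    # None -> events are written to <log_dir>/inference
        env_id='SuperMarioBros-1-1-v0',  # hypothetical environment id
        record=False, render=False, recordSignal=False,
        envWrap=True, designHead='universe',
        noop=False, acRepeat=0,
        num_episodes=2,
        greedy=False, random=False,
    )
    inference(demo_args)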
def __init__(self, env, task, visualise):
    """
    An implementation of the A3C algorithm that is reasonably well-tuned for the VNC environments.
    Below, we will have a modest amount of complexity due to the way TensorFlow handles data parallelism.
    But overall, we'll define the model, specify its inputs, and describe how the policy gradients step
    should be computed.

    Attributes set up here:
    1. env: the environment
    2. task: task id specifying which worker this is
    3. network: the global (shared) LSTMPolicy
    4. global_step: variable that tracks the number of global steps
    5. local_network: the worker-local copy of LSTMPolicy
    6. ac: actor-critic action placeholder (one-hot)
    7. adv: advantage placeholder, one value per timestep
    8. r: discounted-reward placeholder
    9. loss: the total loss value
    10. runner: the thread that collects rollouts
    """
    self.env = env
    self.task = task  # task id specifying which worker is working

    worker_device = "/job:worker/task:{}/cpu:0".format(task)  # create a single worker
    with tf.device(tf.train.replica_device_setter(1, worker_device=worker_device)):
        with tf.variable_scope("global"):
            # the input shape is 128 x 200 x 1; the action space size is env.action_space.n
            self.network = LSTMPolicy(env.observation_space.shape, env.action_space.n)
            self.global_step = tf.get_variable("global_step", [], tf.int32,
                                               initializer=tf.constant_initializer(0, dtype=tf.int32),
                                               trainable=False)

    with tf.device(worker_device):
        with tf.variable_scope("local"):
            # observation shape 128 x 200, action space size 7 if using neon race
            self.local_network = pi = LSTMPolicy(env.observation_space.shape, env.action_space.n)
            # give the local network a reference to the global step counter
            pi.global_step = self.global_step

        # placeholders for the training inputs
        self.ac = tf.placeholder(tf.float32, [None, env.action_space.n], name="ac")
        self.adv = tf.placeholder(tf.float32, [None], name="adv")
        self.r = tf.placeholder(tf.float32, [None], name="r")

        # probability of actions
        log_prob_tf = tf.nn.log_softmax(pi.logits)
        prob_tf = tf.nn.softmax(pi.logits)

        # the "policy gradients" loss: its derivative is precisely the policy gradient.
        # notice that self.ac is a placeholder that is provided externally;
        # adv will contain the advantages, as calculated in process_rollout.
        pi_loss = -tf.reduce_sum(tf.reduce_sum(log_prob_tf * self.ac, [1]) * self.adv)

        # loss of the value function: sum of squared errors between the predicted
        # state value and the actual discounted return
        vf_loss = 0.5 * tf.reduce_sum(tf.square(pi.vf - self.r))
        entropy = -tf.reduce_sum(prob_tf * log_prob_tf)

        bs = tf.to_float(tf.shape(pi.x)[0])
        # this is the total loss function
        self.loss = pi_loss + 0.5 * vf_loss - entropy * 0.01

        # 20 represents the number of "local steps": the number of timesteps
        # we run the policy before we update the parameters.
        # The larger local steps is, the lower is the variance in our policy gradients estimate
        # on the one hand; but on the other hand, we get less frequent parameter updates, which
        # slows down learning. In this code, we found that making local steps be much
        # smaller than 20 makes the algorithm more difficult to tune and to get to work.
        # t_max = 20: look ahead 20 steps
        self.runner = RunnerThread(env, pi, 20, visualise)

        grads = tf.gradients(self.loss, pi.var_list)

        # save summaries
        if use_tf12_api:
            tf.summary.scalar("model/policy_loss", pi_loss / bs)
            tf.summary.scalar("model/value_loss", vf_loss / bs)
            tf.summary.scalar("model/entropy", entropy / bs)
            tf.summary.image("model/state", pi.x)
            tf.summary.scalar("model/grad_global_norm", tf.global_norm(grads))
            tf.summary.scalar("model/var_global_norm", tf.global_norm(pi.var_list))
            self.summary_op = tf.summary.merge_all()
        else:
            tf.scalar_summary("model/policy_loss", pi_loss / bs)
            tf.scalar_summary("model/value_loss", vf_loss / bs)
            tf.scalar_summary("model/entropy", entropy / bs)
            tf.image_summary("model/state", pi.x)
            tf.scalar_summary("model/grad_global_norm", tf.global_norm(grads))
            tf.scalar_summary("model/var_global_norm", tf.global_norm(pi.var_list))
            self.summary_op = tf.merge_all_summaries()

        # perform gradient clipping
        # https://hackernoon.com/gradient-clipping-57f04f0adae
        grads, _ = tf.clip_by_global_norm(grads, 40.0)

        # copy weights from the parameter server to the local model
        self.sync = tf.group(*[v1.assign(v2) for v1, v2 in zip(pi.var_list, self.network.var_list)])

        # this produces a list of (gradient, variable) pairs
        grads_and_vars = list(zip(grads, self.network.var_list))
        inc_step = self.global_step.assign_add(tf.shape(pi.x)[0])

        # each worker has a different set of adam optimizer parameters
        opt = tf.train.AdamOptimizer(1e-4)
        self.train_op = tf.group(opt.apply_gradients(grads_and_vars), inc_step)
        self.summary_writer = None
        self.local_steps = 0
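# ----------------------------------------------------------------------------
# Sketch (not from the original file): the ops defined in __init__ above are
# consumed by the worker's training loop, which is not shown here.  The method
# below shows one plausible wiring, assuming universe-starter-agent-style
# conventions: a `process_rollout` helper returning a batch with fields
# si/a/adv/r/features, a RunnerThread exposing a `queue`, and an LSTM policy
# exposing `x` and `state_in`.  All of those names are assumptions, since the
# corresponding code is not included in this file.
# ----------------------------------------------------------------------------
def process_sketch(self, sess):
    # one worker iteration: sync from the global network, grab a rollout, apply gradients
    sess.run(self.sync)                              # copy the latest global weights into the local network
    rollout = self.runner.queue.get(timeout=600.0)   # rollout produced by RunnerThread (assumed API)
    batch = process_rollout(rollout, gamma=0.99)     # assumed helper: yields si, a, adv, r, features

    should_summarize = self.task == 0 and self.local_steps % 11 == 0
    if should_summarize:
        fetches = [self.summary_op, self.train_op, self.global_step]
    else:
        fetches = [self.train_op, self.global_step]

    feed_dict = {
        self.local_network.x: batch.si,                      # observations
        self.ac: batch.a,                                    # one-hot actions taken
        self.adv: batch.adv,                                 # advantages
        self.r: batch.r,                                     # discounted returns
        self.local_network.state_in[0]: batch.features[0],   # LSTM cell state
        self.local_network.state_in[1]: batch.features[1],   # LSTM hidden state
    }
    fetched = sess.run(fetches, feed_dict=feed_dict)

    if should_summarize and self.summary_writer is not None:
        self.summary_writer.add_summary(tf.Summary.FromString(fetched[0]), fetched[-1])
        self.summary_writer.flush()
    self.local_steps += 1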
def init(env):
    policy = LSTMPolicy(env.observation_space.shape, env.action_space.n)
    # Load this from training snapshot
    return policy