# Distributed policy-gradient training. Two variants follow: first a
# single-machine "local" job, then a parameter-server/worker cluster version.
import time

import gym
import numpy as np
import tensorflow as tf

FLAGS = tf.app.flags.FLAGS  # flag definitions assumed elsewhere (a sketch appears further below)


# Variant 1: all tasks run in a single "local" job on one machine.
def main(arg):
    cluster = tf.train.ClusterSpec(
        {"local": ["localhost:2222", "localhost:2223", "localhost:2224"]})
    server = tf.train.Server(cluster,
                             job_name="local",
                             task_index=FLAGS.task_index)
    is_chief = (FLAGS.task_index == 0)

    # Assigns ops to the local worker by default.
    # Note: this automatically sets the device on which ops/vars are stored.
    # Ops are placed on the local worker running this code, vars are placed
    # on the param server - so each worker has its own copy of the network,
    # but the actual weight values are shared through the param server.
    with tf.device("/job:local/task:%d" % FLAGS.task_index):
        env = gym.make(FLAGS.env)
        # !!! Pong specific: only use the up and down actions.
        actions = np.array([0, 1])  # env.action_space
        n_actions = actions.size

        running_reward = tf.Variable(0., name="running_reward")
        tf.scalar_summary("Running Reward", running_reward)
        summary_op = tf.merge_all_summaries()

        # Dimensions must match the graph rebuilt in run_episode below
        # (80*80 difference image, 200 hidden units, 2 actions).
        model = two_layer_net(80 * 80, 200, 2)
        S, A, Adv = model['input_ph'], model['actions_ph'], model['advantage_ph']
        net, optimizer, loss = model['net'], model['optimizer'], model['loss']
        saver, init_op, global_step = model['saver'], model['init_op'], model['global_step']

    # Create a "supervisor", which oversees the training process.
    logdir = "./tmp/train_logs/"
    sv = tf.train.Supervisor(is_chief=is_chief,
                             logdir=logdir,
                             init_op=init_op,
                             summary_op=None,  # disable summary thread; it crashes
                             saver=saver,
                             global_step=global_step,
                             save_model_secs=5)

    # The supervisor takes care of session initialization, restoring from
    # a checkpoint, and closing when done or an error occurs.
    print server.target
    with sv.managed_session(server.target) as sess:
        print "waiting 5 secs for a checkpoint to get saved"
        time.sleep(5)

        writer = tf.train.SummaryWriter(logdir, graph=sess.graph)
        step = 0           # global training steps
        avg_reward = None  # running average episode reward
        local_step = 0     # local training steps performed
        n_e = 0            # episodes completed

        # Loop until the supervisor shuts down or max steps have completed.
        while not sv.should_stop() and step < FLAGS.num_steps:
            # Get data by interacting with the env using the current policy.
            obs, acts, returns = None, None, None
            rwds = 0
            for e in range(FLAGS.num_episodes):
                # Get a single episode.
                o_n, a_n, r_n = run_episode(env, model, server, logdir,
                                            actions, n_e)
                # Get the episode's discounted, standardized returns.
                disc_r = discount_rewards(r_n, FLAGS.gamma)
                disc_r -= np.mean(disc_r)
                disc_r /= np.std(disc_r)
                # Store results.
                r = np.sum(r_n)
                rwds += r
                avg_reward = r if avg_reward is None \
                    else .99 * avg_reward + .01 * r
                obs = o_n if obs is None else np.append(obs, o_n, axis=0)
                acts = a_n if acts is None else np.append(acts, a_n)
                returns = disc_r if returns is None else np.append(returns, disc_r)
                n_e += 1

            # Feed the collected trajectories to the policy-gradient update.
            _, step_loss, step, summary = sess.run(
                [optimizer, loss, global_step, summary_op],
                feed_dict={S: obs,
                           A: acts,
                           Adv: returns,
                           running_reward: avg_reward})
            writer.add_summary(summary, n_e)
            num = FLAGS.num_episodes
            print('step %d, rew %1.3f, avg reward for %d episodes is %1.3f'
                  % (local_step, avg_reward, num, rwds / num))
            local_step += 1

    # Ask for all the services to stop.
    print('stopped')
    sv.stop()
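# Example launch for the three local tasks (an assumed command line; the
# script name and exact flags are placeholders, not confirmed by the source):
#
#   python pg_local.py --task_index=0 --env=Pong-v0 &
#   python pg_local.py --task_index=1 --env=Pong-v0 &
#   python pg_local.py --task_index=2 --env=Pong-v0 &
#
# Task 0 is the chief: it initializes variables and writes checkpoints to
# ./tmp/train_logs/, which every task's run_episode restores from.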
def run_episode(env, model, server, checkpoint_dir, actions, n_e):
    '''Generate a trajectory by running a single episode.

    Runs a single episode in a gym environment using a neural network
    policy restored from the latest checkpoint, and returns the
    observation, action and reward trajectories.

    Args:
        env: Gym environment.
        model: dict describing the policy network (see two_layer_net).
        server: tf.train.Server for the local task (unused here).
        checkpoint_dir: directory to restore the latest checkpoint from.
        actions: action set for env.
        n_e: index of the current episode (for logging).

    Returns:
        list: ndarrays containing observations, actions and rewards
        respectively.
    '''
    ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
    # Rebuild the policy network in a fresh graph and restore the latest
    # weights, so rollouts always use a recent copy of the shared params.
    graph = tf.Graph()
    sess = tf.Session(graph=graph)
    with sess.graph.as_default():
        model = two_layer_net(80 * 80, 200, 2)
        net, saver = model['net'], model['saver']
        input_ph = model['input_ph']
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            raise Exception("No checkpoint found")

    observation = env.reset()
    prev_x = None
    xs, acts, rs = [], [], []
    done = False

    while not done:
        # TODO: implement deepmind preprocessing with histories
        # Preprocess the current observation.
        cur_x = prepro(observation)
        # The network input is the difference image.
        x = cur_x - prev_x if prev_x is not None else np.zeros((1, cur_x.size))
        prev_x = cur_x

        # Forward the policy network to get action probabilities.
        feed_dict = {input_ph: x}
        aprob = sess.run(net, feed_dict=feed_dict)
        # Sample an action using the probabilities.
        act = np.random.choice(actions.size, p=aprob.flatten())

        # Record the trajectory.
        xs.append(x)      # observation
        acts.append(act)  # action (index)

        # Step the environment and get new measurements.
        observation, reward, done, info = env.step(actions[act])

        # Record the reward. Note: this has to be done after we call step()
        # so that we get the reward for the previous action.
        rs.append(reward)

        if reward != 0:
            # Pong has either +1 or -1 reward exactly when a game ends.
            print('ep %d: game finished, reward: %f' % (n_e, reward)
                  + ('' if reward == -1 else ' !!!!!!!!'))

    return [np.vstack(xs), np.array(acts), np.array(rs)]
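# prepro() and discount_rewards() are called above but not defined in this
# section. Minimal sketches follow, assuming the usual Pong pipeline: crop,
# downsample to 80x80 and binarize the frame, and compute gamma-discounted
# returns that reset at game boundaries (where the reward is nonzero). The
# exact implementations used by this code may differ.

def prepro(I):
    '''Crop and downsample a 210x160x3 Pong frame into a 1x6400 float row
    vector (a sketch of the assumed preprocessing).'''
    I = I[35:195]       # crop the playing field
    I = I[::2, ::2, 0]  # downsample by a factor of 2, keep one channel
    I[I == 144] = 0     # erase background type 1
    I[I == 109] = 0     # erase background type 2
    I[I != 0] = 1       # paddles and ball set to 1
    return I.astype(np.float).reshape((1, 80 * 80))


def discount_rewards(r, gamma, env_name='Pong-v0'):
    '''Gamma-discounted returns (a sketch). The second variant below passes
    env_name explicitly; the reset at game boundaries is Pong-specific.'''
    discounted = np.zeros_like(r, dtype=np.float64)
    running_add = 0
    for t in reversed(xrange(r.size)):
        if env_name.startswith('Pong') and r[t] != 0:
            running_add = 0  # reset the running sum at game boundaries
        running_add = running_add * gamma + r[t]
        discounted[t] = running_add
    return discounted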
# Variant 2: separate parameter-server ("ps") and "worker" jobs, with the
# cluster spec coming either from flags or from an AWS config.
def main(arg):
    if not FLAGS.aws:
        ps_hosts = FLAGS.ps_hosts.split(",")
        worker_hosts = FLAGS.worker_hosts.split(",")
        # Create a cluster from the parameter server and worker hosts.
        cluster = tf.train.ClusterSpec({
            "ps": ps_hosts,
            "worker": worker_hosts
        })
        # Create and start a server for the local task.
        job_name = FLAGS.job_name
        task_index = FLAGS.task_index
        server = tf.train.Server(cluster,
                                 job_name=FLAGS.job_name,
                                 task_index=FLAGS.task_index)
    else:
        config = cluster_config(FLAGS)
        cluster = tf.train.ClusterSpec({
            "ps": config['ps_hosts'],
            "worker": config['worker_hosts']
        })
        job_name = config['job']
        task_index = config['task_id']
        server = tf.train.Server(cluster,
                                 job_name=config['job'],
                                 task_index=config['task_id'])

    if job_name == "ps":
        server.join()
    elif job_name == "worker":
        is_chief = (task_index == 0)
        # Assigns ops to the local worker by default.
        # Note: this automatically sets the device on which ops/vars are
        # stored. Ops are placed on the local worker running this code, vars
        # are placed on the param server - so each worker has its own copy of
        # the network, but the actual weight values are shared through the
        # param server.
        with tf.device(tf.train.replica_device_setter(cluster=cluster)):
            env, env_name = gym.make(FLAGS.env), FLAGS.env
            pre = preprocess if FLAGS.preprocess else None
            actions = np.array(map(int, FLAGS.actions.split(',')))
            n_actions = actions.size

            running_reward = tf.placeholder(tf.float32, name="running_reward")
            tf.scalar_summary("Running Reward", running_reward)
            summary_op = tf.merge_all_summaries()

            model = two_layer_net(FLAGS.inp_dim, 200, FLAGS.out_dim)
            S, A, Adv = model['input_ph'], model['actions_ph'], model['advantage_ph']
            net, optimizer, loss = model['net'], model['optimizer'], model['loss']
            gradients_op = model['gradients']
            grads_buffer_ph = model['grads_buffer_ph']
            network_params = model['network_params']
            saver, init_op, global_step = model['saver'], model['init_op'], model['global_step']

        # Create a "supervisor", which oversees the training process.
        logdir = "./" + env_name + "_train_logs/"
        sv = tf.train.Supervisor(is_chief=is_chief,
                                 logdir=logdir,
                                 init_op=init_op,
                                 summary_op=None,  # disable summary thread; it crashes
                                 saver=saver,
                                 global_step=global_step,
                                 save_model_secs=200)

        # The supervisor takes care of session initialization, restoring from
        # a checkpoint, and closing when done or an error occurs.
        with sv.managed_session(server.target) as sess:
            print "waiting 5 secs for a checkpoint to get saved"
            time.sleep(5)

            writer = tf.train.SummaryWriter(logdir,
                                            graph=tf.get_default_graph())
            step = 0           # global training steps
            avg_reward = None  # running average episode reward
            local_step = 0     # local training steps performed
            n_e = 0            # episodes completed
            grads_buffer = sess.run(network_params)

            # Loop until the supervisor shuts down or max steps have
            # completed.
            while not sv.should_stop() and step < FLAGS.num_steps:
                # Get data by interacting with the env using the current
                # policy.
                rwds = 0
                # Zero the gradient accumulation buffer.
                for i, grad in enumerate(grads_buffer):
                    grads_buffer[i] = grad * 0

                for e in range(FLAGS.num_episodes):
                    # Get a single episode.
                    o_n, a_n, r_n = run_episode(env, model, server, logdir,
                                                actions, n_e)
                    # Get the episode's discounted, standardized returns.
                    disc_r = discount_rewards(r_n, FLAGS.gamma, env_name)
                    disc_r -= np.mean(disc_r)
                    disc_r /= np.std(disc_r)
                    # Store results.
                    r = np.sum(r_n)
                    rwds += r
                    avg_reward = r if avg_reward is None \
                        else .99 * avg_reward + .01 * r
                    # Accumulate this episode's policy gradients.
                    gradients = sess.run(gradients_op,
                                         feed_dict={S: o_n,
                                                    A: a_n,
                                                    Adv: disc_r})
                    for i, grad in enumerate(gradients):
                        grads_buffer[i] += grad
                    n_e += 1

                # Apply the accumulated gradients as one training step.
                feed_dict = {}
                for i, grad in enumerate(grads_buffer_ph):
                    feed_dict[grads_buffer_ph[i]] = grads_buffer[i]
                feed_dict[running_reward] = avg_reward
                _, step, summary = sess.run(
                    [optimizer, global_step, summary_op],
                    feed_dict=feed_dict)
                writer.add_summary(summary, n_e)
                num = FLAGS.num_episodes
                print('step %d, rew %1.3f, avg reward for %d episodes is %1.3f'
                      % (local_step, avg_reward, num, rwds / num))
                local_step += 1

        # Ask for all the services to stop.
        print('stopped')
        sv.stop()
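# The FLAGS used throughout (task_index, env, gamma, num_steps, num_episodes,
# ps_hosts, worker_hosts, job_name, aws, preprocess, actions, inp_dim,
# out_dim) are not defined in this section, and neither are cluster_config()
# (the AWS path) or preprocess(). A plausible flag-definition block, with
# guessed defaults, might look like this:

flags = tf.app.flags
flags.DEFINE_integer("task_index", 0, "Index of the task within its job")
flags.DEFINE_string("job_name", "worker", "Either 'ps' or 'worker'")
flags.DEFINE_string("ps_hosts", "localhost:2222",
                    "Comma-separated ps host:port pairs")
flags.DEFINE_string("worker_hosts", "localhost:2223",
                    "Comma-separated worker host:port pairs")
flags.DEFINE_boolean("aws", False, "Read the cluster spec from cluster_config()")
flags.DEFINE_string("env", "Pong-v0", "Gym environment id")
flags.DEFINE_string("actions", "2,3", "Comma-separated action indices to use")
flags.DEFINE_boolean("preprocess", False, "Whether to preprocess observations")
flags.DEFINE_integer("inp_dim", 4, "Policy network input dimension")
flags.DEFINE_integer("out_dim", 2, "Policy network output dimension")
flags.DEFINE_float("gamma", 0.99, "Discount factor for returns")
flags.DEFINE_integer("num_steps", 1000000, "Max global training steps")
flags.DEFINE_integer("num_episodes", 10, "Episodes per gradient update")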
def run_episode(env, model, server, checkpoint_dir, actions, n_e):
    '''Generate a trajectory by running a single episode.

    Runs a single episode in a gym environment using a neural network
    policy restored from the latest checkpoint, and returns the
    observation, action and reward trajectories.

    Args:
        env: Gym environment.
        model: dict describing the policy network (see two_layer_net).
        server: tf.train.Server for the local task (unused here).
        checkpoint_dir: directory to restore the latest checkpoint from.
        actions: action set for env.
        n_e: index of the current episode (unused here).

    Returns:
        list: ndarrays containing observations, actions and rewards
        respectively.
    '''
    ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
    # Rebuild the policy network in a fresh graph and restore the latest
    # weights; the dimensions must match the training graph built in main().
    graph = tf.Graph()
    sess = tf.Session(graph=graph)
    with sess.graph.as_default():
        model = two_layer_net(FLAGS.inp_dim, 200, FLAGS.out_dim)
        net, saver = model['net'], model['saver']
        input_ph = model['input_ph']
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            raise Exception("No checkpoint found")

    observation = env.reset()
    prev_x = None
    xs, acts, rs = [], [], []
    done = False

    while not done:
        # TODO: implement deepmind preprocessing with histories
        # Use the raw observation as a row vector; no difference image for
        # this low-dimensional environment.
        cur_x = observation.reshape((1, observation.shape[0]))
        prev_x = cur_x
        x = cur_x

        # Forward the policy network to get action probabilities.
        feed_dict = {input_ph: x}
        aprob = sess.run(net, feed_dict=feed_dict)
        # Sample an action using the probabilities.
        act = np.random.choice(actions.size, p=aprob.flatten())

        # Record the trajectory.
        xs.append(x)      # observation
        acts.append(act)  # action (index)

        # Step the environment and get new measurements.
        observation, reward, done, info = env.step(actions[act])

        # Record the reward. Note: this has to be done after we call step()
        # so that we get the reward for the previous action.
        rs.append(reward)

    return [np.vstack(xs), np.array(acts), np.array(rs)]
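# two_layer_net() is referenced throughout but never defined in this section.
# Below is a minimal sketch of the interface the code above consumes: a
# softmax policy, a REINFORCE loss, and the gradient-buffer apply path used
# by the ps/worker variant. The layer initialization, learning rate and
# optimizer choice here are guesses. Variant 1 runs `optimizer` directly with
# the trajectory placeholders fed, so its version of this function would
# presumably use opt.minimize(loss, global_step=global_step) instead of the
# gradient-buffer placeholders.

def two_layer_net(inp_dim, hid_dim, out_dim):
    '''Build a two-layer softmax policy network (a sketch).'''
    S = tf.placeholder(tf.float32, [None, inp_dim], name="observations")
    A = tf.placeholder(tf.int32, [None], name="actions")
    Adv = tf.placeholder(tf.float32, [None], name="advantages")

    W1 = tf.Variable(tf.truncated_normal([inp_dim, hid_dim], stddev=0.01))
    b1 = tf.Variable(tf.zeros([hid_dim]))
    W2 = tf.Variable(tf.truncated_normal([hid_dim, out_dim], stddev=0.01))
    b2 = tf.Variable(tf.zeros([out_dim]))

    hidden = tf.nn.relu(tf.matmul(S, W1) + b1)
    net = tf.nn.softmax(tf.matmul(hidden, W2) + b2)  # action probabilities

    # REINFORCE loss: -log pi(a|s) * advantage.
    picked = tf.reduce_sum(net * tf.one_hot(A, out_dim), reduction_indices=1)
    loss = -tf.reduce_mean(tf.log(picked) * Adv)

    global_step = tf.Variable(0, name="global_step", trainable=False)
    network_params = tf.trainable_variables()
    gradients = tf.gradients(loss, network_params)

    # Placeholders through which accumulated gradients are applied.
    grads_buffer_ph = [tf.placeholder(tf.float32) for _ in network_params]
    opt = tf.train.RMSPropOptimizer(1e-3)
    optimizer = opt.apply_gradients(zip(grads_buffer_ph, network_params),
                                    global_step=global_step)

    return {
        'input_ph': S, 'actions_ph': A, 'advantage_ph': Adv,
        'net': net, 'loss': loss, 'optimizer': optimizer,
        'gradients': gradients, 'grads_buffer_ph': grads_buffer_ph,
        'network_params': network_params,
        'saver': tf.train.Saver(),
        'init_op': tf.initialize_all_variables(),
        'global_step': global_step,
    }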