Example #1
def main(arg):
    cluster = tf.train.ClusterSpec(
        {"local": ["localhost:2222", "localhost:2223", "localhost:2224"]})
    server = tf.train.Server(cluster,
                             job_name="local",
                             task_index=FLAGS.task_index)

    is_chief = (FLAGS.task_index == 0)

    # Assigns ops to the local worker by default.
    # note: this automatically sets the device on which ops/vars are placed.
    # Ops run on the local worker executing this code while variables live on
    # the parameter server, so every worker holds its own copy of the network
    # graph but the actual weight values are shared through the parameter
    # server.
    with tf.device("/job:local/task:%d" % FLAGS.task_index):
        env = gym.make(FLAGS.env)
        # Pong-specific: only use the UP and DOWN actions
        actions = np.array([0, 1])  # subset of env.action_space
        n_actions = actions.size

        running_reward = tf.Variable(0., name="running_reward")
        tf.scalar_summary("Running Reward", running_reward)
        summary_op = tf.merge_all_summaries()
        model = two_layer_net(4, 20, 2)

        S, A, Adv = (model['input_ph'], model['actions_ph'],
                     model['advantage_ph'])
        net, optimizer, loss = model['net'], model['optimizer'], model['loss']
        saver, init_op = model['saver'], model['init_op']
        global_step = model['global_step']

    # Create a "supervisor", which oversees the training process.
    logdir = "./tmp/train_logs/"
    sv = tf.train.Supervisor(
        is_chief=is_chief,
        logdir=logdir,
        init_op=init_op,
        summary_op=None,  # disable the summary thread; it crashes otherwise
        saver=saver,
        global_step=global_step,
        save_model_secs=5)

    # The supervisor takes care of session initialization, restoring from
    # a checkpoint, and closing when done or an error occurs.
    print(server.target)
    with sv.managed_session(server.target) as sess:
        print("waiting for 5 secs to get a checkpoint saved")
        import time
        time.sleep(5)

        writer = tf.train.SummaryWriter(logdir, graph=sess.graph)
        step = 0  # global training steps
        avg_reward = None  # running average episode reward
        local_step = 0  # local training steps performed
        n_e = 0  # total episodes generated so far
        # Loop until the supervisor shuts down or max steps have completed.
        while not sv.should_stop() and step < FLAGS.num_steps:
            # get data by interacting with env using current policy
            obs, acts, returns = None, None, None
            rwds = 0
            for e in range(FLAGS.num_episodes):
                # get a single episode
                o_n, a_n, r_n = run_episode(env, model, server, logdir,
                                            actions, n_e)
                # get episode discounted return
                disc_r = discount_rewards(r_n, FLAGS.gamma)
                disc_r -= np.mean(disc_r)
                disc_r /= np.std(disc_r)
                # store results
                r = np.sum(r_n)
                rwds += r
                avg_reward = (r if avg_reward is None
                              else .99 * avg_reward + .01 * r)
                obs = o_n if obs is None else np.append(obs, o_n, axis=0)
                acts = a_n if acts is None else np.append(acts, a_n)
                returns = disc_r if returns is None else np.append(
                    returns, disc_r)
                n_e += 1

            # feed trajectories to pg training
            _, step_loss, step, summary = sess.run(
                [optimizer, loss, global_step, summary_op],
                feed_dict={
                    S: obs,
                    A: acts,
                    Adv: returns,
                    running_reward: avg_reward
                })
            writer.add_summary(summary, n_e)
            num = FLAGS.num_episodes
            print('step %d, rew %1.3f, avg reward for %d episodes is %1.3f' %
                  (local_step, avg_reward, num, rwds / num))
            local_step += 1
        # Ask for all the services to stop.
        print('stopped')
        sv.stop()
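
Example #1 (and Example #3 below) calls a discount_rewards helper that is not shown on this page. A minimal sketch of what it might look like, assuming standard REINFORCE-style discounted returns with the usual Pong convention of resetting the running sum at every non-zero reward; the reset rule (and, in Example #3, the extra env_name argument that would toggle it) are assumptions, not code from this project:

import numpy as np

def discount_rewards(r, gamma):
    """Compute discounted returns for a 1-D array of per-step rewards.

    Assumption: the running sum is reset at every non-zero reward, the common
    Pong-specific trick of treating each point as a separate sub-episode.
    """
    discounted = np.zeros_like(r, dtype=np.float64)
    running = 0.0
    for t in reversed(range(r.size)):
        if r[t] != 0:
            running = 0.0  # game boundary (Pong-specific assumption)
        running = running * gamma + r[t]
        discounted[t] = running
    return discounted
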
Example #2
def run_episode(env, model, server, checkpoint_dir, actions, n_e):
    ''' Generate a trajectory by running a single episode.

    Runs a single episode in the gym environment using the latest
    checkpointed policy network and returns the observation, action and
    reward trajectories.

    Args:
        env: gym environment
        model: dict describing the policy network (rebuilt locally from the
            checkpoint, so the passed-in value is not used)
        server: tf.train.Server for the local task (unused)
        checkpoint_dir: directory to restore the latest policy checkpoint from
        actions: action set for env
        n_e: index of the current episode, used for logging

    Returns:
        list: ndarrays containing observations, actions and rewards
            respectively
    '''

    ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
    graph = tf.Graph()
    sess = tf.Session(graph=graph)

    with sess.graph.as_default():
        model = two_layer_net(80 * 80, 200, 2)
        net, saver = model['net'], model['saver']
        input_ph = model['input_ph']
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            raise Exception("No checkpoint found")

    observation = env.reset()
    prev_x = None
    xs, acts, rs = [], [], []
    done = False
    while not done:
        # TODO: implement deepmind preprocessing with histories
        # preprocess the current observation
        cur_x = prepro(observation)
        # set input to network to be difference image
        x = cur_x - prev_x if prev_x is not None else np.zeros((1, cur_x.size))
        prev_x = cur_x
        feed_dict = {input_ph: x}

        # forward the policy network to get action probabilities
        aprob = sess.run(net, feed_dict=feed_dict)
        # sample action using probs
        act = np.random.choice(actions.size, p=aprob.flatten())

        # record trajectory
        xs.append(x)  # observation
        acts.append(act)  # action (index)

        # step the environment and get new measurements
        observation, reward, done, info = env.step(actions[act])
        # record reward
        # note: has to be done after we call step()
        # to get reward for previous action
        rs.append(reward)

        if reward != 0:
            # Pong gives a +1 or -1 reward exactly when a game (point) ends.
            print('ep %d: game finished, reward: %f' % (n_e, reward) +
                  ('' if reward == -1 else ' !!!!!!!!'))
    return [np.vstack(xs), np.array(acts), np.array(rs)]
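
Example #2 also relies on a prepro helper that is not shown here. A plausible sketch, assuming the usual Pong preprocessing (crop, downsample to 80x80, erase the background, binarize) with a flattened 1x6400 output to match two_layer_net(80 * 80, 200, 2); the exact crop bounds and background colour values are assumptions:

import numpy as np

def prepro(obs):
    """Preprocess a 210x160x3 uint8 Pong frame into a flat 1x6400 float vector."""
    img = obs[35:195]             # crop to the playing field (assumed bounds)
    img = img[::2, ::2, 0]        # downsample by 2, keep one colour channel
    img = img.astype(np.float32)  # copy, so the original frame is untouched
    img[img == 144] = 0           # erase background colour 1
    img[img == 109] = 0           # erase background colour 2
    img[img != 0] = 1             # paddles and ball become 1
    return img.reshape((1, 80 * 80))
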
Example #3
def main(arg):
    if not FLAGS.aws:
        ps_hosts = FLAGS.ps_hosts.split(",")
        worker_hosts = FLAGS.worker_hosts.split(",")

        # Create a cluster from the parameter server and worker hosts.
        cluster = tf.train.ClusterSpec({
            "ps": ps_hosts,
            "worker": worker_hosts
        })

        # Create and start a server for the local task.
        job_name = FLAGS.job_name
        task_index = FLAGS.task_index
        server = tf.train.Server(cluster,
                                 job_name=FLAGS.job_name,
                                 task_index=FLAGS.task_index)

    else:
        config = cluster_config(FLAGS)
        cluster = tf.train.ClusterSpec({
            "ps": config['ps_hosts'],
            "worker": config['worker_hosts']
        })
        job_name = config['job']
        task_index = config['task_id']
        server = tf.train.Server(cluster,
                                 job_name=config['job'],
                                 task_index=config['task_id'])

    if job_name == "ps":
        server.join()
    elif job_name == "worker":
        is_chief = (task_index == 0)

        # Assigns ops to the local worker by default.
        # note: this automatically sets the device on which ops/vars are placed.
        # Ops run on the local worker executing this code while variables live
        # on the parameter server, so every worker holds its own copy of the
        # network graph but the actual weight values are shared through the
        # parameter server.
        with tf.device(tf.train.replica_device_setter(cluster=cluster)):
            env, env_name = gym.make(FLAGS.env), FLAGS.env
            check_process = FLAGS.preprocess
            pre = preprocess if check_process else None
            actions = np.array([int(a) for a in FLAGS.actions.split(',')])
            n_actions = actions.size

            running_reward = tf.placeholder(tf.float32, name="running_reward")
            tf.scalar_summary("Running Reward", running_reward)
            summary_op = tf.merge_all_summaries()
            model = two_layer_net(FLAGS.inp_dim, 200, FLAGS.out_dim)

            S, A, Adv = (model['input_ph'], model['actions_ph'],
                         model['advantage_ph'])
            net, optimizer, loss = (model['net'], model['optimizer'],
                                    model['loss'])
            gradients_op, grads_buffer_ph = (model['gradients'],
                                             model['grads_buffer_ph'])
            network_params = model['network_params']
            saver, init_op = model['saver'], model['init_op']
            global_step = model['global_step']

        # Create a "supervisor", which oversees the training process.
        logdir = "./" + env_name + "_train_logs/"
        sv = tf.train.Supervisor(
            is_chief=is_chief,
            logdir=logdir,
            init_op=init_op,
            summary_op=None,  # disable the summary thread; it crashes otherwise
            saver=saver,
            global_step=global_step,
            save_model_secs=200)

        # The supervisor takes care of session initialization, restoring from
        # a checkpoint, and closing when done or an error occurs.
        with sv.managed_session(server.target) as sess:
            print "waiting for 10 secs to get a checkpoint saved"
            import time
            time.sleep(5)

            writer = tf.train.SummaryWriter(logdir,
                                            graph=tf.get_default_graph())
            step = 0  # global training steps
            avg_reward = None  # running average episode reward
            local_step = 0  # local training steps performed
            n_e = 0  # total episodes generated so far
            # gradient accumulation buffer; running network_params only gives
            # correctly shaped arrays, the values are zeroed before each update
            grads_buffer = sess.run(network_params)

            # Loop until the supervisor shuts down or max steps have completed.
            while not sv.should_stop() and step < FLAGS.num_steps:
                # get data by interacting with env using current policy
                obs, acts, returns = None, None, None
                rwds = 0

                # reset the accumulated gradients for this update
                for i, grad in enumerate(grads_buffer):
                    grads_buffer[i] = grad * 0

                for e in range(FLAGS.num_episodes):
                    # get a single episode
                    o_n, a_n, r_n = run_episode(env, model, server, logdir,
                                                actions, preprocess)
                    # get episode discounted return
                    disc_r = discount_rewards(r_n, FLAGS.gamma, env_name)
                    disc_r -= np.mean(disc_r)
                    disc_r /= np.std(disc_r)
                    # store results
                    r = np.sum(r_n)
                    rwds += r
                    avg_reward = (r if avg_reward is None
                                  else .99 * avg_reward + .01 * r)

                    gradients = sess.run(gradients_op,
                                         feed_dict={
                                             S: o_n,
                                             A: a_n,
                                             Adv: disc_r
                                         })
                    for i, grad in enumerate(gradients):
                        grads_buffer[i] += grad

                    n_e += 1

                # feed trajectories to pg training
                feed_dict = {}
                for i, grad in enumerate(grads_buffer_ph):
                    feed_dict[grads_buffer_ph[i]] = grads_buffer[i]
                feed_dict[running_reward] = avg_reward

                _, step, summary = sess.run(
                    [optimizer, global_step, summary_op], feed_dict=feed_dict)
                writer.add_summary(summary, n_e)
                num = FLAGS.num_episodes
                print(
                    'step %d, rew %1.3f, avg reward for %d episodes is %1.3f' %
                    (local_step, avg_reward, num, rwds / num))
                local_step += 1
            # Ask for all the services to stop.
            print('stopped')
            sv.stop()
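
Every example builds its policy through a two_layer_net helper whose definition is not shown on this page. The sketch below is a guess at the gradient-accumulation variant that Examples #3 and #4 assume (it exposes 'gradients', 'grads_buffer_ph' and 'network_params', and its 'optimizer' op applies the fed-back gradient buffer); Example #1 presumably uses a simpler variant whose 'optimizer' minimizes the loss directly. The layer shapes, initializers, optimizer and learning rate are assumptions, written against the same pre-1.0 TensorFlow API as the rest of the code:

import tensorflow as tf


def two_layer_net(n_in, n_hidden, n_out, learning_rate=1e-3):
    """Two-layer softmax policy network with manually fed gradients."""
    # placeholders for observations, chosen action indices and advantages
    input_ph = tf.placeholder(tf.float32, [None, n_in], name="input")
    actions_ph = tf.placeholder(tf.int32, [None], name="actions")
    advantage_ph = tf.placeholder(tf.float32, [None], name="advantage")

    # two fully connected layers with a softmax over the action set
    W1 = tf.Variable(tf.truncated_normal([n_in, n_hidden], stddev=0.1))
    b1 = tf.Variable(tf.zeros([n_hidden]))
    W2 = tf.Variable(tf.truncated_normal([n_hidden, n_out], stddev=0.1))
    b2 = tf.Variable(tf.zeros([n_out]))
    hidden = tf.nn.relu(tf.matmul(input_ph, W1) + b1)
    net = tf.nn.softmax(tf.matmul(hidden, W2) + b2)

    # policy-gradient loss: -log pi(a|s) * advantage
    action_mask = tf.one_hot(actions_ph, n_out)
    picked = tf.reduce_sum(net * action_mask, reduction_indices=1)
    loss = -tf.reduce_sum(tf.log(picked + 1e-10) * advantage_ph)

    # gradients w.r.t. the parameters, plus one placeholder per parameter so
    # that gradients accumulated over several episodes can be fed back in
    network_params = [W1, b1, W2, b2]
    gradients = tf.gradients(loss, network_params)
    grads_buffer_ph = [
        tf.placeholder(tf.float32, shape=p.get_shape()) for p in network_params
    ]

    global_step = tf.Variable(0, name="global_step", trainable=False)
    opt = tf.train.RMSPropOptimizer(learning_rate)
    optimizer = opt.apply_gradients(
        list(zip(grads_buffer_ph, network_params)), global_step=global_step)

    return {
        'input_ph': input_ph,
        'actions_ph': actions_ph,
        'advantage_ph': advantage_ph,
        'net': net,
        'loss': loss,
        'gradients': gradients,
        'grads_buffer_ph': grads_buffer_ph,
        'network_params': network_params,
        'optimizer': optimizer,
        'global_step': global_step,
        'saver': tf.train.Saver(),
        'init_op': tf.initialize_all_variables(),
    }
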
Example #4
def run_episode(env, model, server, checkpoint_dir, actions, n_e):
    ''' Generate a trajectory by running a single episode.

    Runs a single episode in the gym environment using the latest
    checkpointed policy network and returns the observation, action and
    reward trajectories.

    Args:
        env: gym environment
        model: dict describing the policy network (rebuilt locally from the
            checkpoint, so the passed-in value is not used)
        server: tf.train.Server for the local task (unused)
        checkpoint_dir: directory to restore the latest policy checkpoint from
        actions: action set for env
        n_e: index of the current episode

    Returns:
        list: ndarrays containing observations, actions and rewards
            respectively
    '''

    ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
    graph = tf.Graph()
    sess = tf.Session(graph=graph)

    with sess.graph.as_default():
        model = two_layer_net(4, 20, 2)
        net, saver = model['net'], model['saver']
        input_ph = model['input_ph']
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            raise Exception("No checkpoint found")

    # print "BATMAN Checking variables for %d" % ((not is_chief)+1)
    # print sess.run(tf.trainable_variables()[0])
    # print "BATMAN Done checking"

    observation = env.reset()
    prev_x = None
    xs, acts, rs = [], [], []
    done = False
    while not done:
        # TODO: implement deepmind preprocessing with histories
        # preprocess the current observation
        cur_x = observation.reshape((1, observation.shape[0]))
        # difference-image input is disabled here; the raw observation is fed
        # x = cur_x - prev_x if prev_x is not None else np.zeros((1, cur_x.size))
        prev_x = cur_x
        x = cur_x
        feed_dict = {input_ph: x}

        # forward the policy network to get action probs
        aprob = sess.run(net, feed_dict=feed_dict)
        # sample action using probs
        act = np.random.choice(actions.size, p=aprob.flatten())

        # record trajectory
        xs.append(x)  # observation
        acts.append(act)  # action (index)

        # step the environment and get new measurements
        observation, reward, done, info = env.step(actions[act])
        # record reward
        # note: has to be done after we call step()
        # to get reward for previous action
        rs.append(reward)

    return [np.vstack(xs), np.array(acts), np.array(rs)]
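
All four examples read their configuration from a module-level FLAGS object. A sketch of the flag definitions they imply, using the pre-1.0 tf.app.flags helpers the rest of the code targets; the default values (and the Pong-oriented action ids) are assumptions:

import tensorflow as tf

flags = tf.app.flags

# cluster layout (Example #3)
flags.DEFINE_string("ps_hosts", "localhost:2222", "comma-separated ps hosts")
flags.DEFINE_string("worker_hosts", "localhost:2223,localhost:2224",
                    "comma-separated worker hosts")
flags.DEFINE_string("job_name", "worker", "'ps' or 'worker'")
flags.DEFINE_integer("task_index", 0, "index of the task within its job")
flags.DEFINE_boolean("aws", False, "read the cluster layout from cluster_config")

# environment and policy network
flags.DEFINE_string("env", "Pong-v0", "gym environment id")
flags.DEFINE_string("actions", "2,3", "comma-separated action ids to use")
flags.DEFINE_boolean("preprocess", True, "preprocess raw observations")
flags.DEFINE_integer("inp_dim", 80 * 80, "policy network input size")
flags.DEFINE_integer("out_dim", 2, "policy network output size")

# training loop
flags.DEFINE_integer("num_steps", 100000, "maximum global training steps")
flags.DEFINE_integer("num_episodes", 10, "episodes per gradient update")
flags.DEFINE_float("gamma", 0.99, "reward discount factor")

FLAGS = flags.FLAGS

if __name__ == "__main__":
    tf.app.run()  # parses the flags and calls main(argv)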