def new_env(args):
    config = open(args.config) if args.config != "" else None
    env = create_env(args.env_id, str(args.task), args.remotes, config=config)
    return env
Example #2
def show(shared_model, global_steps, args):
    setproctitle('{}:show'.format(args.name))

    try:
        env = create_env(args.game_type, args.env_name, 'show', 1)
        model = copy.deepcopy(shared_model)
        gpu_id = args.gpu_ids[-2]
        with torch.cuda.device(gpu_id):
            model = model.cuda() if gpu_id >= 0 else model
        model.eval()

        while True:
            # Sync with the shared model
            with torch.cuda.device(gpu_id):
                model.load_state_dict(shared_model.state_dict())
            model.reset()
            play_game(env,
                      model,
                      args.max_episode_length,
                      render=True,
                      gpu_id=gpu_id)

            if global_steps.value >= args.max_global_steps:
                break
    except KeyboardInterrupt:
        raise
    finally:
        print('Player Finished !!!')
Example #3
def new_env(args):
    config = args.config
    if isinstance(args.config, str):
        config = open(args.config) if args.config != "" else None
        config = bs4.BeautifulSoup(config, "lxml")

    env = create_env(args.env_id, str(args.task), args.remotes, config=config)
    return env
Example #4
def run(args):
    env = create_env(args.env_id,
                     client_id=str(args.task),
                     remotes=args.remotes)
    trainer = A3C(env, args.task, args.visualise, args.num_workers,
                  args.worker_id, args.verbose_lvl)

    # Variable names that start with "local" are not saved in checkpoints.
    if use_tf12_api:
        variables_to_save = [
            v for v in tf.global_variables() if not v.name.startswith("local")
        ]
        init_op = tf.variables_initializer(variables_to_save)
        init_all_op = tf.global_variables_initializer()
    else:
        variables_to_save = [
            v for v in tf.all_variables() if not v.name.startswith("local")
        ]
        init_op = tf.initialize_variables(variables_to_save)
        init_all_op = tf.initialize_all_variables()
    print(variables_to_save)
    saver = FastSaver(variables_to_save)

    var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                 tf.get_variable_scope().name)
    logger.info('Trainable vars:')
    for v in var_list:
        logger.info('  %s %s', v.name, v.get_shape())

    def init_fn(ses):
        logger.info("Initializing all parameters.")
        ses.run(init_all_op)

    logdir = os.path.join(args.log_dir, 'train')

    if use_tf12_api:
        summary_writer = tf.summary.FileWriter(logdir + "_%d" % args.task)
    else:
        summary_writer = tf.train.SummaryWriter(logdir + "_%d" % args.task)

    logger.info("Events directory: %s_%s", logdir, args.task)

    logger.info(
        "Starting session. If this hangs, we're most likely waiting to connect to the parameter server. "
        "One common cause is that the parameter server DNS name isn't resolving yet, or is misspecified.")
    with tf.Session() as sess:
        init = tf.global_variables_initializer()
        sess.run(init)

        trainer.start_listen_thread()
        trainer.sync_initial_weights(sess, var_list)
        trainer.start(sess, summary_writer)
        while True:
            trainer.process(sess)
def run(args):
    env = create_env(args.env_id)
    trainer = A3C(env, None, args.visualise, args.intrinsic_type, args.bptt)

    # Variable names that start with "local" are not saved in checkpoints.
    variables_to_save = [v for v in tf.global_variables() if not v.name.startswith("local")]
    init_op = tf.variables_initializer(variables_to_save)
    init_all_op = tf.global_variables_initializer()
    saver = FastSaver(variables_to_save)

    var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)
    logger.info('Trainable vars:')
    for v in var_list:
        logger.info('  %s %s', v.name, v.get_shape())

    def init_fn(ses):
        logger.info("Initializing all parameters.")
        ses.run(init_all_op)

    logdir = os.path.join(args.log_dir, 'train')
    summary_writer = tf.summary.FileWriter(logdir)
    logger.info("Events directory: %s", logdir)

    sv = tf.train.Supervisor(is_chief=True,
                             logdir=logdir,
                             saver=saver,
                             summary_op=None,
                             init_op=init_op,
                             init_fn=init_fn,
                             summary_writer=summary_writer,
                             ready_op=tf.report_uninitialized_variables(variables_to_save),
                             global_step=None,
                             save_model_secs=0,
                             save_summaries_secs=0)

    video_dir = os.path.join(args.log_dir, 'test_videos_' + args.intrinsic_type)
    if not os.path.exists(video_dir):
        os.makedirs(video_dir)
    video_filename = video_dir + "/%s_%02d_%d.gif"
    print("Video saved at %s" % video_dir)

    with sv.managed_session() as sess, sess.as_default():
        trainer.start(sess, summary_writer)
        rewards = []
        lengths = []
        for i in range(10):
            frames, reward, length = trainer.evaluate(sess)
            rewards.append(reward)
            lengths.append(length)
            imageio.mimsave(video_filename % (args.env_id, i, reward), frames, fps=30)

        print('Evaluation: avg. reward %.2f    avg. length %.2f' %
              (sum(rewards) / 10.0, sum(lengths) / 10.0))

    # Ask for all the services to stop.
    sv.stop()
Example #6
def main():
    args = parse_args()
    env = envs.create_env(args.domain, args.task, args.verbose)
    agent = agents.create_agent(args.model, env, args.verbose)

    if args.train:
        agent.train(env, args.save_model, args.verbose, args.display,
                    args.save_training_curve)
    elif args.eval:
        agent.eval(env, args.verbose, args.display)
def run(args, server):
    env = create_env(args.env_id, client_id=str(args.task), remotes=args.remotes)
    trainer = A3C(env, args.task, args.visualise)

    # Variable names that start with "local" are not saved in checkpoints.
    variables_to_save = [v for v in tf.global_variables() if not v.name.startswith("local")]
    init_op = tf.variables_initializer(variables_to_save)
    init_all_op = tf.global_variables_initializer()
    
    saver = FastSaver(variables_to_save)

    var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)
    logger.info('Trainable vars:')
    for v in var_list:
        logger.info('  %s %s', v.name, v.get_shape())

    def init_fn(ses):
        logger.info("Initializing all parameters.")
        ses.run(init_all_op)

    config = tf.ConfigProto(device_filters=["/job:ps", "/job:worker/task:{}/cpu:0".format(args.task)])
    logdir = os.path.join(args.log_dir, 'train')

    summary_writer = tf.summary.FileWriter(logdir + "_%d" % args.task)

    logger.info("Events directory: %s_%s", logdir, args.task)
    sv = tf.train.Supervisor(is_chief=(args.task == 0),
                             logdir=logdir,
                             saver=saver,
                             summary_op=None,
                             init_op=init_op,
                             init_fn=init_fn,
                             summary_writer=summary_writer,
                             ready_op=tf.report_uninitialized_variables(variables_to_save),
                             global_step=trainer.global_step,
                             save_model_secs=30,
                             save_summaries_secs=30)

    num_global_steps = 100000000

    logger.info(
        "Starting session. If this hangs, we're most likely waiting to connect to the parameter server. " +
        "One common cause is that the parameter server DNS name isn't resolving yet, or is misspecified.")
    with sv.managed_session(server.target, config=config) as sess, sess.as_default():
        sess.run(trainer.sync)
        trainer.start(sess, summary_writer)
        global_step = sess.run(trainer.global_step)
        logger.info("Starting training at step=%d", global_step)
        while not sv.should_stop() and (not num_global_steps or global_step < num_global_steps):
            trainer.process(sess)
            global_step = sess.run(trainer.global_step)

    # Ask for all the services to stop.
    sv.stop()
    logger.info('reached %s steps. worker stopped.', global_step)
def new_env(args):
    config = open(args.config) if args.config != "" else None
    env = create_env(args.env_id, str(args.task), args.remotes, config=config)
    if args.env_id == 'maze':
        return env

    path = os.getcwd() + '/tmp/vpn_record_1'
    if os.path.exists(path):
        import shutil
        print("removing old directory " + path)
        shutil.rmtree(path)  # clear the stale recording so wrappers.Monitor can write to a fresh directory
    env = wrappers.Monitor(env, path)
    return env
Example #9
    def __init__(self, env_name, actor_id, logdir="results/", start=True):
        env = create_env(env_name)
        self.id = actor_id
        num_actions = env.action_space.n
        self.policy = LSTMPolicy(env.observation_space.shape, num_actions,
                                 actor_id)
        self.runner = RunnerThread(env, self.policy, 20)
        self.env = env
        self.logdir = logdir
        if start:
            self.start()
Example #10
def run(args, server):
    env = create_env(args.env_id, client_id=str(args.task), remotes=args.remotes, num_trials=args.num_trials)

    trainer = A3C(env, args.task, args.visualise, args.meta, args.remotes, args.num_trials)

    # logs, checkpoints, and tensorboard

    # (Original Comment) Variable names that start with "local" are not saved in checkpoints.
    if use_tf12_api:
        variables_to_save = [v for v in tf.global_variables() if not v.name.startswith("local")]
        init_op = tf.variables_initializer(variables_to_save)
        init_all_op = tf.global_variables_initializer()
    else:
        variables_to_save = [v for v in tf.all_variables() if not v.name.startswith("local")]
        init_op = tf.initialize_variables(variables_to_save)
        init_all_op = tf.initialize_all_variables()
    saver = FastSaver(variables_to_save)

    var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)
    logger.info('Trainable vars:')
    for v in var_list:
        logger.info('  %s %s', v.name, v.get_shape())

    def init_fn(ses):
        logger.info("Initializing all parameters.")
        ses.run(init_all_op)

    config = tf.ConfigProto(device_filters=["/job:ps", "/job:worker/task:{}/cpu:0".format(args.task)])
    logdir = os.path.join(args.log_dir, 'train')

    if use_tf12_api:
        summary_writer = tf.summary.FileWriter(logdir + "_%d" % args.task)
    else:
        summary_writer = tf.train.SummaryWriter(logdir + "_%d" % args.task)

    logger.info("Events directory: %s_%s", logdir, args.task)
    # The tf.train.Supervisor provides a set of services that help implement a robust training process.
    sv = tf.train.Supervisor(is_chief=(args.task == 0),
                             logdir=logdir,
                             saver=saver,
                             summary_op=None,
                             init_op=init_op,
                             init_fn=init_fn,
                             summary_writer=summary_writer,
                             ready_op=tf.report_uninitialized_variables(variables_to_save),
                             global_step=trainer.global_step,
                             save_model_secs=30,
                             save_summaries_secs=30)

    if args.test: # testing phase
        run_test(trainer, sv, config, summary_writer, server)
    else: # training phase
        run_train(trainer, sv, config, summary_writer, server)
Example #11
def main():
    env = envs.create_env(None)
    act = deepq.load("{}_model.pkl".format(envs.VSTR))

    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            env.render()
            obs, rew, done, _ = env.step(act([obs])[0])
            episode_rew += rew
        print("Episode reward", episode_rew)
Example #12
def play(args, server):
    env = create_env(args.env_id,
                     client_id=str(args.task),
                     remotes=args.remotes)
    trainer = A3C(env, args.task, args.visualise)
    result = []
    """
    Implement your code here.
    Conditions:
        This function is used for testing.
        The number of episodes is 20.
        Return the mean reward over the 20 episodes.
    (A sketch of one possible implementation follows this example.)
    """

    return np.mean(result)
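A minimal sketch of the elided loop (an assumption, not the original solution): it relies only on the standard Gym interface of env and uses a random action as a stand-in for the trained A3C policy.

    # hypothetical body for the part marked "implement your code here"
    for _ in range(20):
        obs, done, episode_reward = env.reset(), False, 0.0
        while not done:
            action = env.action_space.sample()  # stand-in for the A3C policy's action
            obs, reward, done, _ = env.step(action)
            episode_reward += reward
        result.append(episode_reward)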
Example #13
def main(env, snapshot, visualise):
    env = create_env(env, client_id=0, remotes=1)
    with tf.variable_scope("global"):
        policy = LSTMPolicy(env.observation_space.shape, env.action_space.n)

    last_state = env.reset()
    # state = last_state
    last_features = policy.get_initial_features()
    length = 0
    rewards = 0
    variables_to_save = [
        v for v in tf.global_variables() if not v.name.startswith("local")
    ]
    saver = tf.train.Saver(variables_to_save)
    with tf.Session() as sess:
        # Restore variables from disk.
        # saver.restore(sess, "train/model.ckpt-361814.data-00000-of-00001")
        # saver.restore(sess, "train/model.ckpt-361814")
        # saver.restore(sess, "/tmp/neonrace/train/model.ckpt-361714")
        saver.restore(sess, snapshot)
        while True:
            terminal_end = False

            fetched = policy.act(last_state, *last_features)
            action, value_, features = fetched[0], fetched[1], fetched[2:]
            # state, reward, terminal, info = env.step(action.argmax())
            action_n = action.argmax()

            # state, reward, terminal, info = env.step(default_action)
            state, reward, terminal, info = env.step(action_n)
            if visualise:
                env.render()
            # env.render() # I need to visualize it during testing
            print('length: %d, rewards: %f' % (length, rewards))

            length += 1
            rewards += reward

            last_state = state
            last_features = features

            if terminal:
                terminal_end = True
                print("Episode finished. Sum of rewards: %d. Length: %d" %
                      (rewards, length))
                length = 0
                rewards = 0
                break
Example #14
def main():
    env = envs.create_env(None)
    model = models.mlp([64])
    act = simple.learn(
        env,
        q_func=model,
        lr=1e-3,
        max_timesteps=100000,
        buffer_size=50000,
        exploration_fraction=0.01,
        exploration_final_eps=0.0,
        print_freq=10,
        callback=callback,
        prioritized_replay=True
    )
    print("Saving model to {}_model.pkl".format(envs.VSTR))
    act.save("{}_model.pkl".format(envs.VSTR))
Example #15
def train(num_workers, env_name="PongDeterministic-v3"):
    env = create_env(env_name, None, None)
    policy = LSTMPolicy(env.observation_space.shape, env.action_space.n, 0)
    agents = [Runner(env_name, i) for i in range(num_workers)]
    parameters = policy.get_weights()
    gradient_list = [agent.compute_gradient(parameters) for agent in agents]
    steps = 0
    obs = 0
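    # ray.wait splits the pending gradient tasks into (ready, remaining); apply the finished
    # gradient, then queue a fresh rollout on the worker that produced it (info["id"])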
    while True:
        done_id, gradient_list = ray.wait(gradient_list)
        gradient, info = ray.get(done_id)[0]
        policy.model_update(gradient)
        parameters = policy.get_weights()
        steps += 1
        obs += info["size"]
        gradient_list.extend([agents[info["id"]].compute_gradient(parameters)])
    return policy
Example #16
def train(num_workers, env_name="PongDeterministic-v3"):
    env = create_env(env_name)
    ps = ParameterServer(env)
    parameters = ps.get_weights()
    agents = [Runner.remote(env_name, i) for i in range(num_workers)]
    delta_list = [agent.get_delta.remote(parameters) for agent in agents]
    steps = 0
    obs = 0
    timing = []
    for i in range(2000):
        done_id, delta_list = ray.wait(delta_list)
        delta, info = ray.get(done_id)[0]
        ps.add_delta(delta)
        parameters = ps.weights
        obs += info["size"]
        delta_list.extend(
            [agents[info["id"]].get_delta.remote(parameters)])
    return ps  # no separate policy object here; the parameter server holds the trained weights
Example #17
def evaluate(constants):
    env, action_space = create_env(constants.env)
    agent = A3CAgent(action_space)
    worker = Worker(env.proxy, agent, GLOBAL_SCOPE, constants)
    worker.build_rollout()

    sess = tf.Session()

    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())

    # TODO: Model loading for A2C.
    if FLAGS.load_model:
        vars_to_save_load = tf.trainable_variables(GLOBAL_SCOPE)
        saver = tf.train.Saver(vars_to_save_load)
        worker.load_model(sess, saver, constants.model_directory)
    worker.evaluate(sess)
    env.close()
    sess.close()
Example #18
def async_train(args, make_model, train):
    setproctitle('{}:main'.format(args.name))

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    env = create_env(args.game_type, args.env_name, 'main', 1)
    shared_model = make_model(env.observation_space.shape[0],
                              env.action_space.n)
    shared_model.share_memory()

    if args.no_shared_optimizer:
        optimizer = None
    else:
        optimizer = SharedAdam(shared_model.parameters(), lr=args.lr)
        optimizer.share_memory()

    mp.set_start_method('spawn')
    global_steps = mp.Value('L', 0)

    processes = []
    processes.append(
        mp.Process(target=test, args=(shared_model, global_steps, args)))
    if not args.no_render:
        processes.append(
            mp.Process(target=show, args=(shared_model, global_steps, args)))

    for rank in range(args.n_processes):
        processes.append(
            mp.Process(target=train,
                       args=(shared_model, optimizer, rank, global_steps,
                             args)))

    for p in processes:
        p.start()
        time.sleep(0.1)

    for p in processes:
        p.join()

    print('Main process finished !!!')
Example #19
def test_penalty_env(env):
    import envs
    env = envs.create_env("Pong",
                          location="bottom",
                          catastrophe_type="1",
                          classifier_file=save_classifier_path +
                          '/0/final.ckpt')

    import matplotlib.pyplot as plt

    observation = env.reset()

    for _ in range(20):
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        plt.imshow(observation[:, :, 0])
        plt.show()
        print('Cat: ', info['frame/is_catastrophe'])
        print('reward: ', reward)
        if done:
            break
Example #20
def main():
    args = TrainOptions().parse()
    device = torch.device('cuda') if (not args.no_cuda and torch.cuda.is_available()) else torch.device('cpu')
    if not os.path.exists(args.output):
        os.makedirs(args.output)

    env = create_env(args)

    network = create_network(args, env.action_space.n, env.observation_space.shape)
    network.to(device)
    optimizer = Adam(network.parameters(), lr=args.lr)

    policy = AnnealedEpsilonGreedyPolicy(epsilon_max=args.epsilon_max,
                                         epsilon_min=args.epsilon_min, exploration_steps=args.exp_steps)
    memory = SimpleExperienceReplay(max_size=args.mem_max, batch_size=args.batch_size)
    logger = Logger()

    agent = create_agent(args, env, network, policy, memory, optimizer, logger)

    # train agent
    agent.learn(n_episodes=args.n_ep, ep_max_step=args.ep_max_step, replay_start_size=args.replay_start,
                save_every=args.freq_save_model, update_target_every=args.freq_target_update, render_every=args.freq_render)
Example #21
def main(args):
    env_id = args.env_id
    max_episodes = args.max_episodes
    ckpt_dir = args.ckpt_dir
    output_dir = args.output_dir

    # env
    env = create_env(env_id, 0, 1)
    if len(output_dir) > 0:
        env = wrappers.Monitor(env, output_dir)
    if args.render:
        env.render()

    # work-around to the nasty env.render() failing issue when working with tensorflow
    # see https://github.com/openai/gym/issues/418
    import tensorflow as tf
    from model import Convx2LSTMActorCritic

    # model
    sess = tf.Session()
    with tf.variable_scope("global"):
        network = Convx2LSTMActorCritic(env.observation_space.shape, env.action_space.n)
    init = tf.global_variables_initializer()
    sess.run(init)

    # load model parameters
    checkpoint = tf.train.get_checkpoint_state(ckpt_dir)
    if checkpoint and checkpoint.model_checkpoint_path:
        saver = tf.train.Saver()
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        raise Exception('cannot find checkpoint path')

    # run evaluating
    with sess.as_default():
        evaluate_loop(env, network, max_episodes, args)
Example #22
def evaluate_main(env_id,
                  model_id,
                  max_episodes,
                  ckpt_dir,
                  output_dir,
                  sleep_time,
                  render,
                  verbose,
                  with_global_step=False):
    # env
    env = create_env(env_id, 0, 1)
    if len(output_dir) > 0:  # output recording
        env = wrappers.Monitor(env, output_dir)
    if render:
        env.render()

    is_obs_tuple = isinstance(env.observation_space, Tuple)
    observation_shape = [sp.shape for sp in env.observation_space.spaces] if is_obs_tuple \
        else [env.observation_space.shape]
    action_shape = [env.action_space.n] if isinstance(env.action_space, spaces.Discrete) \
        else [env.action_space.shape[0]]

    # work-around to the nasty env.render() failing issue when working with tensorflow
    # see https://github.com/openai/gym/issues/418
    import tensorflow as tf
    from model import create_model
    use_tf_0_12_api = distutils.version.LooseVersion(tf.VERSION) >= distutils.version.LooseVersion('0.12.0') and \
                    distutils.version.LooseVersion(tf.VERSION) <= distutils.version.LooseVersion('0.12.1')
    use_tf_1_1_api = distutils.version.LooseVersion(
        tf.VERSION) == distutils.version.LooseVersion('1.1.0')

    # model
    tf.reset_default_graph()
    sess = tf.Session()
    with tf.variable_scope("global"):
        network = create_model(model_id, *observation_shape + action_shape)
        if (with_global_step):
            global_step = tf.get_variable("global_step", [],
                                          tf.int32,
                                          initializer=tf.constant_initializer(
                                              0, dtype=tf.int32),
                                          trainable=False)

    init = tf.global_variables_initializer()
    sess.run(init)

    # load model parameters
    checkpoint = tf.train.get_checkpoint_state(ckpt_dir)
    if checkpoint and checkpoint.model_checkpoint_path:

        restore_tf_0_12_model = False
        restore_tf_1_1_model = False

        reader = tf.train.NewCheckpointReader(checkpoint.model_checkpoint_path)
        for var_name in reader.get_variable_to_shape_map():
            if 'RNN/BasicLSTMCell/Linear' in var_name:
                restore_tf_0_12_model = True
                break
            elif 'rnn/basic_lstm_cell/' in var_name:
                restore_tf_1_1_model = True
                break

        # remap LSTM variable names when the checkpoint was written with a different TF version
        if use_tf_1_1_api and restore_tf_0_12_model:
            var_dict = {}
            for var in tf.global_variables():
                name = var.name.split(':')[0]
                if 'rnn/basic_lstm_cell/weights' in name:
                    name = name.replace('rnn/basic_lstm_cell/weights',
                                        'RNN/BasicLSTMCell/Linear/Matrix')
                elif 'rnn/basic_lstm_cell/biases' in name:
                    name = name.replace('rnn/basic_lstm_cell/biases',
                                        'RNN/BasicLSTMCell/Linear/Bias')
                var_dict[name] = var
            saver = tf.train.Saver(var_dict)
        elif use_tf_0_12_api and restore_tf_1_1_model:
            var_dict = {}
            for var in tf.global_variables():
                name = var.name.split(':')[0]
                if 'RNN/BasicLSTMCell/Linear/Matrix' in name:
                    name = name.replace('RNN/BasicLSTMCell/Linear/Matrix',
                                        'rnn/basic_lstm_cell/weights')
                elif 'RNN/BasicLSTMCell/Linear/Bias' in name:
                    name = name.replace('RNN/BasicLSTMCell/Linear/Bias',
                                        'rnn/basic_lstm_cell/biases')
                var_dict[name] = var
            saver = tf.train.Saver(var_dict)
        else:
            saver = tf.train.Saver()

        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        raise Exception('cannot find checkpoint path')

    # run evaluating
    with sess.as_default():
        ret = evaluate_loop(env, network, max_episodes, sleep_time, render,
                            verbose)
        env.close()
        if (with_global_step):
            global_step_result = sess.run(global_step)
    sess.close()

    if (with_global_step):
        return ret, global_step_result
    else:
        return ret
Example #23
def generate_data(rank, args, start, end):

    from envs import create_env, set_seed, get_obs
    from model import R_Module
    import torch

    print(rank, "started")

    env = create_env(args.env_name, framework=args.framework, args=args)
    env = set_seed(args.seed + rank, env, args.framework)
    state = get_obs(env, args.framework)

    if args.from_policy is not None:
        model_state, r_args = torch.load(args.from_policy)
        policy = R_Module(env.action_space.shape[0],
                          r_args.dim,
                          discrete=r_args.discrete,
                          baseline=r_args.baseline,
                          state_space=env.observation_space.shape[0])
        policy.load_state_dict(model_state)
        policy.eval()

    states = []
    actions = []
    i = start

    done = False

    while i < end:
        if i % 100 == 0:
            print(rank, i)
        ep_states = []
        ep_actions = []
        if args.from_policy is not None:
            cx_p = Variable(torch.zeros(1, r_args.dim))
            hx_p = Variable(torch.zeros(1, r_args.dim))
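        # roll out args.rollout steps, recording the pre-step observation and the chosen action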
        for j in range(args.rollout):
            if args.from_policy is not None:
                value, logit, (hx_p, cx_p) = policy(state.unsqueeze(0),
                                                    (hx_p, cx_p))
                a, _, _ = get_action(logit, r_args.discrete)
            else:
                a = env.action_space.sample()
            ep_actions.append(a)

            state = get_obs(env, args.framework)
            env.step(a)

            if args.render:
                env.render()

            ep_states.append(state)

        final_state = get_obs(env, args.framework)
        ep_states.append(final_state)
        states.append(ep_states)
        actions.append(ep_actions)
        i += 1

        # reset the environment here
        if done or args.reset:
            env.reset()
            done = False

    torch.save((states, actions),
               os.path.join(args.out_dir,
                            'states_actions_%s_%s.pt' % (start, end)))
Example #24
if __name__ == '__main__':
    import torch.multiprocessing as mp
    mp.set_start_method('spawn')

    from torch.autograd import Variable
    from envs import create_env, set_seed, get_obs
    from model import R_Module
    os.environ['OMP_NUM_THREADS'] = '1'

    args = parser.parse_args()
    env_name = args.env_name
    env_name += '_rollout%s' % args.rollout
    if args.env_name.endswith('MazeEnv'):
        env_name += 'mazeid%slength%s' % (args.maze_id, args.maze_length)
        if args.single_env and args.maze_id == -1:
            env = create_env(args.env_name,
                             framework=args.framework,
                             args=args)
            env_name += '_single_env'
            args.maze_structure = env._env.MAZE_STRUCTURE
        if args.random_start:
            env_name += '_randomstart'
    if args.file_path is not None:
        env_name += '_transfer'
    if args.framework == 'mazebase':
        env_name += '_rollout_%s_length_%s' % (args.rollout, args.maze_length)
    args.out_dir = os.path.join(args.out, env_name)
    print(args)
    print(args.out_dir)
    os.makedirs(args.out_dir, exist_ok=True)
    processes = []
    block = int(args.N / args.num_processes)
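The snippet ends just before the worker processes are launched. A plausible continuation (an assumption, not the original script) reuses the generate_data worker from Example #23 and splits the args.N rollouts across args.num_processes processes:

    # hypothetical continuation: fan the N rollouts out over num_processes workers
    for rank in range(args.num_processes):
        start = rank * block
        end = args.N if rank == args.num_processes - 1 else (rank + 1) * block
        p = mp.Process(target=generate_data, args=(rank, args, start, end))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()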
Example #25
def run(args, server):
    env = create_env(args.env_id, client_id=str(args.task), remotes=args.remotes, envWrap=args.envWrap, designHead=args.designHead,
                        noLifeReward=args.noLifeReward)
    trainer = A3C(env, args.task, args.visualise, args.unsup, args.envWrap, args.designHead, args.noReward)

    # logging
    if args.task == 0:
        with open(args.log_dir + '/log.txt', 'w') as fid:
            for key, val in constants.items():
                fid.write('%s: %s\n'%(str(key), str(val)))
            fid.write('designHead: %s\n'%args.designHead)
            fid.write('input observation: %s\n'%str(env.observation_space.shape))
            fid.write('env name: %s\n'%str(env.spec.id))
            fid.write('unsup method type: %s\n'%str(args.unsup))

    # Variable names that start with "local" are not saved in checkpoints.
    if use_tf12_api:
        variables_to_save = [v for v in tf.global_variables() if not v.name.startswith("local")]
        init_op = tf.variables_initializer(variables_to_save)
        init_all_op = tf.global_variables_initializer()
    else:
        variables_to_save = [v for v in tf.all_variables() if not v.name.startswith("local")]
        init_op = tf.initialize_variables(variables_to_save)
        init_all_op = tf.initialize_all_variables()
    saver = FastSaver(variables_to_save)
    if args.pretrain is not None:
        variables_to_restore = [v for v in tf.trainable_variables() if not v.name.startswith("local")]
        pretrain_saver = FastSaver(variables_to_restore)

    var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)
    logger.info('Trainable vars:')
    for v in var_list:
        logger.info('  %s %s', v.name, v.get_shape())

    def init_fn(ses):
        logger.info("Initializing all parameters.")
        ses.run(init_all_op)
        if args.pretrain is not None:
            pretrain = tf.train.latest_checkpoint(args.pretrain)
            logger.info("==> Restoring from given pretrained checkpoint.")
            logger.info("    Pretraining address: %s", pretrain)
            pretrain_saver.restore(ses, pretrain)
            logger.info("==> Done restoring model! Restored %d variables.", len(variables_to_restore))

    config = tf.ConfigProto(device_filters=["/job:ps", "/job:worker/task:{}/cpu:0".format(args.task)])
    logdir = os.path.join(args.log_dir, 'train')

    if use_tf12_api:
        summary_writer = tf.summary.FileWriter(logdir + "_%d" % args.task)
    else:
        summary_writer = tf.train.SummaryWriter(logdir + "_%d" % args.task)

    logger.info("Events directory: %s_%s", logdir, args.task)
    sv = tf.train.Supervisor(is_chief=(args.task == 0),
                             logdir=logdir,
                             saver=saver,
                             summary_op=None,
                             init_op=init_op,
                             init_fn=init_fn,
                             summary_writer=summary_writer,
                             ready_op=tf.report_uninitialized_variables(variables_to_save),
                             global_step=trainer.global_step,
                             save_model_secs=30,
                             save_summaries_secs=30)

    num_global_steps = constants['MAX_GLOBAL_STEPS']

    logger.info(
        "Starting session. If this hangs, we're most likely waiting to connect to the parameter server. " +
        "One common cause is that the parameter server DNS name isn't resolving yet, or is misspecified.")
    with sv.managed_session(server.target, config=config) as sess, sess.as_default():
        # Workaround for FailedPreconditionError
        # see: https://github.com/openai/universe-starter-agent/issues/44 and 31
        sess.run(trainer.sync)

        trainer.start(sess, summary_writer)
        global_step = sess.run(trainer.global_step)
        logger.info("Starting training at global_step=%d", global_step)
        while not sv.should_stop() and (not num_global_steps or global_step < num_global_steps):
            trainer.process(sess)
            global_step = sess.run(trainer.global_step)

    # Ask for all the services to stop.
    sv.stop()
    logger.info('reached %s steps. worker stopped.', global_step)
def inference(args):
    """
    It only restores the LSTMPolicy architecture and runs inference with it.
    """
    # get address of checkpoints
    indir = os.path.join(args.log_dir, 'train')
    outdir = os.path.join(
        args.log_dir, 'inference') if args.out_dir is None else args.out_dir
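    # parse the newest checkpoint's step number from the first line of TF's 'checkpoint' file,
    # e.g. model_checkpoint_path: "model.ckpt-<step>", and rebuild it as <indir>/model.ckpt-<step>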
    with open(indir + '/checkpoint', 'r') as f:
        first_line = f.readline().strip()
    ckpt = first_line.split(' ')[-1].split('/')[-1][:-1]
    ckpt = ckpt.split('-')[-1]
    ckpt = indir + '/model.ckpt-' + ckpt

    # define environment
    if args.record:
        env = create_env(args.env_id,
                         client_id='0',
                         remotes=None,
                         envWrap=args.envWrap,
                         designHead=args.designHead,
                         record=True,
                         noop=args.noop,
                         acRepeat=args.acRepeat,
                         outdir=outdir)
    else:
        env = create_env(args.env_id,
                         client_id='0',
                         remotes=None,
                         envWrap=args.envWrap,
                         designHead=args.designHead,
                         record=True,
                         noop=args.noop,
                         acRepeat=args.acRepeat)
    numaction = env.action_space.n

    with tf.device("/cpu:0"):
        # define policy network
        with tf.variable_scope("global"):
            policy = LSTMPolicy(env.observation_space.shape, numaction,
                                args.designHead)
            policy.global_step = tf.get_variable(
                "global_step", [],
                tf.int32,
                initializer=tf.constant_initializer(0, dtype=tf.int32),
                trainable=False)

        # Variable names that start with "local" are not saved in checkpoints.
        if use_tf12_api:
            variables_to_restore = [
                v for v in tf.global_variables()
                if not v.name.startswith("local")
            ]
            init_all_op = tf.global_variables_initializer()
        else:
            variables_to_restore = [
                v for v in tf.all_variables() if not v.name.startswith("local")
            ]
            init_all_op = tf.initialize_all_variables()
        saver = FastSaver(variables_to_restore)

        # print trainable variables
        var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                     tf.get_variable_scope().name)
        logger.info('Trainable vars:')
        for v in var_list:
            logger.info('  %s %s', v.name, v.get_shape())

        # summary of rewards
        action_writers = []
        if use_tf12_api:
            summary_writer = tf.summary.FileWriter(outdir)
            for ac_id in range(numaction):
                action_writers.append(
                    tf.summary.FileWriter(
                        os.path.join(outdir, 'action_{}'.format(ac_id))))
        else:
            summary_writer = tf.train.SummaryWriter(outdir)
            for ac_id in range(numaction):
                action_writers.append(
                    tf.train.SummaryWriter(
                        os.path.join(outdir, 'action_{}'.format(ac_id))))
        logger.info("Inference events directory: %s", outdir)

        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=False)
        with tf.Session(config=config) as sess:
            logger.info("Initializing all parameters.")
            sess.run(init_all_op)
            logger.info("Restoring trainable global parameters.")
            saver.restore(sess, ckpt)
            logger.info("Restored model was trained for %.2fM global steps",
                        sess.run(policy.global_step) / 1000000.)
            #saving with meta graph:
            metaSaver = tf.train.Saver(variables_to_restore)
            metaSaver.save(
                sess, '/home/swagking0/noreward-rl/models/models_me/mario_me')

            last_state = env.reset()
            if args.render or args.record:
                env.render()
            last_features = policy.get_initial_features()  # reset lstm memory
            length = 0
            rewards = 0
            mario_distances = np.zeros((args.num_episodes, ))
            for i in range(args.num_episodes):
                print("Starting episode %d" % (i + 1))
                if args.recordSignal:
                    from PIL import Image
                    signalCount = 1
                    utils.mkdir_p(outdir + '/recordedSignal/ep_%02d/' % i)
                    Image.fromarray(
                        (255 * last_state[..., -1]).astype('uint8')).save(
                            outdir + '/recordedSignal/ep_%02d/%06d.jpg' %
                            (i, signalCount))

                if args.random:
                    print('I am random policy!')
                else:
                    if args.greedy:
                        print('I am greedy policy!')
                    else:
                        print('I am sampled policy!')
                while True:
                    # run policy
                    fetched = policy.act_inference(last_state, *last_features)
                    prob_action, action, value_, features = \
                        fetched[0], fetched[1], fetched[2], fetched[3:]

                    # run environment: sampled one-hot 'action' (not greedy)
                    if args.random:
                        stepAct = np.random.randint(0,
                                                    numaction)  # random policy
                    else:
                        if args.greedy:
                            stepAct = prob_action.argmax()  # greedy policy
                        else:
                            stepAct = action.argmax()
                    # print(stepAct, prob_action.argmax(), prob_action)
                    state, reward, terminal, info = env.step(stepAct)

                    # update stats
                    length += 1
                    rewards += reward
                    last_state = state
                    last_features = features
                    if args.render or args.record:
                        env.render()
                    if args.recordSignal:
                        signalCount += 1
                        Image.fromarray(
                            (255 * last_state[..., -1]).astype('uint8')).save(
                                outdir + '/recordedSignal/ep_%02d/%06d.jpg' %
                                (i, signalCount))

                    # store summary
                    summary = tf.Summary()
                    summary.value.add(tag='ep_{}/reward'.format(i),
                                      simple_value=reward)
                    summary.value.add(tag='ep_{}/netreward'.format(i),
                                      simple_value=rewards)
                    summary.value.add(tag='ep_{}/value'.format(i),
                                      simple_value=float(value_[0]))
                    if 'NoFrameskip-v' in args.env_id:  # atari
                        summary.value.add(
                            tag='ep_{}/lives'.format(i),
                            simple_value=env.unwrapped.ale.lives())
                    summary_writer.add_summary(summary, length)
                    summary_writer.flush()
                    summary = tf.Summary()
                    for ac_id in range(numaction):
                        summary.value.add(tag='action_prob',
                                          simple_value=float(
                                              prob_action[ac_id]))
                        action_writers[ac_id].add_summary(summary, length)
                        action_writers[ac_id].flush()

                    timestep_limit = env.spec.tags.get(
                        'wrapper_config.TimeLimit.max_episode_steps')
                    if timestep_limit is None:
                        timestep_limit = env.spec.timestep_limit
                    if terminal or length >= timestep_limit:
                        if length >= timestep_limit or not env.metadata.get(
                                'semantics.autoreset'):
                            last_state = env.reset()
                        last_features = policy.get_initial_features()  # reset lstm memory
                        print(
                            "Episode finished. Sum of rewards: %.2f. Length: %d."
                            % (rewards, length))
                        if 'distance' in info:
                            print('Mario Distance Covered:', info['distance'])
                            mario_distances[i] = info['distance']
                        length = 0
                        rewards = 0
                        if args.render or args.record:
                            env.render()
                        if args.recordSignal:
                            signalCount += 1
                            Image.fromarray(
                                (255 *
                                 last_state[..., -1]).astype('uint8')).save(
                                     outdir +
                                     '/recordedSignal/ep_%02d/%06d.jpg' %
                                     (i, signalCount))
                        break

        logger.info('Finished %d true episodes.', args.num_episodes)
        if 'distance' in info:
            print('Mario Distances:', mario_distances)
            np.save(outdir + '/distances.npy', mario_distances)
        env.close()
Example #27
def train(shared_model, shared_optimizer, rank, args, info):
    env = create_env()  # make a local (unshared) environment

    torch.manual_seed(args.seed + rank)  # seed everything
    model = NNPolicy(channels=1,
                     memsize=args.hidden,
                     num_actions=args.num_actions).to(
                         device=args.device)  # a local/unshared model
    state = torch.tensor(prepro(env.reset())).to(
        device=args.device)  # get first state

    start_time = last_disp_time = time.time()
    episode_length, epr, eploss, done = 0, 0, 0, True  # bookkeeping

    steps_done = 1

    while info['frames'][0] <= 8e8 or args.test:  # openai baselines uses 40M frames; this loop allows up to 800M
        model.load_state_dict(
            shared_model.state_dict())  # sync with shared model

        # hx = torch.zeros(1, args.hidden) if done else hx.detach()  # rnn activation vector
        # hx = torch.randn(1, args.hidden) if done else hx.detach()
        hx = torch.randn(1, args.hidden) if done else hx.detach()
        values, logps, actions, rewards = [], [], [], []  # buffers used to compute the gradients

        for step in range(args.rnn_steps):
            # for step in range(0, np.random.randint(10, 40)):
            episode_length += 1
            value, logit, hx = model(
                (state.view(1, 1, 160, 160), hx.to(device=args.device)))
            logp = F.log_softmax(logit, dim=-1)

            action = torch.exp(logp).multinomial(
                num_samples=1).data[0]  #logp.max(1)[1].data if args.test else
            state, reward, done = env.step(
                action.cpu().item())  # action.cpu().numpy()[0]

            state = torch.tensor(prepro(state)).to(args.device)
            epr += reward
            # reward = np.clip(reward, -1, 1) # reward
            done = done or episode_length >= 1e4  # don't play one episode for too long

            info['frames'].add_(1)
            num_frames = int(info['frames'].item())
            if num_frames % 1e6 == 0:  # save every 1M frames
                torch.save(
                    shared_model.state_dict(), args.save_dir +
                    'model.{:.0f}.tar'.format(num_frames / 1e6))
                printlog(
                    args, '\n\t{:.0f}M frames: saved model\n'.format(
                        num_frames / 1e6))

            if done:  # update shared data
                info['episodes'] += 1
                interp = 1 if info['episodes'][0] == 1 else 1 - args.horizon
                info['run_epr'].mul_(1 - interp).add_(interp * epr)
                info['run_loss'].mul_(1 - interp).add_(interp * eploss)

            if rank == 0 and time.time() - last_disp_time > 60:  # print info ~ every minute
                elapsed = time.strftime("%Hh %Mm %Ss",
                                        time.gmtime(time.time() - start_time))
                printlog2(
                    args, info['run_epr'].item(), info['run_loss'].item(),
                    num_frames,
                    'time {}, episodes {:.0f}, frames {:.1f}M, mean epr {:.2f}, run loss {:.2f}'
                    .format(elapsed, info['episodes'].item(), num_frames / 1e6,
                            info['run_epr'].item(), info['run_loss'].item()))

                last_disp_time = time.time()

            if done:  # maybe print info.
                # reward = 500

                episode_length, epr, eploss = 0, 0, 0
                state = torch.tensor(prepro(env.reset())).to(args.device)

            values.append(value)
            logps.append(logp)
            actions.append(action)
            rewards.append(reward)

        # bootstrap the return with the critic's value of the last state (zero if the episode ended)
        next_value = torch.zeros(1, 1).to(device=args.device) if done else model(
            (state.view(1, 1, 160, 160), hx))[0]
        values.append(next_value.detach())

        # normalize the reward vector by its L2 norm; nan_to_num in the loss call guards against an all-zero episode
        re = np.asarray(rewards)  # + 1
        _n = LA.norm(re)
        # print(np.nan_to_num((re / _n)))

        # loss = cost_func(args, torch.cat(values).cpu(), torch.cat(logps).cpu(), torch.cat(actions).cpu(), torch.from_numpy(np.asarray(rewards)))
        # loss = cost_func(args, torch.cat(values).cpu(), torch.cat(logps).cpu(), torch.cat(actions).cpu(), torch.from_numpy(np.asarray(rewards)).float().to(args.device))
        loss = cost_func(args,
                         torch.cat(values).cpu(),
                         torch.cat(logps).cpu(),
                         torch.cat(actions).cpu(),
                         torch.from_numpy(np.nan_to_num((re / _n))).cpu())
        eploss += loss.item()
        shared_optimizer.zero_grad()
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(model.parameters(), 40)

        for param, shared_param in zip(model.parameters(),
                                       shared_model.parameters()):
            if shared_param.grad is None:
                shared_param._grad = param.grad  # sync gradients with shared model
        shared_optimizer.step()
Example #28
def run(args, server):
    env = create_env(args.env_id, client_id=str(args.task), remotes=args.remotes)
    if args.teacher:
        teacher = model.LSTMPolicy(env.observation_space.shape, env.action_space.n, name="global")
        teacher_init_op = teacher.load_model_from_checkpoint(args.checkpoint_path)

        trainer = A3C(env, args.task, args.visualise, teacher= teacher, name="student")

    else:
        teacher = None
        trainer = A3C(env, args.task, args.visualise, teacher= teacher)

    # Variable names that start with "local" are not saved in checkpoints.
    if use_tf12_api:
        variables_to_save = trainer.global_var_list
        all_trainable_variables = [v for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) if trainer.scope in v.name]
        init_op = tf.variables_initializer(variables_to_save)
        init_all_op = tf.variables_initializer(all_trainable_variables)

    else:

        variables_to_save = trainer.global_var_list
        init_op = tf.initialize_variables(variables_to_save)
        all_trainable_variables = [v for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) if trainer.scope in v.name]
        init_all_op = tf.variables_initializer(all_trainable_variables)

    saver = FastSaver(variables_to_save)

    logger.info('Trainable vars:')

    for v in all_trainable_variables:
        logger.info('{} {}'.format(v.name, v.get_shape()))

    def init_fn(ses):
        logger.info("Initializing all parameters.")
        ses.run([init_all_op])

    def get_init_fn():
        if args.teacher:
            return tf.contrib.framework.assign_from_checkpoint_fn(
                    args.checkpoint_path,
                    teacher.var_list,
                    ignore_missing_vars=True)
        else:
            return lambda sess: init_fn(sess)

    config = tf.ConfigProto(device_filters=["/job:ps", "/job:worker/task:{}/cpu:0".format(args.task)])
    logdir = os.path.join(args.log_dir, 'train')

    if use_tf12_api:
        summary_writer = tf.summary.FileWriter(logdir + "_{}".format(args.task))
    else:
        summary_writer = tf.train.SummaryWriter(logdir + "_{}".format(args.task))

    logger.info("Events directory: {}_{}".format(logdir, args.task))

    sv = tf.train.Supervisor(is_chief=(args.task == 0),
                             logdir=logdir,
                             saver=saver,
                             summary_op=None,
                             init_op=init_op,
                             init_fn=get_init_fn(),
                             summary_writer=summary_writer,
                             ready_op=tf.report_uninitialized_variables(variables_to_save),
                             global_step=trainer.global_step,
                             save_model_secs=30,
                             save_summaries_secs=30)

    num_global_steps = 100000000

    logger.info(
        "Starting session. If this hangs, we're most likely waiting to connect to the parameter server. " +
        "One common cause is that the parameter server DNS name isn't resolving yet, or is misspecified.")
    with sv.managed_session(server.target, config=config) as sess, sess.as_default():
        sess.run(trainer.sync)
        trainer.start(sess, summary_writer)
        global_step = sess.run(trainer.global_step)
        logger.info("Starting training at step={}".format(global_step))
        while not sv.should_stop() and (not num_global_steps or global_step < num_global_steps):
            trainer.process(sess)
            global_step = sess.run(trainer.global_step)

    # Ask for all the services to stop.
    sv.stop()
    logger.info('reached {} steps. worker stopped.'.format(global_step))
Example #29
def run(env_name, version, act_rep, max_steps, rollout_agent_name,
        behavior_agent_name, eps_greedy, sim_steps, search_horizont,
        gamma=1., exploration=1., prune_tree=False, report_freq=100,
        n_runs=1, save_dir=None, save_freq=10, process=0):

    def save_data():
        if save_dir is not None and len(frames) > 0:
            run_data = {
                'frames': frames,
                'actions': actions,
                'reward': total_reward,
                'action_visits': action_visits,
                'action_values': action_values,
                'rewards': rewards,
                'action_meanings': env.env.get_action_meanings(),
            }
            fname = os.path.join(save_dir, 'run_process_{}_run_{}_steps_{}.pkl'.format(process, n_run, step))
            with open(fname, 'wb') as f:
                cPickle.dump(run_data, f, -1)

            del actions[:]
            del frames[:]
            del action_visits[:]
            del action_values[:]
            del rewards[:]

    env = create_env(env_name, version, act_rep)
    uct.Node.n_actions = env.action_space.n

    # agent for rollouts
    if rollout_agent_name == 'random' or rollout_agent_name is None:
        rollout_agent = RandomAgent(env.action_space.n)
    else:
        rollout_agent = KerasAgent(rollout_agent_name)

    # agent for action selections
    if behavior_agent_name == 'random':
        behavior_agent = RandomAgent(env.action_space.n)
    elif behavior_agent_name == 'uct' or behavior_agent_name is None:
        behavior_agent = 'uct'
    else:
        behavior_agent = KerasAgent(behavior_agent_name)

    if save_dir is not None:
        actions = []
        frames = []
        action_visits = []
        action_values = []
        rewards = []

    for n_run in xrange(n_runs):
        terminal = False

        env.reset()
        _frame = env.env._get_image()

        node = uct.Node(env.clone_state())

        total_reward = 0
        step = 0
        t_start = t0 = time()
        while not terminal:
            # choose uct action
            a_uct = uct.uct_action(env, rollout_agent, node, sim_steps, search_horizont, gamma, exploration)

            # choose action in environment
            if np.random.rand() < eps_greedy:
                a = env.action_space.sample()
            elif behavior_agent == 'uct':
                a = a_uct
            else:
                a = behavior_agent.choose_action(_frame)

            if save_dir is not None:
                actions.append(a_uct)
                frames.append(_frame)
                action_visits.append(node.a_visits)
                action_values.append(node.a_values)

            # do step in environment
            env.restore_state(node.state)
            frame, reward, terminal, _ = env.step(a)
            _frame = env.env._get_image()

            if save_dir is not None:
                rewards.append(reward)

            # create new tree or try to use old tree
            if prune_tree:
                if frame in node.childs[a]:
                    node = node.childs[a][frame]
                    node.parent = None
                else:
                    node = uct.Node(env.clone_state())
            else:
                node = uct.Node(env.clone_state())

            total_reward += reward
            step += 1

            # report progress
            if step % report_freq == 0:
                print 'process: {} run: {}, steps: {}, time: {:.2f}, total reward: {:.2f}'.\
                    format(process, n_run+1, step, time() - t0, total_reward)
                t0 = time()

            # save intermediate result
            if step % save_freq == 0:
                save_data()

            if 0 < max_steps < step:
                break

        print '\nprocess: {}, run: {}, total steps: {}, total time: {:.2f}, total reward: {:.2f}'.\
            format(process, n_run+1, step, time() - t_start, total_reward)

        # save last chunk of data
        save_data()

    env.close()
Example #30
def inference(args):
    """
    It restores the policy weights and runs inference.
    """
    # virtual display (headless remotes)
    virtual_display = Display(visible=0, size=(1400, 900))
    virtual_display.start()

    # define environment
    env = create_env(args.env_id, client_id='0', remotes=None, envWrap=True,
                        acRepeat=1, record=args.record, outdir=args.outdir)
    num_actions = env.action_space.n

    with tf.device("/cpu:0"):
        config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        with tf.Session(config=config) as sess:
            logger.info("Restoring trainable global parameters.")
            saver = tf.train.import_meta_graph(args.ckpt+'.meta')
            saver.restore(sess, args.ckpt)

            probs = tf.get_collection("probs")[0]
            sample = tf.get_collection("sample")[0]
            vf = tf.get_collection("vf")[0]
            state_out_0 = tf.get_collection("state_out_0")[0]
            state_out_1 = tf.get_collection("state_out_1")[0]

            last_state = env.reset()
            if args.render or args.record:
                env.render()
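            # zeroed initial LSTM state: a [c, h] pair, each of shape (1, 256)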
            last_features = np.zeros((1, 256), np.float32); last_features = [last_features, last_features]
            length = 0
            rewards = 0
            mario_distances = np.zeros((args.num_episodes,))
            for i in range(args.num_episodes):
                print("Starting episode %d" % (i + 1))
                if args.random:
                    print('I am a random policy!')
                else:
                    if args.greedy:
                        print('I am a greedy policy!')
                    else:
                        print('I am a sampled policy!')
                while True:
                    # run policy
                    fetched = sess.run([probs, sample, vf, state_out_0, state_out_1] ,
                                {"global/x:0": [last_state], "global/c_in:0": last_features[0], "global/h_in:0": last_features[1]})
                    prob_action, action, value_, features = fetched[0], fetched[1], fetched[2], fetched[3:]

                    # run environment
                    if args.random:
                        stepAct = np.random.randint(0, num_actions)  # random policy
                    else:
                        if args.greedy:
                            stepAct = prob_action.argmax()  # greedy policy
                        else:
                            stepAct = action.argmax()
                    state, reward, terminal, info = env.step(stepAct)

                    # update stats
                    length += 1
                    rewards += reward
                    last_state = state
                    last_features = features
                    if args.render or args.record:
                        env.render(mode='rgb_array') # set to rgb_array by default (assumes running on a headless remote)

                    timestep_limit = env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps')
                    if timestep_limit is None: timestep_limit = env.spec.timestep_limit
                    if terminal or length >= timestep_limit:
                        if length >= timestep_limit or not env.metadata.get('semantics.autoreset'):
                            last_state = env.reset()
                        last_features = np.zeros((1, 256), np.float32); last_features = [last_features, last_features]
                        print("Episode finished. Sum of rewards: %.2f. Length: %d." % (rewards, length))
                        length = 0
                        rewards = 0
                        if args.render or args.record:
                            env.render(mode='rgb_array')
                        break

        logger.info('Finished %d true episodes.', args.num_episodes)
        env.close()
Example #31
parser.add_argument('--no-shared',
                    default=False,
                    help='use an optimizer without shared momentum.')
parser.add_argument('--memsize', type=int, default=256)
parser.add_argument('--device', type=str, default='cuda')

if __name__ == '__main__':
    # os.environ['OMP_NUM_THREADS'] = '1'
    # os.environ['CUDA_VISIBLE_DEVICES'] = "0"

    args = parser.parse_args()
    args.device = torch.device(args.device)

    args.env_name = 'FlappyBird-v0'

    env = create_env()

    shared_model = ActorCritic(1, args.memsize, len(env.action_space))
    shared_model.share_memory()

    if args.no_shared:
        optimizer = None
    else:
        optimizer = my_optim.SharedAdam(shared_model.parameters(), lr=args.lr)
        optimizer.share_memory()

    processes = []

    # counter = mp.Value('i', 0)
    # lock = mp.Lock()