def new_env(args):
    config = open(args.config) if args.config != "" else None
    env = create_env(args.env_id, str(args.task), args.remotes, config=config)
    return env
Example 2
def show(shared_model, global_steps, args):
    setproctitle('{}:show'.format(args.name))

    try:
        env = create_env(args.game_type, args.env_name, 'show', 1)
        model = copy.deepcopy(shared_model)
        gpu_id = args.gpu_ids[-2]
        with torch.cuda.device(gpu_id):
            model = model.cuda() if gpu_id >= 0 else model
        model.eval()

        while True:
            # Sync with the shared model
            with torch.cuda.device(gpu_id):
                model.load_state_dict(shared_model.state_dict())
            model.reset()
            play_game(env,
                      model,
                      args.max_episode_length,
                      render=True,
                      gpu_id=gpu_id)

            if global_steps.value >= args.max_global_steps:
                break
    except KeyboardInterrupt:
        raise
    finally:
        print('Player Finished !!!')
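For context, the `show` worker above is typically launched in its own process; a minimal sketch of that call, following the pattern used in Example 18 below (`shared_model`, `global_steps`, and `args` are assumed to be set up as in that example):

# Sketch: launching show() as a separate process (mirrors Example 18).
import torch.multiprocessing as mp

p = mp.Process(target=show, args=(shared_model, global_steps, args))
p.start()
p.join()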
Example 3
def new_env(args):
    config = args.config
    if isinstance(args.config, str):
        config = open(args.config) if args.config != "" else None
        config = bs4.BeautifulSoup(config, "lxml")

    env = create_env(args.env_id, str(args.task), args.remotes, config=config)
    return env
Example 4
def run(args):
    env = create_env(args.env_id,
                     client_id=str(args.task),
                     remotes=args.remotes)
    trainer = A3C(env, args.task, args.visualise, args.num_workers,
                  args.worker_id, args.verbose_lvl)

    # Variable names that start with "local" are not saved in checkpoints.
    if use_tf12_api:
        variables_to_save = [
            v for v in tf.global_variables() if not v.name.startswith("local")
        ]
        init_op = tf.variables_initializer(variables_to_save)
        init_all_op = tf.global_variables_initializer()
    else:
        variables_to_save = [
            v for v in tf.all_variables() if not v.name.startswith("local")
        ]
        init_op = tf.initialize_variables(variables_to_save)
        init_all_op = tf.initialize_all_variables()
    print(variables_to_save)
    saver = FastSaver(variables_to_save)

    var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                 tf.get_variable_scope().name)
    logger.info('Trainable vars:')
    for v in var_list:
        logger.info('  %s %s', v.name, v.get_shape())

    def init_fn(ses):
        logger.info("Initializing all parameters.")
        ses.run(init_all_op)

    logdir = os.path.join(args.log_dir, 'train')

    if use_tf12_api:
        summary_writer = tf.summary.FileWriter(logdir + "_%d" % args.task)
    else:
        summary_writer = tf.train.SummaryWriter(logdir + "_%d" % args.task)

    logger.info("Events directory: %s_%s", logdir, args.task)

    logger.info(
        "Starting session. If this hangs, we're most likely waiting to connect to the parameter server. " +
        "One common cause is that the parameter server DNS name isn't resolving yet, or is misspecified.")
    with tf.Session() as sess:
        init = tf.global_variables_initializer()
        sess.run(init)

        trainer.start_listen_thread()
        trainer.sync_initial_weights(sess, var_list)
        trainer.start(sess, summary_writer)
        while True:
            trainer.process(sess)
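The `use_tf12_api` flag tested above is not defined in this snippet; in universe-starter-agent style code it usually comes from a TensorFlow version check roughly like the sketch below (compare the explicit version checks in Example 22):

# Sketch (not part of the example above): typical derivation of use_tf12_api.
import distutils.version
import tensorflow as tf

use_tf12_api = (distutils.version.LooseVersion(tf.VERSION) >=
                distutils.version.LooseVersion('0.12.0'))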
Example 5
def run(args):
    env = create_env(args.env_id)
    trainer = A3C(env, None, args.visualise, args.intrinsic_type, args.bptt)

    # Variable names that start with "local" are not saved in checkpoints.
    variables_to_save = [v for v in tf.global_variables() if not v.name.startswith("local")]
    init_op = tf.variables_initializer(variables_to_save)
    init_all_op = tf.global_variables_initializer()
    saver = FastSaver(variables_to_save)

    var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)
    logger.info('Trainable vars:')
    for v in var_list:
        logger.info('  %s %s', v.name, v.get_shape())

    def init_fn(ses):
        logger.info("Initializing all parameters.")
        ses.run(init_all_op)

    logdir = os.path.join(args.log_dir, 'train')
    summary_writer = tf.summary.FileWriter(logdir)
    logger.info("Events directory: %s", logdir)

    sv = tf.train.Supervisor(is_chief=True,
                             logdir=logdir,
                             saver=saver,
                             summary_op=None,
                             init_op=init_op,
                             init_fn=init_fn,
                             summary_writer=summary_writer,
                             ready_op=tf.report_uninitialized_variables(variables_to_save),
                             global_step=None,
                             save_model_secs=0,
                             save_summaries_secs=0)

    video_dir = os.path.join(args.log_dir, 'test_videos_' + args.intrinsic_type)
    if not os.path.exists(video_dir):
        os.makedirs(video_dir)
    video_filename = video_dir + "/%s_%02d_%d.gif"
    print("Video saved at %s" % video_dir)

    with sv.managed_session() as sess, sess.as_default():
        trainer.start(sess, summary_writer)
        rewards = []
        lengths = []
        for i in range(10):
            frames, reward, length = trainer.evaluate(sess)
            rewards.append(reward)
            lengths.append(length)
            imageio.mimsave(video_filename % (args.env_id, i, reward), frames, fps=30)

        print('Evaluation: avg. reward %.2f    avg.length %.2f' %
              (sum(rewards) / 10.0, sum(lengths) / 10.0))

    # Ask for all the services to stop.
    sv.stop()
Example 6
def main():
    args = parse_args()
    env = envs.create_env(args.domain, args.task, args.verbose)
    agent = agents.create_agent(args.model, env, args.verbose)

    if args.train:
        agent.train(env, args.save_model, args.verbose, args.display,
                    args.save_training_curve)
    elif args.eval:
        agent.eval(env, args.verbose, args.display)
Example 7
def run(args, server):
    env = create_env(args.env_id, client_id=str(args.task), remotes=args.remotes)
    trainer = A3C(env, args.task, args.visualise)

    # Variable names that start with "local" are not saved in checkpoints.
    variables_to_save = [v for v in tf.global_variables() if not v.name.startswith("local")]
    init_op = tf.variables_initializer(variables_to_save)
    init_all_op = tf.global_variables_initializer()
    
    saver = FastSaver(variables_to_save)

    var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)
    logger.info('Trainable vars:')
    for v in var_list:
        logger.info('  %s %s', v.name, v.get_shape())

    def init_fn(ses):
        logger.info("Initializing all parameters.")
        ses.run(init_all_op)

    config = tf.ConfigProto(device_filters=["/job:ps", "/job:worker/task:{}/cpu:0".format(args.task)])
    logdir = os.path.join(args.log_dir, 'train')

    summary_writer = tf.summary.FileWriter(logdir + "_%d" % args.task)

    logger.info("Events directory: %s_%s", logdir, args.task)
    sv = tf.train.Supervisor(is_chief=(args.task == 0),
                             logdir=logdir,
                             saver=saver,
                             summary_op=None,
                             init_op=init_op,
                             init_fn=init_fn,
                             summary_writer=summary_writer,
                             ready_op=tf.report_uninitialized_variables(variables_to_save),
                             global_step=trainer.global_step,
                             save_model_secs=30,
                             save_summaries_secs=30)

    num_global_steps = 100000000

    logger.info(
        "Starting session. If this hangs, we're most likely waiting to connect to the parameter server. " +
        "One common cause is that the parameter server DNS name isn't resolving yet, or is misspecified.")
    with sv.managed_session(server.target, config=config) as sess, sess.as_default():
        sess.run(trainer.sync)
        trainer.start(sess, summary_writer)
        global_step = sess.run(trainer.global_step)
        logger.info("Starting training at step=%d", global_step)
        while not sv.should_stop() and (not num_global_steps or global_step < num_global_steps):
            trainer.process(sess)
            global_step = sess.run(trainer.global_step)

    # Ask for all the services to stop.
    sv.stop()
    logger.info('reached %s steps. worker stopped.', global_step)
Example 8
def new_env(args):
    config = open(args.config) if args.config != "" else None
    env = create_env(args.env_id, str(args.task), args.remotes, config=config)
    if args.env_id == 'maze':
        return env

    path = os.getcwd() + '/tmp/vpn_record_1'
    if os.path.exists(path):
        print("removing old directory " + path)
    env = wrappers.Monitor(env, path)
    return env
Example 9
def __init__(self, env_name, actor_id, logdir="results/", start=True):
    env = create_env(env_name)
    self.id = actor_id
    num_actions = env.action_space.n
    self.policy = LSTMPolicy(env.observation_space.shape, num_actions,
                             actor_id)
    self.runner = RunnerThread(env, self.policy, 20)
    self.env = env
    self.logdir = logdir
    if start:
        self.start()
Example 10
def run(args, server):
    env = create_env(args.env_id, client_id=str(args.task), remotes=args.remotes, num_trials=args.num_trials)

    trainer = A3C(env, args.task, args.visualise, args.meta, args.remotes, args.num_trials)

    # logs, checkpoints and TensorBoard

    # (Original Comment) Variable names that start with "local" are not saved in checkpoints.
    if use_tf12_api:
        variables_to_save = [v for v in tf.global_variables() if not v.name.startswith("local")]
        init_op = tf.variables_initializer(variables_to_save)
        init_all_op = tf.global_variables_initializer()
    else:
        variables_to_save = [v for v in tf.all_variables() if not v.name.startswith("local")]
        init_op = tf.initialize_variables(variables_to_save)
        init_all_op = tf.initialize_all_variables()
    saver = FastSaver(variables_to_save)

    var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)
    logger.info('Trainable vars:')
    for v in var_list:
        logger.info('  %s %s', v.name, v.get_shape())

    def init_fn(ses):
        logger.info("Initializing all parameters.")
        ses.run(init_all_op)

    config = tf.ConfigProto(device_filters=["/job:ps", "/job:worker/task:{}/cpu:0".format(args.task)])
    logdir = os.path.join(args.log_dir, 'train')

    if use_tf12_api:
        summary_writer = tf.summary.FileWriter(logdir + "_%d" % args.task)
    else:
        summary_writer = tf.train.SummaryWriter(logdir + "_%d" % args.task)

    logger.info("Events directory: %s_%s", logdir, args.task)
    # The tf.train.Supervisor provides a set of services that helps implement a robust training process. *(4)
    sv = tf.train.Supervisor(is_chief=(args.task == 0),
                             logdir=logdir,
                             saver=saver,
                             summary_op=None,
                             init_op=init_op,
                             init_fn=init_fn,
                             summary_writer=summary_writer,
                             ready_op=tf.report_uninitialized_variables(variables_to_save),
                             global_step=trainer.global_step,
                             save_model_secs=30,
                             save_summaries_secs=30)

    if args.test: # testing phase
        run_test(trainer, sv, config, summary_writer, server)
    else: # training phase
        run_train(trainer, sv, config, summary_writer, server)
Example 11
def main():
    env = envs.create_env(None)
    act = deepq.load("{}_model.pkl".format(envs.VSTR))

    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            env.render()
            obs, rew, done, _ = env.step(act([obs])[0])
            episode_rew += rew
        print("Episode reward", episode_rew)
Example 12
def play(args, server):
    env = create_env(args.env_id,
                     client_id=str(args.task),
                     remotes=args.remotes)
    trainer = A3C(env, args.task, args.visualise)
    result = []
    """
    Implement your code here.
    Conditions:
        This function is used for testing.
        The number of episodes is 20.
        Return the mean reward over the 20 episodes.
    """

    return np.mean(result)
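One way the placeholder above could be implemented, as a hedged sketch only: a helper that plays 20 episodes with an LSTMPolicy-style object. The act()/get_initial_features() interface is assumed from Example 13, and inside play() one would call this with the trainer's policy and append the per-episode rewards to result; none of these names are guaranteed by the exercise.

import numpy as np
import tensorflow as tf

def evaluate_mean_reward(env, policy, n_episodes=20):
    # Sketch only: assumes an LSTMPolicy-style object with get_initial_features()
    # and act(state, *features), used the same way as in Example 13.
    rewards = []
    with tf.Session() as sess, sess.as_default():
        sess.run(tf.global_variables_initializer())  # in practice, restore trained weights instead
        for _ in range(n_episodes):
            last_state = env.reset()
            last_features = policy.get_initial_features()
            episode_reward, terminal = 0.0, False
            while not terminal:
                fetched = policy.act(last_state, *last_features)
                action, last_features = fetched[0], fetched[2:]
                last_state, reward, terminal, _ = env.step(action.argmax())
                episode_reward += reward
            rewards.append(episode_reward)
    return float(np.mean(rewards))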
Example 13
def main(env, snapshot, visualise):
    env = create_env(env, client_id=0, remotes=1)
    with tf.variable_scope("global"):
        policy = LSTMPolicy(env.observation_space.shape, env.action_space.n)

    last_state = env.reset()
    # state = last_state
    last_features = policy.get_initial_features()
    length = 0
    rewards = 0
    variables_to_save = [
        v for v in tf.global_variables() if not v.name.startswith("local")
    ]
    saver = tf.train.Saver(variables_to_save)
    with tf.Session() as sess:
        # Restore variables from disk.
        # saver.restore(sess, "train/model.ckpt-361814.data-00000-of-00001")
        # saver.restore(sess, "train/model.ckpt-361814")
        # saver.restore(sess, "/tmp/neonrace/train/model.ckpt-361714")
        saver.restore(sess, snapshot)
        while True:
            terminal_end = False

            fetched = policy.act(last_state, *last_features)
            action, value_, features = fetched[0], fetched[1], fetched[2:]
            # state, reward, terminal, info = env.step(action.argmax())
            action_n = action.argmax()

            # state, reward, terminal, info = env.step(default_action)
            state, reward, terminal, info = env.step(action_n)
            if visualise:
                env.render()
            # env.render() # I need to visualize it during testing
            print('length: %d, rewards: %f' % (length, rewards))

            length += 1
            rewards += reward

            last_state = state
            last_features = features

            if terminal:
                terminal_end = True
                print("Episode finished. Sum of rewards: %d. Length: %d" %
                      (rewards, length))
                length = 0
                rewards = 0
                break
Example 14
def main():
    env = envs.create_env(None)
    model = models.mlp([64])
    act = simple.learn(
        env,
        q_func=model,
        lr=1e-3,
        max_timesteps=100000,
        buffer_size=50000,
        exploration_fraction=0.01,
        exploration_final_eps=0.0,
        print_freq=10,
        callback=callback,
        prioritized_replay=True
    )
    print("Saving model to {}_model.pkl".format(envs.VSTR))
    act.save("{}_model.pkl".format(envs.VSTR))
Example 15
def train(num_workers, env_name="PongDeterministic-v3"):
    env = create_env(env_name, None, None)
    policy = LSTMPolicy(env.observation_space.shape, env.action_space.n, 0)
    agents = [Runner(env_name, i) for i in range(num_workers)]
    parameters = policy.get_weights()
    gradient_list = [agent.compute_gradient(parameters) for agent in agents]
    steps = 0
    obs = 0
    while True:
        done_id, gradient_list = ray.wait(gradient_list)
        gradient, info = ray.get(done_id)[0]
        policy.model_update(gradient)
        parameters = policy.get_weights()
        steps += 1
        obs += info["size"]
        gradient_list.extend([agents[info["id"]].compute_gradient(parameters)])
    return policy
Example 16
def train(num_workers, env_name="PongDeterministic-v3"):
    env = create_env(env_name)
    ps = ParameterServer(env)
    parameters = ps.get_weights()
    agents = [Runner.remote(env_name, i) for i in range(num_workers)]
    delta_list = [agent.get_delta.remote(parameters) for agent in agents]
    steps = 0
    obs = 0
    timing = []
    for i in range(2000):
        done_id, delta_list = ray.wait(delta_list)
        delta, info = ray.get(done_id)[0]
        ps.add_delta(delta)
        parameters = ps.weights
        obs += info["size"]
        delta_list.extend(
            [agents[info["id"]].get_delta.remote(parameters)])
    return parameters
Example 17
def evaluate(constants):
    env, action_space = create_env(constants.env)
    agent = A3CAgent(action_space)
    worker = Worker(env.proxy, agent, GLOBAL_SCOPE, constants)
    worker.build_rollout()

    sess = tf.Session()

    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())

    # TODO: Model loading for A2C.
    if FLAGS.load_model:
        vars_to_save_load = tf.trainable_variables(GLOBAL_SCOPE)
        saver = tf.train.Saver(vars_to_save_load)
        worker.load_model(sess, saver, constants.model_directory)
    worker.evaluate(sess)
    env.close()
    sess.close()
Example 18
def async_train(args, make_model, train):
    setproctitle('{}:main'.format(args.name))

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    env = create_env(args.game_type, args.env_name, 'main', 1)
    shared_model = make_model(env.observation_space.shape[0],
                              env.action_space.n)
    shared_model.share_memory()

    if args.no_shared_optimizer:
        optimizer = None
    else:
        optimizer = SharedAdam(shared_model.parameters(), lr=args.lr)
        optimizer.share_memory()

    mp.set_start_method('spawn')
    global_steps = mp.Value('L', 0)

    processes = []
    processes.append(
        mp.Process(target=test, args=(shared_model, global_steps, args)))
    if not args.no_render:
        processes.append(
            mp.Process(target=show, args=(shared_model, global_steps, args)))

    for rank in range(args.n_processes):
        processes.append(
            mp.Process(target=train,
                       args=(shared_model, optimizer, rank, global_steps,
                             args)))

    for p in processes:
        p.start()
        time.sleep(0.1)

    for p in processes:
        p.join()

    print('Main process finished !!!')
Example 19
def test_penalty_env(env):
    import envs
    env = envs.create_env("Pong",
                          location="bottom",
                          catastrophe_type="1",
                          classifier_file=save_classifier_path +
                          '/0/final.ckpt')

    import matplotlib.pyplot as plt

    observation = env.reset()

    for _ in range(20):
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        plt.imshow(observation[:, :, 0])
        plt.show()
        print('Cat: ', info['frame/is_catastrophe'])
        print('reward: ', reward)
        if done:
            break
Example 20
def main():
    args = TrainOptions().parse()
    device = torch.device('cuda') if (not args.no_cuda and torch.cuda.is_available()) else torch.device('cpu')
    if not os.path.exists(args.output):
        os.makedirs(args.output)

    env = create_env(args)

    network = create_network(args, env.action_space.n, env.observation_space.shape)
    network.to(device)
    optimizer = Adam(network.parameters(), lr=args.lr)

    policy = AnnealedEpsilonGreedyPolicy(epsilon_max=args.epsilon_max,
                                         epsilon_min=args.epsilon_min, exploration_steps=args.exp_steps)
    memory = SimpleExperienceReplay(max_size=args.mem_max, batch_size=args.batch_size)
    logger = Logger()

    agent = create_agent(args, env, network, policy, memory, optimizer, logger)

    # train agent
    agent.learn(n_episodes=args.n_ep, ep_max_step=args.ep_max_step, replay_start_size=args.replay_start,
                save_every=args.freq_save_model, update_target_every=args.freq_target_update, render_every=args.freq_render)
Example 21
def main(args):
    env_id = args.env_id
    max_episodes = args.max_episodes
    ckpt_dir = args.ckpt_dir
    output_dir = args.output_dir

    # env
    env = create_env(env_id, 0, 1)
    if len(output_dir) > 0:
        env = wrappers.Monitor(env, output_dir)
    if args.render:
        env.render()

    # work-around to the nasty env.render() failing issue when working with tensorflow
    # see https://github.com/openai/gym/issues/418
    import tensorflow as tf
    from model import Convx2LSTMActorCritic

    # model
    sess = tf.Session()
    with tf.variable_scope("global"):
        network = Convx2LSTMActorCritic(env.observation_space.shape, env.action_space.n)
    init = tf.global_variables_initializer()
    sess.run(init)

    # load model parameters
    checkpoint = tf.train.get_checkpoint_state(ckpt_dir)
    if checkpoint and checkpoint.model_checkpoint_path:
        saver = tf.train.Saver()
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        raise Exception('cannot find checkpoint path')

    # run evaluating
    with sess.as_default():
        evaluate_loop(env, network, max_episodes, args)
Example 22
def evaluate_main(env_id,
                  model_id,
                  max_episodes,
                  ckpt_dir,
                  output_dir,
                  sleep_time,
                  render,
                  verbose,
                  with_global_step=False):
    # env
    env = create_env(env_id, 0, 1)
    if len(output_dir) > 0:  # output recording
        env = wrappers.Monitor(env, output_dir)
    if render:
        env.render()

    is_obs_tuple = isinstance(env.observation_space, Tuple)
    observation_shape = [sp.shape for sp in env.observation_space.spaces] if is_obs_tuple \
        else [env.observation_space.shape]
    action_shape = [env.action_space.n] if isinstance(env.action_space, spaces.Discrete) \
        else [env.action_space.shape[0]]

    # work-around to the nasty env.render() failing issue when working with tensorflow
    # see https://github.com/openai/gym/issues/418
    import tensorflow as tf
    from model import create_model
    use_tf_0_12_api = distutils.version.LooseVersion(tf.VERSION) >= distutils.version.LooseVersion('0.12.0') and \
                    distutils.version.LooseVersion(tf.VERSION) <= distutils.version.LooseVersion('0.12.1')
    use_tf_1_1_api = distutils.version.LooseVersion(
        tf.VERSION) == distutils.version.LooseVersion('1.1.0')

    # model
    tf.reset_default_graph()
    sess = tf.Session()
    with tf.variable_scope("global"):
        network = create_model(model_id, *observation_shape + action_shape)
        if (with_global_step):
            global_step = tf.get_variable("global_step", [],
                                          tf.int32,
                                          initializer=tf.constant_initializer(
                                              0, dtype=tf.int32),
                                          trainable=False)

    init = tf.global_variables_initializer()
    sess.run(init)

    # load model parameters
    checkpoint = tf.train.get_checkpoint_state(ckpt_dir)
    if checkpoint and checkpoint.model_checkpoint_path:

        restore_tf_0_12_model = False
        restore_tf_1_1_model = False

        reader = tf.train.NewCheckpointReader(checkpoint.model_checkpoint_path)
        for var_name in reader.get_variable_to_shape_map():
            if 'RNN/BasicLSTMCell/Linear' in var_name:
                restore_tf_0_12_model = True
                break
            elif 'rnn/basic_lstm_cell/' in var_name:
                restore_tf_1_1_model = True
                break

        if use_tf_1_1_api and restore_tf_0_12_model:
            var_dict = {}
            for var in tf.global_variables():
                name = var.name.split(':')[0]
                if 'rnn/basic_lstm_cell/weights' in name:
                    name = name.replace('rnn/basic_lstm_cell/weights',
                                        'RNN/BasicLSTMCell/Linear/Matrix')
                elif 'rnn/basic_lstm_cell/biases' in name:
                    name = name.replace('rnn/basic_lstm_cell/biases',
                                        'RNN/BasicLSTMCell/Linear/Bias')
                var_dict[name] = var
            saver = tf.train.Saver(var_dict)
        elif use_tf_0_12_api and restore_tf_1_1_model:
            var_dict = {}
            for var in tf.global_variables():
                name = var.name.split(':')[0]
                if 'RNN/BasicLSTMCell/Linear/Matrix' in name:
                    name = name.replace('RNN/BasicLSTMCell/Linear/Matrix',
                                        'rnn/basic_lstm_cell/weights')
                elif 'RNN/BasicLSTMCell/Linear/Bias' in name:
                    name = name.replace('RNN/BasicLSTMCell/Linear/Bias',
                                        'rnn/basic_lstm_cell/biases')
                var_dict[name] = var
            saver = tf.train.Saver(var_dict)
        else:
            saver = tf.train.Saver()

        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        raise Exception('cannot find checkpoint path')

    # run evaluating
    with sess.as_default():
        ret = evaluate_loop(env, network, max_episodes, sleep_time, render,
                            verbose)
        env.close()
        if (with_global_step):
            global_step_result = sess.run(global_step)
    sess.close()

    if (with_global_step):
        return ret, global_step_result
    else:
        return ret
Example 23
def generate_data(rank, args, start, end):

    from envs import create_env, set_seed, get_obs
    from model import R_Module
    import torch

    print(rank, "started")

    env = create_env(args.env_name, framework=args.framework, args=args)
    env = set_seed(args.seed + rank, env, args.framework)
    state = get_obs(env, args.framework)

    if args.from_policy is not None:
        model_state, r_args = torch.load(args.from_policy)
        policy = R_Module(env.action_space.shape[0],
                          r_args.dim,
                          discrete=r_args.discrete,
                          baseline=r_args.baseline,
                          state_space=env.observation_space.shape[0])
        policy.load_state_dict(model_state)
        policy.eval()

    states = []
    actions = []
    i = start

    done = False

    while i < end:
        if i % 100 == 0:
            print(rank, i)
        ep_states = []
        ep_actions = []
        if args.from_policy is not None:
            cx_p = Variable(torch.zeros(1, r_args.dim))
            hx_p = Variable(torch.zeros(1, r_args.dim))
        for j in range(args.rollout):
            if args.from_policy is not None:
                value, logit, (hx_p, cx_p) = policy(state.unsqueeze(0),
                                                    (hx_p, cx_p))
                a, _, _ = get_action(logit, r_args.discrete)
            else:
                a = env.action_space.sample()
            ep_actions.append(a)

            state = get_obs(env, args.framework)
            env.step(a)

            if args.render:
                env.render()

            ep_states.append(state)

        final_state = get_obs(env, args.framework)
        ep_states.append(final_state)
        states.append(ep_states)
        actions.append(ep_actions)
        i += 1

        # reset the environment here
        if done or args.reset:
            env.reset()
            done = False

    torch.save((states, actions),
               os.path.join(args.out_dir,
                            'states_actions_%s_%s.pt' % (start, end)))
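For reference, the tuple saved above can be read back with torch.load; a minimal sketch using the same path pattern:

# Sketch: reloading the rollouts written by generate_data above.
states, actions = torch.load(
    os.path.join(args.out_dir, 'states_actions_%s_%s.pt' % (start, end)))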
Example 24
    import torch.multiprocessing as mp
    mp.set_start_method('spawn')

    from torch.autograd import Variable
    from envs import create_env, set_seed, get_obs
    from model import R_Module
    os.environ['OMP_NUM_THREADS'] = '1'

    args = parser.parse_args()
    env_name = args.env_name
    env_name += '_rollout%s' % args.rollout
    if args.env_name.endswith('MazeEnv'):
        env_name += 'mazeid%slength%s' % (args.maze_id, args.maze_length)
        if args.single_env and args.maze_id == -1:
            env = create_env(args.env_name,
                             framework=args.framework,
                             args=args)
            env_name += '_single_env'
            args.maze_structure = env._env.MAZE_STRUCTURE
        if args.random_start:
            env_name += '_randomstart'
    if args.file_path is not None:
        env_name += '_transfer'
    if args.framework == 'mazebase':
        env_name += '_rollout_%s_length_%s' % (args.rollout, args.maze_length)
    args.out_dir = os.path.join(args.out, env_name)
    print(args)
    print(args.out_dir)
    os.makedirs(args.out_dir, exist_ok=True)
    processes = []
    block = int(args.N / args.num_processes)
Example 25
def run(args, server):
    env = create_env(args.env_id, client_id=str(args.task), remotes=args.remotes, envWrap=args.envWrap, designHead=args.designHead,
                        noLifeReward=args.noLifeReward)
    trainer = A3C(env, args.task, args.visualise, args.unsup, args.envWrap, args.designHead, args.noReward)

    # logging
    if args.task == 0:
        with open(args.log_dir + '/log.txt', 'w') as fid:
            for key, val in constants.items():
                fid.write('%s: %s\n'%(str(key), str(val)))
            fid.write('designHead: %s\n'%args.designHead)
            fid.write('input observation: %s\n'%str(env.observation_space.shape))
            fid.write('env name: %s\n'%str(env.spec.id))
            fid.write('unsup method type: %s\n'%str(args.unsup))

    # Variable names that start with "local" are not saved in checkpoints.
    if use_tf12_api:
        variables_to_save = [v for v in tf.global_variables() if not v.name.startswith("local")]
        init_op = tf.variables_initializer(variables_to_save)
        init_all_op = tf.global_variables_initializer()
    else:
        variables_to_save = [v for v in tf.all_variables() if not v.name.startswith("local")]
        init_op = tf.initialize_variables(variables_to_save)
        init_all_op = tf.initialize_all_variables()
    saver = FastSaver(variables_to_save)
    if args.pretrain is not None:
        variables_to_restore = [v for v in tf.trainable_variables() if not v.name.startswith("local")]
        pretrain_saver = FastSaver(variables_to_restore)

    var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)
    logger.info('Trainable vars:')
    for v in var_list:
        logger.info('  %s %s', v.name, v.get_shape())

    def init_fn(ses):
        logger.info("Initializing all parameters.")
        ses.run(init_all_op)
        if args.pretrain is not None:
            pretrain = tf.train.latest_checkpoint(args.pretrain)
            logger.info("==> Restoring from given pretrained checkpoint.")
            logger.info("    Pretraining address: %s", pretrain)
            pretrain_saver.restore(ses, pretrain)
            logger.info("==> Done restoring model! Restored %d variables.", len(variables_to_restore))

    config = tf.ConfigProto(device_filters=["/job:ps", "/job:worker/task:{}/cpu:0".format(args.task)])
    logdir = os.path.join(args.log_dir, 'train')

    if use_tf12_api:
        summary_writer = tf.summary.FileWriter(logdir + "_%d" % args.task)
    else:
        summary_writer = tf.train.SummaryWriter(logdir + "_%d" % args.task)

    logger.info("Events directory: %s_%s", logdir, args.task)
    sv = tf.train.Supervisor(is_chief=(args.task == 0),
                             logdir=logdir,
                             saver=saver,
                             summary_op=None,
                             init_op=init_op,
                             init_fn=init_fn,
                             summary_writer=summary_writer,
                             ready_op=tf.report_uninitialized_variables(variables_to_save),
                             global_step=trainer.global_step,
                             save_model_secs=30,
                             save_summaries_secs=30)

    num_global_steps = constants['MAX_GLOBAL_STEPS']

    logger.info(
        "Starting session. If this hangs, we're most likely waiting to connect to the parameter server. " +
        "One common cause is that the parameter server DNS name isn't resolving yet, or is misspecified.")
    with sv.managed_session(server.target, config=config) as sess, sess.as_default():
        # Workaround for FailedPreconditionError
        # see: https://github.com/openai/universe-starter-agent/issues/44 and 31
        sess.run(trainer.sync)

        trainer.start(sess, summary_writer)
        global_step = sess.run(trainer.global_step)
        logger.info("Starting training at global_step=%d", global_step)
        while not sv.should_stop() and (not num_global_steps or global_step < num_global_steps):
            trainer.process(sess)
            global_step = sess.run(trainer.global_step)

    # Ask for all the services to stop.
    sv.stop()
    logger.info('reached %s steps. worker stopped.', global_step)
Example 26
def inference(args):
    """
    It only restores the LSTMPolicy architecture and runs inference with it.
    """
    # get address of checkpoints
    indir = os.path.join(args.log_dir, 'train')
    outdir = os.path.join(
        args.log_dir, 'inference') if args.out_dir is None else args.out_dir
    with open(indir + '/checkpoint', 'r') as f:
        first_line = f.readline().strip()
    ckpt = first_line.split(' ')[-1].split('/')[-1][:-1]
    ckpt = ckpt.split('-')[-1]
    ckpt = indir + '/model.ckpt-' + ckpt

    # define environment
    if args.record:
        env = create_env(args.env_id,
                         client_id='0',
                         remotes=None,
                         envWrap=args.envWrap,
                         designHead=args.designHead,
                         record=True,
                         noop=args.noop,
                         acRepeat=args.acRepeat,
                         outdir=outdir)
    else:
        env = create_env(args.env_id,
                         client_id='0',
                         remotes=None,
                         envWrap=args.envWrap,
                         designHead=args.designHead,
                         record=True,
                         noop=args.noop,
                         acRepeat=args.acRepeat)
    numaction = env.action_space.n

    with tf.device("/cpu:0"):
        # define policy network
        with tf.variable_scope("global"):
            policy = LSTMPolicy(env.observation_space.shape, numaction,
                                args.designHead)
            policy.global_step = tf.get_variable(
                "global_step", [],
                tf.int32,
                initializer=tf.constant_initializer(0, dtype=tf.int32),
                trainable=False)

        # Variable names that start with "local" are not saved in checkpoints.
        if use_tf12_api:
            variables_to_restore = [
                v for v in tf.global_variables()
                if not v.name.startswith("local")
            ]
            init_all_op = tf.global_variables_initializer()
        else:
            variables_to_restore = [
                v for v in tf.all_variables() if not v.name.startswith("local")
            ]
            init_all_op = tf.initialize_all_variables()
        saver = FastSaver(variables_to_restore)

        # print trainable variables
        var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                     tf.get_variable_scope().name)
        logger.info('Trainable vars:')
        for v in var_list:
            logger.info('  %s %s', v.name, v.get_shape())

        # summary of rewards
        action_writers = []
        if use_tf12_api:
            summary_writer = tf.summary.FileWriter(outdir)
            for ac_id in range(numaction):
                action_writers.append(
                    tf.summary.FileWriter(
                        os.path.join(outdir, 'action_{}'.format(ac_id))))
        else:
            summary_writer = tf.train.SummaryWriter(outdir)
            for ac_id in range(numaction):
                action_writers.append(
                    tf.train.SummaryWriter(
                        os.path.join(outdir, 'action_{}'.format(ac_id))))
        logger.info("Inference events directory: %s", outdir)

        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=False)
        with tf.Session(config=config) as sess:
            logger.info("Initializing all parameters.")
            sess.run(init_all_op)
            logger.info("Restoring trainable global parameters.")
            saver.restore(sess, ckpt)
            logger.info("Restored model was trained for %.2fM global steps",
                        sess.run(policy.global_step) / 1000000.)
            #saving with meta graph:
            metaSaver = tf.train.Saver(variables_to_restore)
            metaSaver.save(
                sess, '/home/swagking0/noreward-rl/models/models_me/mario_me')

            last_state = env.reset()
            if args.render or args.record:
                env.render()
            last_features = policy.get_initial_features()  # reset lstm memory
            length = 0
            rewards = 0
            mario_distances = np.zeros((args.num_episodes, ))
            for i in range(args.num_episodes):
                print("Starting episode %d" % (i + 1))
                if args.recordSignal:
                    from PIL import Image
                    signalCount = 1
                    utils.mkdir_p(outdir + '/recordedSignal/ep_%02d/' % i)
                    Image.fromarray(
                        (255 * last_state[..., -1]).astype('uint8')).save(
                            outdir + '/recordedSignal/ep_%02d/%06d.jpg' %
                            (i, signalCount))

                if args.random:
                    print('I am random policy!')
                else:
                    if args.greedy:
                        print('I am greedy policy!')
                    else:
                        print('I am sampled policy!')
                while True:
                    # run policy
                    fetched = policy.act_inference(last_state, *last_features)
                    prob_action, action, value_, features = fetched[
                        0], fetched[1], fetched[2], fetched[3:]

                    # run environment: sampled one-hot 'action' (not greedy)
                    if args.random:
                        stepAct = np.random.randint(0,
                                                    numaction)  # random policy
                    else:
                        if args.greedy:
                            stepAct = prob_action.argmax()  # greedy policy
                        else:
                            stepAct = action.argmax()
                    # print(stepAct, prob_action.argmax(), prob_action)
                    state, reward, terminal, info = env.step(stepAct)

                    # update stats
                    length += 1
                    rewards += reward
                    last_state = state
                    last_features = features
                    if args.render or args.record:
                        env.render()
                    if args.recordSignal:
                        signalCount += 1
                        Image.fromarray(
                            (255 * last_state[..., -1]).astype('uint8')).save(
                                outdir + '/recordedSignal/ep_%02d/%06d.jpg' %
                                (i, signalCount))

                    # store summary
                    summary = tf.Summary()
                    summary.value.add(tag='ep_{}/reward'.format(i),
                                      simple_value=reward)
                    summary.value.add(tag='ep_{}/netreward'.format(i),
                                      simple_value=rewards)
                    summary.value.add(tag='ep_{}/value'.format(i),
                                      simple_value=float(value_[0]))
                    if 'NoFrameskip-v' in args.env_id:  # atari
                        summary.value.add(
                            tag='ep_{}/lives'.format(i),
                            simple_value=env.unwrapped.ale.lives())
                    summary_writer.add_summary(summary, length)
                    summary_writer.flush()
                    summary = tf.Summary()
                    for ac_id in range(numaction):
                        summary.value.add(tag='action_prob',
                                          simple_value=float(
                                              prob_action[ac_id]))
                        action_writers[ac_id].add_summary(summary, length)
                        action_writers[ac_id].flush()

                    timestep_limit = env.spec.tags.get(
                        'wrapper_config.TimeLimit.max_episode_steps')
                    if timestep_limit is None:
                        timestep_limit = env.spec.timestep_limit
                    if terminal or length >= timestep_limit:
                        if length >= timestep_limit or not env.metadata.get(
                                'semantics.autoreset'):
                            last_state = env.reset()
                        last_features = policy.get_initial_features(
                        )  # reset lstm memory
                        print(
                            "Episode finished. Sum of rewards: %.2f. Length: %d."
                            % (rewards, length))
                        if 'distance' in info:
                            print('Mario Distance Covered:', info['distance'])
                            mario_distances[i] = info['distance']
                        length = 0
                        rewards = 0
                        if args.render or args.record:
                            env.render()
                        if args.recordSignal:
                            signalCount += 1
                            Image.fromarray(
                                (255 *
                                 last_state[..., -1]).astype('uint8')).save(
                                     outdir +
                                     '/recordedSignal/ep_%02d/%06d.jpg' %
                                     (i, signalCount))
                        break

        logger.info('Finished %d true episodes.', args.num_episodes)
        if 'distance' in info:
            print('Mario Distances:', mario_distances)
            np.save(outdir + '/distances.npy', mario_distances)
        env.close()
Example 27
def train(shared_model, shared_optimizer, rank, args, info):
    env = create_env()  # make a local (unshared) environment

    torch.manual_seed(args.seed + rank)  # seed everything
    model = NNPolicy(channels=1,
                     memsize=args.hidden,
                     num_actions=args.num_actions).to(
                         device=args.device)  # a local/unshared model
    state = torch.tensor(prepro(env.reset())).to(
        device=args.device)  # get first state

    start_time = last_disp_time = time.time()
    episode_length, epr, eploss, done = 0, 0, 0, True  # bookkeeping

    steps_done = 1

    while info['frames'][0] <= 8e8 or args.test:  # train until 8e8 frames (or run indefinitely in test mode)
        model.load_state_dict(
            shared_model.state_dict())  # sync with shared model

        # hx = torch.zeros(1, args.hidden) if done else hx.detach()  # rnn activation vector
        # hx = torch.randn(1, args.hidden) if done else hx.detach()
        hx = torch.randn(1, args.hidden) if done else hx.detach()
        values, logps, actions, rewards = [], [], [], []  # buffers for computing gradients

        for step in range(args.rnn_steps):
            # for step in range(0, np.random.randint(10, 40)):
            episode_length += 1
            value, logit, hx = model(
                (state.view(1, 1, 160, 160), hx.to(device=args.device)))
            logp = F.log_softmax(logit, dim=-1)

            action = torch.exp(logp).multinomial(
                num_samples=1).data[0]  #logp.max(1)[1].data if args.test else
            state, reward, done = env.step(
                action.cpu().item())  # action.cpu().numpy()[0]

            state = torch.tensor(prepro(state)).to(args.device)
            epr += reward
            # reward = np.clip(reward, -1, 1) # reward
            done = done or episode_length >= 1e4  # don't play one episode for too long

            info['frames'].add_(1)
            num_frames = int(info['frames'].item())
            if num_frames % 1e6 == 0:  # save every 1M frames
                torch.save(
                    shared_model.state_dict(), args.save_dir +
                    'model.{:.0f}.tar'.format(num_frames / 1e6))
                printlog(
                    args, '\n\t{:.0f}M frames: saved model\n'.format(
                        num_frames / 1e6))

            if done:  # update shared data
                info['episodes'] += 1
                interp = 1 if info['episodes'][0] == 1 else 1 - args.horizon
                info['run_epr'].mul_(1 - interp).add_(interp * epr)
                info['run_loss'].mul_(1 - interp).add_(interp * eploss)

            if rank == 0 and time.time(
            ) - last_disp_time > 60:  # print info ~ every minute
                elapsed = time.strftime("%Hh %Mm %Ss",
                                        time.gmtime(time.time() - start_time))
                printlog2(
                    args, info['run_epr'].item(), info['run_loss'].item(),
                    num_frames,
                    'time {}, episodes {:.0f}, frames {:.1f}M, mean epr {:.2f}, run loss {:.2f}'
                    .format(elapsed, info['episodes'].item(), num_frames / 1e6,
                            info['run_epr'].item(), info['run_loss'].item()))

                last_disp_time = time.time()

            if done:  # episode ended: reset bookkeeping and environment
                # reward = 500

                episode_length, epr, eploss = 0, 0, 0
                state = torch.tensor(prepro(env.reset())).to(args.device)

            values.append(value)
            logps.append(logp)
            actions.append(action)
            rewards.append(reward)

        next_value = torch.zeros(1, 1).to(
            device=args.device) if done else model(
                (state.view(1, 1, 160, 160), hx))[0]
        values.append(next_value.detach())

        re = np.asarray(rewards)  # + 1
        _n = LA.norm(re)
        # print(np.nan_to_num((re / _n)))

        # loss = cost_func(args, torch.cat(values).cpu(), torch.cat(logps).cpu(), torch.cat(actions).cpu(), torch.from_numpy(np.asarray(rewards)))
        # loss = cost_func(args, torch.cat(values).cpu(), torch.cat(logps).cpu(), torch.cat(actions).cpu(), torch.from_numpy(np.asarray(rewards)).float().to(args.device))
        loss = cost_func(args,
                         torch.cat(values).cpu(),
                         torch.cat(logps).cpu(),
                         torch.cat(actions).cpu(),
                         torch.from_numpy(np.nan_to_num((re / _n))).cpu())
        eploss += loss.item()
        shared_optimizer.zero_grad()
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(model.parameters(), 40)

        for param, shared_param in zip(model.parameters(),
                                       shared_model.parameters()):
            if shared_param.grad is None:
                shared_param._grad = param.grad  # sync gradients with shared model
        shared_optimizer.step()
Example 28
def run(args, server):
    env = create_env(args.env_id, client_id=str(args.task), remotes=args.remotes)
    if args.teacher:
        teacher = model.LSTMPolicy(env.observation_space.shape, env.action_space.n, name="global")
        teacher_init_op = teacher.load_model_from_checkpoint(args.checkpoint_path)

        trainer = A3C(env, args.task, args.visualise, teacher= teacher, name="student")

    else:
        teacher = None
        trainer = A3C(env, args.task, args.visualise, teacher= teacher)

    # Variable names that start with "local" are not saved in checkpoints.
    if use_tf12_api:
        variables_to_save = trainer.global_var_list
        all_trainable_variables = [v for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) if trainer.scope in v.name]
        init_op = tf.variables_initializer(variables_to_save)
        init_all_op = tf.variables_initializer(all_trainable_variables)

    else:

        variables_to_save = trainer.global_var_list
        init_op = tf.initialize_variables(variables_to_save)
        all_trainable_variables = [v for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) if trainer.scope in v.name]
        init_all_op = tf.variables_initializer(all_trainable_variables)

    saver = FastSaver(variables_to_save)

    logger.info('Trainable vars:')

    for v in all_trainable_variables:
        logger.info('{} {}'.format(v.name, v.get_shape()))

    def init_fn(ses):
        logger.info("Initializing all parameters.")
        ses.run([init_all_op])

    def get_init_fn():
        if args.teacher:
            return tf.contrib.framework.assign_from_checkpoint_fn(
                    args.checkpoint_path,
                    teacher.var_list,
                    ignore_missing_vars=True)
        else:
            return lambda sess: init_fn(sess)

    config = tf.ConfigProto(device_filters=["/job:ps", "/job:worker/task:{}/cpu:0".format(args.task)])
    logdir = os.path.join(args.log_dir, 'train')

    if use_tf12_api:
        summary_writer = tf.summary.FileWriter(logdir + "_{}".format(args.task))
    else:
        summary_writer = tf.train.SummaryWriter(logdir + "_{}".format(args.task))

    logger.info("Events directory: {}_{}".format(logdir, args.task))

    sv = tf.train.Supervisor(is_chief=(args.task == 0),
                             logdir=logdir,
                             saver=saver,
                             summary_op=None,
                             init_op=init_op,
                             init_fn=get_init_fn(),
                             summary_writer=summary_writer,
                             ready_op=tf.report_uninitialized_variables(variables_to_save),
                             global_step=trainer.global_step,
                             save_model_secs=30,
                             save_summaries_secs=30)

    num_global_steps = 100000000

    logger.info(
        "Starting session. If this hangs, we're most likely waiting to connect to the parameter server. " +
        "One common cause is that the parameter server DNS name isn't resolving yet, or is misspecified.")
    with sv.managed_session(server.target, config=config) as sess, sess.as_default():
        sess.run(trainer.sync)
        trainer.start(sess, summary_writer)
        global_step = sess.run(trainer.global_step)
        logger.info("Starting training at step={}".format(global_step))
        while not sv.should_stop() and (not num_global_steps or global_step < num_global_steps):
            trainer.process(sess)
            global_step = sess.run(trainer.global_step)

    # Ask for all the services to stop.
    sv.stop()
    logger.info('reached {} steps. worker stopped.'.format(global_step))
Example 29
def run(env_name, version, act_rep, max_steps, rollout_agent_name,
        behavior_agent_name, eps_greedy, sim_steps, search_horizont,
        gamma=1., exploration=1., prune_tree=False, report_freq=100,
        n_runs=1, save_dir=None, save_freq=10, process=0):

    def save_data():
        if save_dir is not None and len(frames) > 0:
            run_data = {
                'frames': frames,
                'actions': actions,
                'reward': total_reward,
                'action_visits': action_visits,
                'action_values': action_values,
                'rewards': rewards,
                'action_meanings': env.env.get_action_meanings(),
            }
            fname = os.path.join(save_dir, 'run_process_{}_run_{}_steps_{}.pkl'.format(process, n_run, step))
            with open(fname, 'wb') as f:
                cPickle.dump(run_data, f, -1)

            del actions[:]
            del frames[:]
            del action_visits[:]
            del action_values[:]
            del rewards[:]

    env = create_env(env_name, version, act_rep)
    uct.Node.n_actions = env.action_space.n

    # agent for rollouts
    if rollout_agent_name == 'random' or rollout_agent_name is None:
        rollout_agent = RandomAgent(env.action_space.n)
    else:
        rollout_agent = KerasAgent(rollout_agent_name)

    # agent for action selections
    if behavior_agent_name == 'random':
        behavior_agent = RandomAgent(env.action_space.n)
    elif behavior_agent_name == 'uct' or behavior_agent_name is None:
        behavior_agent = 'uct'
    else:
        behavior_agent = KerasAgent(behavior_agent_name)

    if save_dir is not None:
        actions = []
        frames = []
        action_visits = []
        action_values = []
        rewards = []

    for n_run in xrange(n_runs):
        terminal = False

        env.reset()
        _frame = env.env._get_image()

        node = uct.Node(env.clone_state())

        total_reward = 0
        step = 0
        t_start = t0 = time()
        while not terminal:
            # choose uct action
            a_uct = uct.uct_action(env, rollout_agent, node, sim_steps, search_horizont, gamma, exploration)

            # choose action in environment
            if np.random.rand() < eps_greedy:
                a = env.action_space.sample()
            elif behavior_agent == 'uct':
                a = a_uct
            else:
                a = behavior_agent.choose_action(_frame)

            if save_dir is not None:
                actions.append(a_uct)
                frames.append(_frame)
                action_visits.append(node.a_visits)
                action_values.append(node.a_values)

            # do step in environment
            env.restore_state(node.state)
            frame, reward, terminal, _ = env.step(a)
            _frame = env.env._get_image()

            if save_dir is not None:
                rewards.append(reward)

            # create new tree or try to use old tree
            if prune_tree:
                if frame in node.childs[a]:
                    node = node.childs[a][frame]
                    node.parent = None
                else:
                    node = uct.Node(env.clone_state())
            else:
                node = uct.Node(env.clone_state())

            total_reward += reward
            step += 1

            # report progress
            if step % report_freq == 0:
                print 'process: {} run: {}, steps: {}, time: {:.2f}, total reward: {:.2f}'.\
                    format(process, n_run+1, step, time() - t0, total_reward)
                t0 = time()

            # save intermediate result
            if step % save_freq == 0:
                save_data()

            if 0 < max_steps < step:
                break

        print '\nprocess: {}, run: {}, total steps: {}, total time: {:.2f}, total reward: {:.2f}'.\
            format(process, n_run+1, step, time() - t_start, total_reward)

        # save last chunk of data
        save_data()

    env.close()
Example 30
def inference(args):
    """
    It restores the policy weights and runs inference.
    """
    # virtual display (headless remotes)
    virtual_display = Display(visible=0, size=(1400, 900))
    virtual_display.start()

    # define environment
    env = create_env(args.env_id, client_id='0', remotes=None, envWrap=True,
                        acRepeat=1, record=args.record, outdir=args.outdir)
    num_actions = env.action_space.n

    with tf.device("/cpu:0"):
        config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        with tf.Session(config=config) as sess:
            logger.info("Restoring trainable global parameters.")
            saver = tf.train.import_meta_graph(args.ckpt+'.meta')
            saver.restore(sess, args.ckpt)

            probs = tf.get_collection("probs")[0]
            sample = tf.get_collection("sample")[0]
            vf = tf.get_collection("vf")[0]
            state_out_0 = tf.get_collection("state_out_0")[0]
            state_out_1 = tf.get_collection("state_out_1")[0]

            last_state = env.reset()
            if args.render or args.record:
                env.render()
            last_features = np.zeros((1, 256), np.float32); last_features = [last_features, last_features]
            length = 0
            rewards = 0
            mario_distances = np.zeros((args.num_episodes,))
            for i in range(args.num_episodes):
                print("Starting episode %d" % (i + 1))
                if args.random:
                    print('I am a random policy!')
                else:
                    if args.greedy:
                        print('I am a greedy policy!')
                    else:
                        print('I am a sampled policy!')
                while True:
                    # run policy
                    fetched = sess.run([probs, sample, vf, state_out_0, state_out_1] ,
                                {"global/x:0": [last_state], "global/c_in:0": last_features[0], "global/h_in:0": last_features[1]})
                    prob_action, action, value_, features = fetched[0], fetched[1], fetched[2], fetched[3:]

                    # run environment
                    if args.random:
                        stepAct = np.random.randint(0, num_actions)  # random policy
                    else:
                        if args.greedy:
                            stepAct = prob_action.argmax()  # greedy policy
                        else:
                            stepAct = action.argmax()
                    state, reward, terminal, info = env.step(stepAct)

                    # update stats
                    length += 1
                    rewards += reward
                    last_state = state
                    last_features = features
                    if args.render or args.record:
                        env.render(mode='rgb_array') # set to rgb_array by default (assumes running on a headless remote)

                    timestep_limit = env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps')
                    if timestep_limit is None: timestep_limit = env.spec.timestep_limit
                    if terminal or length >= timestep_limit:
                        if length >= timestep_limit or not env.metadata.get('semantics.autoreset'):
                            last_state = env.reset()
                        last_features = np.zeros((1, 256), np.float32); last_features = [last_features, last_features]
                        print("Episode finished. Sum of rewards: %.2f. Length: %d." % (rewards, length))
                        length = 0
                        rewards = 0
                        if args.render or args.record:
                            env.render(mode='rgb_array')
                        break

        logger.info('Finished %d true episodes.', args.num_episodes)
        env.close()
Example 31
parser.add_argument('--no-shared',
                    default=False,
                    help='use an optimizer without shared momentum.')
parser.add_argument('--memsize', type=int, default=256)
parser.add_argument('--device', type=str, default='cuda')

if __name__ == '__main__':
    # os.environ['OMP_NUM_THREADS'] = '1'
    # os.environ['CUDA_VISIBLE_DEVICES'] = "0"

    args = parser.parse_args()
    args.device = torch.device(args.device)

    args.env_name = 'FlappyBird-v0'

    env = create_env()

    shared_model = ActorCritic(1, args.memsize, len(env.action_space))
    shared_model.share_memory()

    if args.no_shared:
        optimizer = None
    else:
        optimizer = my_optim.SharedAdam(shared_model.parameters(), lr=args.lr)
        optimizer.share_memory()

    processes = []

    # counter = mp.Value('i', 0)
    # lock = mp.Lock()
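The fragment above stops before the worker processes are created. Based on the pattern in Example 18, the continuation usually looks roughly like the following sketch (the args.num_processes name and the train argument layout are assumptions, not taken from this snippet):

    # Sketch of a typical continuation (modelled on Example 18; argument layout assumed).
    for rank in range(args.num_processes):
        p = mp.Process(target=train, args=(shared_model, optimizer, rank, args))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()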