def new_env(args):
    config = open(args.config) if args.config != "" else None
    env = create_env(args.env_id, str(args.task), args.remotes, config=config)
    return env
def show(shared_model, global_steps, args):
    setproctitle('{}:show'.format(args.name))
    try:
        env = create_env(args.game_type, args.env_name, 'show', 1)
        model = copy.deepcopy(shared_model)
        gpu_id = args.gpu_ids[-2]
        with torch.cuda.device(gpu_id):
            model = model.cuda() if gpu_id >= 0 else model
        model.eval()
        while True:
            # Sync with the shared model
            with torch.cuda.device(gpu_id):
                model.load_state_dict(shared_model.state_dict())
            model.reset()
            play_game(env, model, args.max_episode_length, render=True, gpu_id=gpu_id)
            if global_steps.value >= args.max_global_steps:
                break
    except KeyboardInterrupt:
        raise
    finally:
        print('Player Finished !!!')
def new_env(args):
    config = args.config
    if isinstance(args.config, str):
        config = open(args.config) if args.config != "" else None
        config = bs4.BeautifulSoup(config, "lxml")
    env = create_env(args.env_id, str(args.task), args.remotes, config=config)
    return env
def run(args):
    env = create_env(args.env_id, client_id=str(args.task), remotes=args.remotes)
    trainer = A3C(env, args.task, args.visualise, args.num_workers, args.worker_id, args.verbose_lvl)

    # Variable names that start with "local" are not saved in checkpoints.
    if use_tf12_api:
        variables_to_save = [v for v in tf.global_variables() if not v.name.startswith("local")]
        init_op = tf.variables_initializer(variables_to_save)
        init_all_op = tf.global_variables_initializer()
    else:
        variables_to_save = [v for v in tf.all_variables() if not v.name.startswith("local")]
        init_op = tf.initialize_variables(variables_to_save)
        init_all_op = tf.initialize_all_variables()

    print(variables_to_save)
    saver = FastSaver(variables_to_save)

    var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)
    logger.info('Trainable vars:')
    for v in var_list:
        logger.info('  %s %s', v.name, v.get_shape())

    def init_fn(ses):
        logger.info("Initializing all parameters.")
        ses.run(init_all_op)

    logdir = os.path.join(args.log_dir, 'train')
    if use_tf12_api:
        summary_writer = tf.summary.FileWriter(logdir + "_%d" % args.task)
    else:
        summary_writer = tf.train.SummaryWriter(logdir + "_%d" % args.task)

    logger.info("Events directory: %s_%s", logdir, args.task)
    logger.info(
        "Starting session. If this hangs, we're most likely waiting to connect to the parameter server. "
        "One common cause is that the parameter server DNS name isn't resolving yet, or is misspecified.")

    with tf.Session() as sess:
        init = tf.global_variables_initializer()
        sess.run(init)
        trainer.start_listen_thread()
        trainer.sync_initial_weights(sess, var_list)
        trainer.start(sess, summary_writer)
        while True:
            trainer.process(sess)
def run(args):
    env = create_env(args.env_id)
    trainer = A3C(env, None, args.visualise, args.intrinsic_type, args.bptt)

    # Variable names that start with "local" are not saved in checkpoints.
    variables_to_save = [v for v in tf.global_variables() if not v.name.startswith("local")]
    init_op = tf.variables_initializer(variables_to_save)
    init_all_op = tf.global_variables_initializer()
    saver = FastSaver(variables_to_save)

    var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)
    logger.info('Trainable vars:')
    for v in var_list:
        logger.info('  %s %s', v.name, v.get_shape())

    def init_fn(ses):
        logger.info("Initializing all parameters.")
        ses.run(init_all_op)

    logdir = os.path.join(args.log_dir, 'train')
    summary_writer = tf.summary.FileWriter(logdir)
    logger.info("Events directory: %s", logdir)

    sv = tf.train.Supervisor(is_chief=True,
                             logdir=logdir,
                             saver=saver,
                             summary_op=None,
                             init_op=init_op,
                             init_fn=init_fn,
                             summary_writer=summary_writer,
                             ready_op=tf.report_uninitialized_variables(variables_to_save),
                             global_step=None,
                             save_model_secs=0,
                             save_summaries_secs=0)

    video_dir = os.path.join(args.log_dir, 'test_videos_' + args.intrinsic_type)
    if not os.path.exists(video_dir):
        os.makedirs(video_dir)
    video_filename = video_dir + "/%s_%02d_%d.gif"
    print("Video saved at %s" % video_dir)

    with sv.managed_session() as sess, sess.as_default():
        trainer.start(sess, summary_writer)
        rewards = []
        lengths = []
        for i in range(10):
            frames, reward, length = trainer.evaluate(sess)
            rewards.append(reward)
            lengths.append(length)
            imageio.mimsave(video_filename % (args.env_id, i, reward), frames, fps=30)
        print('Evaluation: avg. reward %.2f avg. length %.2f' %
              (sum(rewards) / 10.0, sum(lengths) / 10.0))

    # Ask for all the services to stop.
    sv.stop()
def main():
    args = parse_args()
    env = envs.create_env(args.domain, args.task, args.verbose)
    agent = agents.create_agent(args.model, env, args.verbose)
    if args.train:
        agent.train(env, args.save_model, args.verbose, args.display, args.save_training_curve)
    elif args.eval:
        agent.eval(env, args.verbose, args.display)
def run(args, server):
    env = create_env(args.env_id, client_id=str(args.task), remotes=args.remotes)
    trainer = A3C(env, args.task, args.visualise)

    # Variable names that start with "local" are not saved in checkpoints.
    variables_to_save = [v for v in tf.global_variables() if not v.name.startswith("local")]
    init_op = tf.variables_initializer(variables_to_save)
    init_all_op = tf.global_variables_initializer()
    saver = FastSaver(variables_to_save)

    var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)
    logger.info('Trainable vars:')
    for v in var_list:
        logger.info('  %s %s', v.name, v.get_shape())

    def init_fn(ses):
        logger.info("Initializing all parameters.")
        ses.run(init_all_op)

    config = tf.ConfigProto(device_filters=["/job:ps", "/job:worker/task:{}/cpu:0".format(args.task)])
    logdir = os.path.join(args.log_dir, 'train')
    summary_writer = tf.summary.FileWriter(logdir + "_%d" % args.task)
    logger.info("Events directory: %s_%s", logdir, args.task)

    sv = tf.train.Supervisor(is_chief=(args.task == 0),
                             logdir=logdir,
                             saver=saver,
                             summary_op=None,
                             init_op=init_op,
                             init_fn=init_fn,
                             summary_writer=summary_writer,
                             ready_op=tf.report_uninitialized_variables(variables_to_save),
                             global_step=trainer.global_step,
                             save_model_secs=30,
                             save_summaries_secs=30)

    num_global_steps = 100000000

    logger.info(
        "Starting session. If this hangs, we're most likely waiting to connect to the parameter server. "
        "One common cause is that the parameter server DNS name isn't resolving yet, or is misspecified.")
    with sv.managed_session(server.target, config=config) as sess, sess.as_default():
        sess.run(trainer.sync)
        trainer.start(sess, summary_writer)
        global_step = sess.run(trainer.global_step)
        logger.info("Starting training at step=%d", global_step)
        while not sv.should_stop() and (not num_global_steps or global_step < num_global_steps):
            trainer.process(sess)
            global_step = sess.run(trainer.global_step)

    # Ask for all the services to stop.
    sv.stop()
    logger.info('reached %s steps. worker stopped.', global_step)
def new_env(args):
    config = open(args.config) if args.config != "" else None
    env = create_env(args.env_id, str(args.task), args.remotes, config=config)
    if args.env_id == 'maze':
        return env
    path = os.getcwd() + '/tmp/vpn_record_1'
    if os.path.exists(path):
        print("removing old directory " + path)
    env = wrappers.Monitor(env, path)
    return env
def __init__(self, env_name, actor_id, logdir="results/", start=True):
    env = create_env(env_name)
    self.id = actor_id
    num_actions = env.action_space.n
    self.policy = LSTMPolicy(env.observation_space.shape, num_actions, actor_id)
    self.runner = RunnerThread(env, self.policy, 20)
    self.env = env
    self.logdir = logdir
    if start:
        self.start()
def run(args, server):
    env = create_env(args.env_id, client_id=str(args.task), remotes=args.remotes, num_trials=args.num_trials)
    trainer = A3C(env, args.task, args.visualise, args.meta, args.remotes, args.num_trials)

    # Logging, checkpoints and TensorBoard.
    # (Original comment) Variable names that start with "local" are not saved in checkpoints.
    if use_tf12_api:
        variables_to_save = [v for v in tf.global_variables() if not v.name.startswith("local")]
        init_op = tf.variables_initializer(variables_to_save)
        init_all_op = tf.global_variables_initializer()
    else:
        variables_to_save = [v for v in tf.all_variables() if not v.name.startswith("local")]
        init_op = tf.initialize_variables(variables_to_save)
        init_all_op = tf.initialize_all_variables()

    saver = FastSaver(variables_to_save)

    var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)
    logger.info('Trainable vars:')
    for v in var_list:
        logger.info('  %s %s', v.name, v.get_shape())

    def init_fn(ses):
        logger.info("Initializing all parameters.")
        ses.run(init_all_op)

    config = tf.ConfigProto(device_filters=["/job:ps", "/job:worker/task:{}/cpu:0".format(args.task)])
    logdir = os.path.join(args.log_dir, 'train')

    if use_tf12_api:
        summary_writer = tf.summary.FileWriter(logdir + "_%d" % args.task)
    else:
        summary_writer = tf.train.SummaryWriter(logdir + "_%d" % args.task)

    logger.info("Events directory: %s_%s", logdir, args.task)

    # tf.train.Supervisor provides a set of services that help implement a robust training process.
    sv = tf.train.Supervisor(is_chief=(args.task == 0),
                             logdir=logdir,
                             saver=saver,
                             summary_op=None,
                             init_op=init_op,
                             init_fn=init_fn,
                             summary_writer=summary_writer,
                             ready_op=tf.report_uninitialized_variables(variables_to_save),
                             global_step=trainer.global_step,
                             save_model_secs=30,
                             save_summaries_secs=30)

    if args.test:
        # testing phase
        run_test(trainer, sv, config, summary_writer, server)
    else:
        # training phase
        run_train(trainer, sv, config, summary_writer, server)
def main():
    env = envs.create_env(None)
    act = deepq.load("{}_model.pkl".format(envs.VSTR))

    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            env.render()
            obs, rew, done, _ = env.step(act([obs])[0])
            episode_rew += rew
        print("Episode reward", episode_rew)
def play(args, server):
    env = create_env(args.env_id, client_id=str(args.task), remotes=args.remotes)
    trainer = A3C(env, args.task, args.visualise)
    result = []
    """
    Implement your code here.

    Condition:
        The purpose of this function is testing.
        The number of episodes is 20.
        Return the mean of the rewards over the 20 episodes.
    """
    return np.mean(result)
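# A possible way to fill in the evaluation loop above -- a sketch only, assuming the
# trainer exposes a policy with the act()/get_initial_features() interface used in the
# other snippets in this file; the helper name `play_episodes` and that interface are
# assumptions, not the assignment's required API.
def play_episodes(env, policy, num_episodes=20):
    import numpy as np
    episode_rewards = []
    for _ in range(num_episodes):
        state = env.reset()
        features = policy.get_initial_features()  # reset LSTM state (assumed helper)
        done, total = False, 0.0
        while not done:
            fetched = policy.act(state, *features)  # assumed to return (action, value, *lstm_state)
            action, features = fetched[0], fetched[2:]
            state, reward, done, _ = env.step(action.argmax())
            total += reward
        episode_rewards.append(total)
    return np.mean(episode_rewards)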
def main(env, snapshot, visualise):
    env = create_env(env, client_id=0, remotes=1)
    with tf.variable_scope("global"):
        policy = LSTMPolicy(env.observation_space.shape, env.action_space.n)

    last_state = env.reset()
    # state = last_state
    last_features = policy.get_initial_features()
    length = 0
    rewards = 0

    variables_to_save = [v for v in tf.global_variables() if not v.name.startswith("local")]
    saver = tf.train.Saver(variables_to_save)

    with tf.Session() as sess:
        # Restore variables from disk.
        # saver.restore(sess, "train/model.ckpt-361814.data-00000-of-00001")
        # saver.restore(sess, "train/model.ckpt-361814")
        # saver.restore(sess, "/tmp/neonrace/train/model.ckpt-361714")
        saver.restore(sess, snapshot)

        while True:
            terminal_end = False
            fetched = policy.act(last_state, *last_features)
            action, value_, features = fetched[0], fetched[1], fetched[2:]

            # state, reward, terminal, info = env.step(action.argmax())
            action_n = action.argmax()
            # state, reward, terminal, info = env.step(default_action)
            state, reward, terminal, info = env.step(action_n)
            if visualise:
                env.render()
            # env.render()  # visualize it during testing
            print('length: %d, rewards: %f' % (length, rewards))

            length += 1
            rewards += reward
            last_state = state
            last_features = features

            if terminal:
                terminal_end = True
                print("Episode finished. Sum of rewards: %d. Length: %d" % (rewards, length))
                length = 0
                rewards = 0
                break
def main():
    env = envs.create_env(None)
    model = models.mlp([64])
    act = simple.learn(
        env,
        q_func=model,
        lr=1e-3,
        max_timesteps=100000,
        buffer_size=50000,
        exploration_fraction=0.01,
        exploration_final_eps=0.0,
        print_freq=10,
        callback=callback,
        prioritized_replay=True
    )
    print("Saving model to {}_model.pkl".format(envs.VSTR))
    act.save("{}_model.pkl".format(envs.VSTR))
def train(num_workers, env_name="PongDeterministic-v3"): env = create_env(env_name, None, None) policy = LSTMPolicy(env.observation_space.shape, env.action_space.n, 0) agents = [Runner(env_name, i) for i in range(num_workers)] parameters = policy.get_weights() gradient_list = [agent.compute_gradient(parameters) for agent in agents] steps = 0 obs = 0 while True: done_id, gradient_list = ray.wait(gradient_list) gradient, info = ray.get(done_id)[0] policy.model_update(gradient) parameters = policy.get_weights() steps += 1 obs += info["size"] gradient_list.extend([agents[info["id"]].compute_gradient(parameters)]) return policy
def train(num_workers, env_name="PongDeterministic-v3"): env = create_env(env_name) ps = ParameterServer(env) parameters = ps.get_weights() agents = [Runner.remote(env_name, i) for i in range(num_workers)] delta_list = [agent.get_delta.remote(parameters) for agent in agents] steps = 0 obs = 0 timing = [] for i in range(2000): done_id, delta_list = ray.wait(delta_list) delta, info = ray.get(done_id)[0] ps.add_delta(delta) parameters = ps.weights obs += info["size"] delta_list.extend( [agents[info["id"]].compute_gradient.remote(parameters)]) return policy
def evaluate(constants):
    env, action_space = create_env(constants.env)
    agent = A3CAgent(action_space)
    worker = Worker(env.proxy, agent, GLOBAL_SCOPE, constants)
    worker.build_rollout()

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())

    # TODO: Model loading for A2C.
    if FLAGS.load_model:
        vars_to_save_load = tf.trainable_variables(GLOBAL_SCOPE)
        saver = tf.train.Saver(vars_to_save_load)
        worker.load_model(sess, saver, constants.model_directory)

    worker.evaluate(sess)
    env.close()
    sess.close()
def async_train(args, make_model, train):
    setproctitle('{}:main'.format(args.name))
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    env = create_env(args.game_type, args.env_name, 'main', 1)
    shared_model = make_model(env.observation_space.shape[0], env.action_space.n)
    shared_model.share_memory()

    if args.no_shared_optimizer:
        optimizer = None
    else:
        optimizer = SharedAdam(shared_model.parameters(), lr=args.lr)
        optimizer.share_memory()

    mp.set_start_method('spawn')
    global_steps = mp.Value('L', 0)

    processes = []
    processes.append(mp.Process(target=test, args=(shared_model, global_steps, args)))
    if not args.no_render:
        processes.append(mp.Process(target=show, args=(shared_model, global_steps, args)))
    for rank in range(args.n_processes):
        processes.append(mp.Process(target=train, args=(shared_model, optimizer, rank, global_steps, args)))

    for p in processes:
        p.start()
        time.sleep(0.1)
    for p in processes:
        p.join()
    print('Main process finished !!!')
def test_penalty_env(env):
    import envs
    env = envs.create_env("Pong",
                          location="bottom",
                          catastrophe_type="1",
                          classifier_file=save_classifier_path + '/0/final.ckpt')
    import matplotlib.pyplot as plt

    observation = env.reset()
    for _ in range(20):
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)

        plt.imshow(observation[:, :, 0])
        plt.show()

        print('Cat: ', info['frame/is_catastrophe'])
        print('reward: ', reward)
        if done:
            break
def main():
    args = TrainOptions().parse()
    device = torch.device('cuda') if (not args.no_cuda and torch.cuda.is_available()) else torch.device('cpu')

    if not os.path.exists(args.output):
        os.makedirs(args.output)

    env = create_env(args)
    network = create_network(args, env.action_space.n, env.observation_space.shape)
    network.to(device)

    optimizer = Adam(network.parameters(), lr=args.lr)
    policy = AnnealedEpsilonGreedyPolicy(epsilon_max=args.epsilon_max,
                                         epsilon_min=args.epsilon_min,
                                         exploration_steps=args.exp_steps)
    memory = SimpleExperienceReplay(max_size=args.mem_max, batch_size=args.batch_size)
    logger = Logger()
    agent = create_agent(args, env, network, policy, memory, optimizer, logger)

    # train agent
    agent.learn(n_episodes=args.n_ep,
                ep_max_step=args.ep_max_step,
                replay_start_size=args.replay_start,
                save_every=args.freq_save_model,
                update_target_every=args.freq_target_update,
                render_every=args.freq_render)
def main(args):
    env_id = args.env_id
    max_episodes = args.max_episodes
    ckpt_dir = args.ckpt_dir
    output_dir = args.output_dir

    # env
    env = create_env(env_id, 0, 1)
    if len(output_dir) > 0:
        env = wrappers.Monitor(env, output_dir)
    if args.render:
        env.render()

    # work-around to the nasty env.render() failing issue when working with tensorflow
    # see https://github.com/openai/gym/issues/418
    import tensorflow as tf
    from model import Convx2LSTMActorCritic

    # model
    sess = tf.Session()
    with tf.variable_scope("global"):
        network = Convx2LSTMActorCritic(env.observation_space.shape, env.action_space.n)
    init = tf.global_variables_initializer()
    sess.run(init)

    # load model parameters
    checkpoint = tf.train.get_checkpoint_state(ckpt_dir)
    if checkpoint and checkpoint.model_checkpoint_path:
        saver = tf.train.Saver()
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        raise Exception('cannot find checkpoint path')

    # run evaluating
    with sess.as_default():
        evaluate_loop(env, network, max_episodes, args)
def evaluate_main(env_id, model_id, max_episodes, ckpt_dir, output_dir,
                  sleep_time, render, verbose, with_global_step=False):
    # env
    env = create_env(env_id, 0, 1)
    if len(output_dir) > 0:
        # output recording
        env = wrappers.Monitor(env, output_dir)
    if render:
        env.render()

    is_obs_tuple = isinstance(env.observation_space, Tuple)
    observation_shape = [sp.shape for sp in env.observation_space.spaces] if is_obs_tuple \
        else [env.observation_space.shape]
    action_shape = [env.action_space.n] if isinstance(env.action_space, spaces.Discrete) \
        else [env.action_space.shape[0]]

    # work-around to the nasty env.render() failing issue when working with tensorflow
    # see https://github.com/openai/gym/issues/418
    import tensorflow as tf
    from model import create_model

    use_tf_0_12_api = distutils.version.LooseVersion(tf.VERSION) >= distutils.version.LooseVersion('0.12.0') and \
        distutils.version.LooseVersion(tf.VERSION) <= distutils.version.LooseVersion('0.12.1')
    use_tf_1_1_api = distutils.version.LooseVersion(tf.VERSION) == distutils.version.LooseVersion('1.1.0')

    # model
    tf.reset_default_graph()
    sess = tf.Session()
    with tf.variable_scope("global"):
        network = create_model(model_id, *observation_shape + action_shape)
        if with_global_step:
            global_step = tf.get_variable("global_step", [], tf.int32,
                                          initializer=tf.constant_initializer(0, dtype=tf.int32),
                                          trainable=False)
    init = tf.global_variables_initializer()
    sess.run(init)

    # load model parameters
    checkpoint = tf.train.get_checkpoint_state(ckpt_dir)
    if checkpoint and checkpoint.model_checkpoint_path:
        # Checkpoints written by TF 0.12.x and TF 1.1 use different LSTM variable names;
        # build a name map so either kind can be restored with the current graph.
        restore_tf_0_12_model = False
        restore_tf_1_1_model = False
        reader = tf.train.NewCheckpointReader(checkpoint.model_checkpoint_path)
        for var_name in reader.get_variable_to_shape_map():
            if 'RNN/BasicLSTMCell/Linear' in var_name:
                restore_tf_0_12_model = True
                break
            elif 'rnn/basic_lstm_cell/' in var_name:
                restore_tf_1_1_model = True
                break

        if use_tf_1_1_api and restore_tf_0_12_model:
            var_dict = {}
            for var in tf.global_variables():
                name = var.name.split(':')[0]
                if 'rnn/basic_lstm_cell/weights' in name:
                    name = name.replace('rnn/basic_lstm_cell/weights', 'RNN/BasicLSTMCell/Linear/Matrix')
                elif 'rnn/basic_lstm_cell/biases' in name:
                    name = name.replace('rnn/basic_lstm_cell/biases', 'RNN/BasicLSTMCell/Linear/Bias')
                var_dict[name] = var
            saver = tf.train.Saver(var_dict)
        elif use_tf_0_12_api and restore_tf_1_1_model:
            var_dict = {}
            for var in tf.global_variables():
                name = var.name.split(':')[0]
                if 'RNN/BasicLSTMCell/Linear/Matrix' in name:
                    name = name.replace('RNN/BasicLSTMCell/Linear/Matrix', 'rnn/basic_lstm_cell/weights')
                elif 'RNN/BasicLSTMCell/Linear/Bias' in name:
                    name = name.replace('RNN/BasicLSTMCell/Linear/Bias', 'rnn/basic_lstm_cell/biases')
                var_dict[name] = var
            saver = tf.train.Saver(var_dict)
        else:
            saver = tf.train.Saver()
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        raise Exception('cannot find checkpoint path')

    # run evaluating
    with sess.as_default():
        ret = evaluate_loop(env, network, max_episodes, sleep_time, render, verbose)

    env.close()
    if with_global_step:
        global_step_result = sess.run(global_step)
    sess.close()

    if with_global_step:
        return ret, global_step_result
    return ret
def generate_data(rank, args, start, end):
    from envs import create_env, set_seed, get_obs
    from model import R_Module
    import torch

    print(rank, "started")
    env = create_env(args.env_name, framework=args.framework, args=args)
    env = set_seed(args.seed + rank, env, args.framework)
    state = get_obs(env, args.framework)

    if args.from_policy is not None:
        model_state, r_args = torch.load(args.from_policy)
        policy = R_Module(env.action_space.shape[0],
                          r_args.dim,
                          discrete=r_args.discrete,
                          baseline=r_args.baseline,
                          state_space=env.observation_space.shape[0])
        policy.load_state_dict(model_state)
        policy.eval()

    states = []
    actions = []
    i = start
    done = False
    while i < end:
        if i % 100 == 0:
            print(rank, i)
        ep_states = []
        ep_actions = []
        if args.from_policy is not None:
            cx_p = Variable(torch.zeros(1, r_args.dim))
            hx_p = Variable(torch.zeros(1, r_args.dim))
        for j in range(args.rollout):
            if args.from_policy is not None:
                value, logit, (hx_p, cx_p) = policy(state.unsqueeze(0), (hx_p, cx_p))
                a, _, _ = get_action(logit, r_args.discrete)
            else:
                a = env.action_space.sample()
            ep_actions.append(a)
            state = get_obs(env, args.framework)
            env.step(a)
            if args.render:
                env.render()
            ep_states.append(state)
        final_state = get_obs(env, args.framework)
        ep_states.append(final_state)
        states.append(ep_states)
        actions.append(ep_actions)
        i += 1
        # reset the environment here
        if done or args.reset:
            env.reset()
            done = False
    torch.save((states, actions),
               os.path.join(args.out_dir, 'states_actions_%s_%s.pt' % (start, end)))
import torch.multiprocessing as mp
mp.set_start_method('spawn')
from torch.autograd import Variable

from envs import create_env, set_seed, get_obs
from model import R_Module

os.environ['OMP_NUM_THREADS'] = '1'

args = parser.parse_args()
env_name = args.env_name
env_name += '_rollout%s' % args.rollout
if args.env_name.endswith('MazeEnv'):
    env_name += 'mazeid%slength%s' % (args.maze_id, args.maze_length)
if args.single_env and args.maze_id == -1:
    env = create_env(args.env_name, framework=args.framework, args=args)
    env_name += '_single_env'
    args.maze_structure = env._env.MAZE_STRUCTURE
if args.random_start:
    env_name += '_randomstart'
if args.file_path is not None:
    env_name += '_transfer'
if args.framework == 'mazebase':
    env_name += '_rollout_%s_length_%s' % (args.rollout, args.maze_length)
args.out_dir = os.path.join(args.out, env_name)
print(args)
print(args.out_dir)
os.makedirs(args.out_dir, exist_ok=True)

processes = []
block = int(args.N / args.num_processes)
def run(args, server):
    env = create_env(args.env_id,
                     client_id=str(args.task),
                     remotes=args.remotes,
                     envWrap=args.envWrap,
                     designHead=args.designHead,
                     noLifeReward=args.noLifeReward)
    trainer = A3C(env, args.task, args.visualise, args.unsup,
                  args.envWrap, args.designHead, args.noReward)

    # logging
    if args.task == 0:
        with open(args.log_dir + '/log.txt', 'w') as fid:
            for key, val in constants.items():
                fid.write('%s: %s\n' % (str(key), str(val)))
            fid.write('designHead: %s\n' % args.designHead)
            fid.write('input observation: %s\n' % str(env.observation_space.shape))
            fid.write('env name: %s\n' % str(env.spec.id))
            fid.write('unsup method type: %s\n' % str(args.unsup))

    # Variable names that start with "local" are not saved in checkpoints.
    if use_tf12_api:
        variables_to_save = [v for v in tf.global_variables() if not v.name.startswith("local")]
        init_op = tf.variables_initializer(variables_to_save)
        init_all_op = tf.global_variables_initializer()
    else:
        variables_to_save = [v for v in tf.all_variables() if not v.name.startswith("local")]
        init_op = tf.initialize_variables(variables_to_save)
        init_all_op = tf.initialize_all_variables()
    saver = FastSaver(variables_to_save)
    if args.pretrain is not None:
        variables_to_restore = [v for v in tf.trainable_variables() if not v.name.startswith("local")]
        pretrain_saver = FastSaver(variables_to_restore)

    var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)
    logger.info('Trainable vars:')
    for v in var_list:
        logger.info('  %s %s', v.name, v.get_shape())

    def init_fn(ses):
        logger.info("Initializing all parameters.")
        ses.run(init_all_op)
        if args.pretrain is not None:
            pretrain = tf.train.latest_checkpoint(args.pretrain)
            logger.info("==> Restoring from given pretrained checkpoint.")
            logger.info("    Pretraining address: %s", pretrain)
            pretrain_saver.restore(ses, pretrain)
            logger.info("==> Done restoring model! Restored %d variables.", len(variables_to_restore))

    config = tf.ConfigProto(device_filters=["/job:ps", "/job:worker/task:{}/cpu:0".format(args.task)])
    logdir = os.path.join(args.log_dir, 'train')

    if use_tf12_api:
        summary_writer = tf.summary.FileWriter(logdir + "_%d" % args.task)
    else:
        summary_writer = tf.train.SummaryWriter(logdir + "_%d" % args.task)

    logger.info("Events directory: %s_%s", logdir, args.task)
    sv = tf.train.Supervisor(is_chief=(args.task == 0),
                             logdir=logdir,
                             saver=saver,
                             summary_op=None,
                             init_op=init_op,
                             init_fn=init_fn,
                             summary_writer=summary_writer,
                             ready_op=tf.report_uninitialized_variables(variables_to_save),
                             global_step=trainer.global_step,
                             save_model_secs=30,
                             save_summaries_secs=30)

    num_global_steps = constants['MAX_GLOBAL_STEPS']

    logger.info(
        "Starting session. If this hangs, we're most likely waiting to connect to the parameter server. "
        "One common cause is that the parameter server DNS name isn't resolving yet, or is misspecified.")
    with sv.managed_session(server.target, config=config) as sess, sess.as_default():
        # Workaround for FailedPreconditionError
        # see: https://github.com/openai/universe-starter-agent/issues/44 and 31
        sess.run(trainer.sync)

        trainer.start(sess, summary_writer)
        global_step = sess.run(trainer.global_step)
        logger.info("Starting training at global_step=%d", global_step)
        while not sv.should_stop() and (not num_global_steps or global_step < num_global_steps):
            trainer.process(sess)
            global_step = sess.run(trainer.global_step)

    # Ask for all the services to stop.
    sv.stop()
    logger.info('reached %s steps. worker stopped.', global_step)
def inference(args):
    """
    Restores the LSTMPolicy architecture from a checkpoint and runs inference with it.
    """
    # get address of checkpoints
    indir = os.path.join(args.log_dir, 'train')
    outdir = os.path.join(args.log_dir, 'inference') if args.out_dir is None else args.out_dir
    with open(indir + '/checkpoint', 'r') as f:
        first_line = f.readline().strip()
    ckpt = first_line.split(' ')[-1].split('/')[-1][:-1]
    ckpt = ckpt.split('-')[-1]
    ckpt = indir + '/model.ckpt-' + ckpt

    # define environment
    if args.record:
        env = create_env(args.env_id, client_id='0', remotes=None, envWrap=args.envWrap,
                         designHead=args.designHead, record=True, noop=args.noop,
                         acRepeat=args.acRepeat, outdir=outdir)
    else:
        env = create_env(args.env_id, client_id='0', remotes=None, envWrap=args.envWrap,
                         designHead=args.designHead, record=True, noop=args.noop,
                         acRepeat=args.acRepeat)
    numaction = env.action_space.n

    with tf.device("/cpu:0"):
        # define policy network
        with tf.variable_scope("global"):
            policy = LSTMPolicy(env.observation_space.shape, numaction, args.designHead)
            policy.global_step = tf.get_variable("global_step", [], tf.int32,
                                                 initializer=tf.constant_initializer(0, dtype=tf.int32),
                                                 trainable=False)

        # Variable names that start with "local" are not saved in checkpoints.
        if use_tf12_api:
            variables_to_restore = [v for v in tf.global_variables() if not v.name.startswith("local")]
            init_all_op = tf.global_variables_initializer()
        else:
            variables_to_restore = [v for v in tf.all_variables() if not v.name.startswith("local")]
            init_all_op = tf.initialize_all_variables()
        saver = FastSaver(variables_to_restore)

        # print trainable variables
        var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)
        logger.info('Trainable vars:')
        for v in var_list:
            logger.info('  %s %s', v.name, v.get_shape())

        # summary of rewards
        action_writers = []
        if use_tf12_api:
            summary_writer = tf.summary.FileWriter(outdir)
            for ac_id in range(numaction):
                action_writers.append(tf.summary.FileWriter(os.path.join(outdir, 'action_{}'.format(ac_id))))
        else:
            summary_writer = tf.train.SummaryWriter(outdir)
            for ac_id in range(numaction):
                action_writers.append(tf.train.SummaryWriter(os.path.join(outdir, 'action_{}'.format(ac_id))))
        logger.info("Inference events directory: %s", outdir)

        config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        with tf.Session(config=config) as sess:
            logger.info("Initializing all parameters.")
            sess.run(init_all_op)
            logger.info("Restoring trainable global parameters.")
            saver.restore(sess, ckpt)
            logger.info("Restored model was trained for %.2fM global steps",
                        sess.run(policy.global_step) / 1000000.)

            # saving with meta graph:
            metaSaver = tf.train.Saver(variables_to_restore)
            metaSaver.save(sess, '/home/swagking0/noreward-rl/models/models_me/mario_me')

            last_state = env.reset()
            if args.render or args.record:
                env.render()
            last_features = policy.get_initial_features()  # reset lstm memory
            length = 0
            rewards = 0
            mario_distances = np.zeros((args.num_episodes,))
            for i in range(args.num_episodes):
                print("Starting episode %d" % (i + 1))
                if args.recordSignal:
                    from PIL import Image
                    signalCount = 1
                    utils.mkdir_p(outdir + '/recordedSignal/ep_%02d/' % i)
                    Image.fromarray((255 * last_state[..., -1]).astype('uint8')).save(
                        outdir + '/recordedSignal/ep_%02d/%06d.jpg' % (i, signalCount))

                if args.random:
                    print('I am random policy!')
                else:
                    if args.greedy:
                        print('I am greedy policy!')
                    else:
                        print('I am sampled policy!')

                while True:
                    # run policy
                    fetched = policy.act_inference(last_state, *last_features)
                    prob_action, action, value_, features = fetched[0], fetched[1], fetched[2], fetched[3:]

                    # run environment: sampled one-hot 'action' (not greedy)
                    if args.random:
                        stepAct = np.random.randint(0, numaction)  # random policy
                    else:
                        if args.greedy:
                            stepAct = prob_action.argmax()  # greedy policy
                        else:
                            stepAct = action.argmax()
                    state, reward, terminal, info = env.step(stepAct)

                    # update stats
                    length += 1
                    rewards += reward
                    last_state = state
                    last_features = features
                    if args.render or args.record:
                        env.render()
                    if args.recordSignal:
                        signalCount += 1
                        Image.fromarray((255 * last_state[..., -1]).astype('uint8')).save(
                            outdir + '/recordedSignal/ep_%02d/%06d.jpg' % (i, signalCount))

                    # store summary
                    summary = tf.Summary()
                    summary.value.add(tag='ep_{}/reward'.format(i), simple_value=reward)
                    summary.value.add(tag='ep_{}/netreward'.format(i), simple_value=rewards)
                    summary.value.add(tag='ep_{}/value'.format(i), simple_value=float(value_[0]))
                    if 'NoFrameskip-v' in args.env_id:  # atari
                        summary.value.add(tag='ep_{}/lives'.format(i),
                                          simple_value=env.unwrapped.ale.lives())
                    summary_writer.add_summary(summary, length)
                    summary_writer.flush()
                    summary = tf.Summary()
                    for ac_id in range(numaction):
                        summary.value.add(tag='action_prob', simple_value=float(prob_action[ac_id]))
                        action_writers[ac_id].add_summary(summary, length)
                        action_writers[ac_id].flush()

                    timestep_limit = env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps')
                    if timestep_limit is None:
                        timestep_limit = env.spec.timestep_limit
                    if terminal or length >= timestep_limit:
                        if length >= timestep_limit or not env.metadata.get('semantics.autoreset'):
                            last_state = env.reset()
                        last_features = policy.get_initial_features()  # reset lstm memory
                        print("Episode finished. Sum of rewards: %.2f. Length: %d." % (rewards, length))
                        if 'distance' in info:
                            print('Mario Distance Covered:', info['distance'])
                            mario_distances[i] = info['distance']
                        length = 0
                        rewards = 0
                        if args.render or args.record:
                            env.render()
                        if args.recordSignal:
                            signalCount += 1
                            Image.fromarray((255 * last_state[..., -1]).astype('uint8')).save(
                                outdir + '/recordedSignal/ep_%02d/%06d.jpg' % (i, signalCount))
                        break

            logger.info('Finished %d true episodes.', args.num_episodes)
            if 'distance' in info:
                print('Mario Distances:', mario_distances)
                np.save(outdir + '/distances.npy', mario_distances)
            env.close()
def train(shared_model, shared_optimizer, rank, args, info):
    env = create_env()  # make a local (unshared) environment
    torch.manual_seed(args.seed + rank)  # seed everything
    model = NNPolicy(channels=1, memsize=args.hidden,
                     num_actions=args.num_actions).to(device=args.device)  # a local/unshared model
    state = torch.tensor(prepro(env.reset())).to(device=args.device)  # get first state

    start_time = last_disp_time = time.time()
    episode_length, epr, eploss, done = 0, 0, 0, True  # bookkeeping
    steps_done = 1

    while info['frames'][0] <= 8e8 or args.test:  # openai baselines uses 40M frames... we'll use 80M
        model.load_state_dict(shared_model.state_dict())  # sync with shared model

        # hx = torch.zeros(1, args.hidden) if done else hx.detach()  # rnn activation vector
        hx = torch.randn(1, args.hidden) if done else hx.detach()
        values, logps, actions, rewards = [], [], [], []  # save values for computing gradients

        for step in range(args.rnn_steps):
            # for step in range(0, np.random.randint(10, 40)):
            episode_length += 1
            value, logit, hx = model((state.view(1, 1, 160, 160), hx.to(device=args.device)))
            logp = F.log_softmax(logit, dim=-1)

            action = torch.exp(logp).multinomial(num_samples=1).data[0]  # logp.max(1)[1].data if args.test else
            state, reward, done = env.step(action.cpu().item())  # action.cpu().numpy()[0]
            state = torch.tensor(prepro(state)).to(args.device)
            epr += reward
            # reward = np.clip(reward, -1, 1)  # reward
            done = done or episode_length >= 1e4  # don't play one episode for too long

            info['frames'].add_(1)
            num_frames = int(info['frames'].item())
            if num_frames % 1e6 == 0:  # save every 1M frames
                torch.save(shared_model.state_dict(),
                           args.save_dir + 'model.{:.0f}.tar'.format(num_frames / 1e6))
                printlog(args, '\n\t{:.0f}M frames: saved model\n'.format(num_frames / 1e6))

            if done:  # update shared data
                info['episodes'] += 1
                interp = 1 if info['episodes'][0] == 1 else 1 - args.horizon
                info['run_epr'].mul_(1 - interp).add_(interp * epr)
                info['run_loss'].mul_(1 - interp).add_(interp * eploss)

            if rank == 0 and time.time() - last_disp_time > 60:  # print info ~ every minute
                elapsed = time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time))
                printlog2(args, info['run_epr'].item(), info['run_loss'].item(), num_frames,
                          'time {}, episodes {:.0f}, frames {:.1f}M, mean epr {:.2f}, run loss {:.2f}'
                          .format(elapsed, info['episodes'].item(), num_frames / 1e6,
                                  info['run_epr'].item(), info['run_loss'].item()))
                last_disp_time = time.time()

            if done:  # reset episode bookkeeping
                # reward = 500
                episode_length, epr, eploss = 0, 0, 0
                state = torch.tensor(prepro(env.reset())).to(args.device)

            values.append(value)
            logps.append(logp)
            actions.append(action)
            rewards.append(reward)

        next_value = torch.zeros(1, 1).to(device=args.device) if done else \
            model((state.view(1, 1, 160, 160), hx))[0]
        values.append(next_value.detach())

        re = np.asarray(rewards)  # + 1
        _n = LA.norm(re)
        # print(np.nan_to_num((re / _n)))
        # loss = cost_func(args, torch.cat(values).cpu(), torch.cat(logps).cpu(), torch.cat(actions).cpu(),
        #                  torch.from_numpy(np.asarray(rewards)))
        # loss = cost_func(args, torch.cat(values).cpu(), torch.cat(logps).cpu(), torch.cat(actions).cpu(),
        #                  torch.from_numpy(np.asarray(rewards)).float().to(args.device))
        loss = cost_func(args, torch.cat(values).cpu(), torch.cat(logps).cpu(), torch.cat(actions).cpu(),
                         torch.from_numpy(np.nan_to_num(re / _n)).cpu())
        eploss += loss.item()
        shared_optimizer.zero_grad()
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(model.parameters(), 40)

        for param, shared_param in zip(model.parameters(), shared_model.parameters()):
            if shared_param.grad is None:
                shared_param._grad = param.grad  # sync gradients with shared model
        shared_optimizer.step()
def run(args, server):
    env = create_env(args.env_id, client_id=str(args.task), remotes=args.remotes)
    if args.teacher:
        teacher = model.LSTMPolicy(env.observation_space.shape, env.action_space.n, name="global")
        teacher_init_op = teacher.load_model_from_checkpoint(args.checkpoint_path)
        trainer = A3C(env, args.task, args.visualise, teacher=teacher, name="student")
    else:
        teacher = None
        trainer = A3C(env, args.task, args.visualise, teacher=teacher)

    # Variable names that start with "local" are not saved in checkpoints.
    if use_tf12_api:
        variables_to_save = trainer.global_var_list
        all_trainable_variables = [v for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
                                   if trainer.scope in v.name]
        init_op = tf.variables_initializer(variables_to_save)
        init_all_op = tf.variables_initializer(all_trainable_variables)
    else:
        variables_to_save = trainer.global_var_list
        init_op = tf.initialize_variables(variables_to_save)
        all_trainable_variables = [v for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
                                   if trainer.scope in v.name]
        init_all_op = tf.variables_initializer(all_trainable_variables)

    saver = FastSaver(variables_to_save)
    logger.info('Trainable vars:')
    for v in all_trainable_variables:
        logger.info('{} {}'.format(v.name, v.get_shape()))

    def init_fn(ses):
        logger.info("Initializing all parameters.")
        ses.run([init_all_op])

    def get_init_fn():
        if args.teacher:
            return tf.contrib.framework.assign_from_checkpoint_fn(
                args.checkpoint_path, teacher.var_list, ignore_missing_vars=True)
        else:
            return lambda sess: init_fn(sess)

    config = tf.ConfigProto(device_filters=["/job:ps", "/job:worker/task:{}/cpu:0".format(args.task)])
    logdir = os.path.join(args.log_dir, 'train')

    if use_tf12_api:
        summary_writer = tf.summary.FileWriter(logdir + "_{}".format(args.task))
    else:
        summary_writer = tf.train.SummaryWriter(logdir + "_{}".format(args.task))

    logger.info("Events directory: {}_{}".format(logdir, args.task))
    sv = tf.train.Supervisor(is_chief=(args.task == 0),
                             logdir=logdir,
                             saver=saver,
                             summary_op=None,
                             init_op=init_op,
                             init_fn=get_init_fn(),
                             summary_writer=summary_writer,
                             ready_op=tf.report_uninitialized_variables(variables_to_save),
                             global_step=trainer.global_step,
                             save_model_secs=30,
                             save_summaries_secs=30)
    num_global_steps = 100000000

    logger.info(
        "Starting session. If this hangs, we're most likely waiting to connect to the parameter server. "
        "One common cause is that the parameter server DNS name isn't resolving yet, or is misspecified.")
    with sv.managed_session(server.target, config=config) as sess, sess.as_default():
        sess.run(trainer.sync)
        trainer.start(sess, summary_writer)
        global_step = sess.run(trainer.global_step)
        logger.info("Starting training at step={}".format(global_step))
        while not sv.should_stop() and (not num_global_steps or global_step < num_global_steps):
            trainer.process(sess)
            global_step = sess.run(trainer.global_step)

    # Ask for all the services to stop.
    sv.stop()
    logger.info('reached {} steps. worker stopped.'.format(global_step))
def run(env_name, version, act_rep, max_steps, rollout_agent_name, behavior_agent_name,
        eps_greedy, sim_steps, search_horizont, gamma=1., exploration=1., prune_tree=False,
        report_freq=100, n_runs=1, save_dir=None, save_freq=10, process=0):

    def save_data():
        if save_dir is not None and len(frames) > 0:
            run_data = {
                'frames': frames,
                'actions': actions,
                'reward': total_reward,
                'action_visits': action_visits,
                'action_values': action_values,
                'rewards': rewards,
                'action_meanings': env.env.get_action_meanings(),
            }
            fname = os.path.join(save_dir, 'run_process_{}_run_{}_steps_{}.pkl'.format(process, n_run, step))
            with open(fname, 'wb') as f:
                cPickle.dump(run_data, f, -1)
            del actions[:]
            del frames[:]
            del action_visits[:]
            del action_values[:]
            del rewards[:]

    env = create_env(env_name, version, act_rep)
    uct.Node.n_actions = env.action_space.n

    # agent for rollouts
    if rollout_agent_name == 'random' or rollout_agent_name is None:
        rollout_agent = RandomAgent(env.action_space.n)
    else:
        rollout_agent = KerasAgent(rollout_agent_name)

    # agent for action selections
    if behavior_agent_name == 'random':
        behavior_agent = RandomAgent(env.action_space.n)
    elif behavior_agent_name == 'uct' or behavior_agent_name is None:
        behavior_agent = 'uct'
    else:
        behavior_agent = KerasAgent(behavior_agent_name)

    if save_dir is not None:
        actions = []
        frames = []
        action_visits = []
        action_values = []
        rewards = []

    for n_run in xrange(n_runs):
        terminal = False
        env.reset()
        _frame = env.env._get_image()
        node = uct.Node(env.clone_state())

        total_reward = 0
        step = 0
        t_start = t0 = time()
        while not terminal:
            # choose uct action
            a_uct = uct.uct_action(env, rollout_agent, node, sim_steps, search_horizont, gamma, exploration)

            # choose action in environment
            if np.random.rand() < eps_greedy:
                a = env.action_space.sample()
            elif behavior_agent == 'uct':
                a = a_uct
            else:
                a = behavior_agent.choose_action(_frame)

            if save_dir is not None:
                actions.append(a_uct)
                frames.append(_frame)
                action_visits.append(node.a_visits)
                action_values.append(node.a_values)

            # do step in environment
            env.restore_state(node.state)
            frame, reward, terminal, _ = env.step(a)
            _frame = env.env._get_image()

            if save_dir is not None:
                rewards.append(reward)

            # create new tree or try to use old tree
            if prune_tree:
                if frame in node.childs[a]:
                    node = node.childs[a][frame]
                    node.parent = None
                else:
                    node = uct.Node(env.clone_state())
            else:
                node = uct.Node(env.clone_state())

            total_reward += reward
            step += 1

            # report progress
            if step % report_freq == 0:
                print('process: {} run: {}, steps: {}, time: {:.2f}, total reward: {:.2f}'.format(
                    process, n_run + 1, step, time() - t0, total_reward))
                t0 = time()

            # save intermediate result
            if step % save_freq == 0:
                save_data()

            if 0 < max_steps < step:
                break

        print('\nprocess: {}, run: {}, total steps: {}, total time: {:.2f}, total reward: {:.2f}'.format(
            process, n_run + 1, step, time() - t_start, total_reward))

        # save last chunk of data
        save_data()

    env.close()
def inference(args):
    """
    Restores policy weights and does inference with them.
    """
    # virtual display (headless remotes)
    virtual_display = Display(visible=0, size=(1400, 900))
    virtual_display.start()

    # define environment
    env = create_env(args.env_id, client_id='0', remotes=None, envWrap=True, acRepeat=1,
                     record=args.record, outdir=args.outdir)
    num_actions = env.action_space.n

    with tf.device("/cpu:0"):
        config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        with tf.Session(config=config) as sess:
            logger.info("Restoring trainable global parameters.")
            saver = tf.train.import_meta_graph(args.ckpt + '.meta')
            saver.restore(sess, args.ckpt)

            probs = tf.get_collection("probs")[0]
            sample = tf.get_collection("sample")[0]
            vf = tf.get_collection("vf")[0]
            state_out_0 = tf.get_collection("state_out_0")[0]
            state_out_1 = tf.get_collection("state_out_1")[0]

            last_state = env.reset()
            if args.render or args.record:
                env.render()
            last_features = np.zeros((1, 256), np.float32)
            last_features = [last_features, last_features]
            length = 0
            rewards = 0
            mario_distances = np.zeros((args.num_episodes,))
            for i in range(args.num_episodes):
                print("Starting episode %d" % (i + 1))
                if args.random:
                    print('I am a random policy!')
                else:
                    if args.greedy:
                        print('I am a greedy policy!')
                    else:
                        print('I am a sampled policy!')

                while True:
                    # run policy
                    fetched = sess.run([probs, sample, vf, state_out_0, state_out_1],
                                       {"global/x:0": [last_state],
                                        "global/c_in:0": last_features[0],
                                        "global/h_in:0": last_features[1]})
                    prob_action, action, value_, features = fetched[0], fetched[1], fetched[2], fetched[3:]

                    # run environment
                    if args.random:
                        stepAct = np.random.randint(0, num_actions)  # random policy
                    else:
                        if args.greedy:
                            stepAct = prob_action.argmax()  # greedy policy
                        else:
                            stepAct = action.argmax()
                    state, reward, terminal, info = env.step(stepAct)

                    # update stats
                    length += 1
                    rewards += reward
                    last_state = state
                    last_features = features
                    if args.render or args.record:
                        # rgb_array by default (assumes running on a headless remote)
                        env.render(mode='rgb_array')

                    timestep_limit = env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps')
                    if timestep_limit is None:
                        timestep_limit = env.spec.timestep_limit
                    if terminal or length >= timestep_limit:
                        if length >= timestep_limit or not env.metadata.get('semantics.autoreset'):
                            last_state = env.reset()
                        last_features = np.zeros((1, 256), np.float32)
                        last_features = [last_features, last_features]
                        print("Episode finished. Sum of rewards: %.2f. Length: %d." % (rewards, length))
                        length = 0
                        rewards = 0
                        if args.render or args.record:
                            env.render(mode='rgb_array')
                        break

            logger.info('Finished %d true episodes.', args.num_episodes)
            env.close()
parser.add_argument('--no-shared', default=False,
                    help='use an optimizer without shared momentum.')
parser.add_argument('--memsize', type=int, default=256)
parser.add_argument('--device', type=str, default='cuda')

if __name__ == '__main__':
    # os.environ['OMP_NUM_THREADS'] = '1'
    # os.environ['CUDA_VISIBLE_DEVICES'] = "0"
    args = parser.parse_args()
    args.device = torch.device(args.device)
    args.env_name = 'FlappyBird-v0'
    env = create_env()

    shared_model = ActorCritic(1, args.memsize, len(env.action_space))
    shared_model.share_memory()

    if args.no_shared:
        optimizer = None
    else:
        optimizer = my_optim.SharedAdam(shared_model.parameters(), lr=args.lr)
        optimizer.share_memory()

    processes = []
    # counter = mp.Value('i', 0)
    # lock = mp.Lock()
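    # The snippet ends before any workers are launched. A minimal sketch of how the
    # processes might be started, assuming a worker function train(rank, args,
    # shared_model, counter, lock, optimizer) and an --num-processes argument as in
    # the usual pytorch-a3c layout; these names are assumptions, not shown above.
    # counter = mp.Value('i', 0)
    # lock = mp.Lock()
    # for rank in range(args.num_processes):
    #     p = mp.Process(target=train, args=(rank, args, shared_model, counter, lock, optimizer))
    #     p.start()
    #     processes.append(p)
    # for p in processes:
    #     p.join()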