def create_training_thread(training, delay_delta):
    # print("Oh yeah, creating training thread!!!")
    logging.info(" ".join(map(str, ("training", training))))
    global global_t, global_thread_index, wall_t, sess
    if len(idle_threads) == 0:
        logging.info(" ".join(
            map(str, ("Creating new thread", global_thread_index))))
        if cooperative:
            created_thread = A3CTrainingThread(
                global_thread_index, global_network, initial_learning_rate,
                learning_rate_input, grad_applier, MAX_TIME_STEP, device,
                training, cooperative, delay_delta)
        else:
            created_thread = A3CTrainingThread(
                global_thread_index, global_network[-global_thread_index],
                initial_learning_rate, learning_rate_input, grad_applier,
                MAX_TIME_STEP, device, training, cooperative, delay_delta)
        training_threads[global_thread_index] = created_thread
        return_index = global_thread_index
        global_thread_index += 1
        initialize_uninitialized(sess)
    else:
        return_index = idle_threads.pop()
        logging.info(" ".join(map(str, ("Recycling thread", return_index))))
        created_thread = training_threads[return_index]

    # set start time
    # start_time = time.time() - wall_t
    # created_thread.set_start_time(start_time)

    # reset per-episode bookkeeping on the (new or recycled) thread
    created_thread.episode_count = 0
    created_thread.time_differences = []
    created_thread.windows = []
    created_thread.states = []
    created_thread.actions = []
    created_thread.ticknos = []
    created_thread.rewards = []
    created_thread.values = []
    created_thread.estimated_values = []
    created_thread.start_lstm_states = []
    created_thread.variable_snapshots = []
    created_thread.local_t = 0
    created_thread.episode_reward_throughput = 0
    created_thread.episode_reward_delay = 0
    created_thread.episode_reward_sent = 0

    # sync the thread's local network with the global one and reset its state
    sess.run(created_thread.sync)
    created_thread.reset_state_and_reinitialize(sess)
    return return_index
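# Hypothetical caller sketch (train_function and the default delay value are
# assumptions, not taken from this snippet): create or recycle a worker, then
# run its training loop on a dedicated Python thread using the returned index.
import threading

def spawn_worker(training=True, delay_delta=0.0):
    idx = create_training_thread(training, delay_delta)
    t = threading.Thread(target=train_function, args=(idx,))
    t.daemon = True
    t.start()
    return idx, t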
def config(self):
    initial_learning_rate = FLAGS.init_lr
    learning_rate_input = tf.placeholder("float")
    grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                  decay=FLAGS.rmsp_alpha,
                                  momentum=0.0,
                                  epsilon=FLAGS.rmsp_epsilon,
                                  clip_norm=FLAGS.grad_norm_clip,
                                  device=constants.device)

    # Network will create the PNN
    self.global_network = Network(
        name="core_{}".format(constants.task_name))
    IPython.embed()  # drop into an interactive shell (debugging aid)

    print("CREATING AGENTS")
    for i in range(FLAGS.threads):
        training_thread = A3CTrainingThread(i, self.global_network,
                                            initial_learning_rate,
                                            learning_rate_input,
                                            grad_applier,
                                            FLAGS.global_t_max,
                                            device=constants.device,
                                            sess=self.sess,
                                            name="agent_{}_{}".format(
                                                constants.task_name, i))
        self.training_threads.append(training_thread)

    init = tf.global_variables_initializer()
    self.sess.run(init)
def make_thread_obj(index):
    env = make_unity_env(index, unity_baseport=unity_baseport)
    # Awkward naming here, due to the lack of full encapsulation of thread
    # operations in the so-named thread class.
    # TODO: when refactoring, move thread management and loop state into the class
    obj = A3CTrainingThread(index, global_network, args.initial_learning_rate,
                            learning_rate_input, grad_applier,
                            args.max_time_step, device=device,
                            environment=env)  # gym.make(args.gym_env))
    obj.port = env.port  # slight hack: cache the port value with the thread
    # TODO: include unity baseport
    # print("created training net %d" % index)
    # training_thread_objs.append(obj)
    training_thread_objs[index] = obj
def start(self):
    self.global_t = 0

    if not os.path.exists(os.path.join(FLAGS.model_dir, "images")):
        os.makedirs(os.path.join(FLAGS.model_dir, "images"))

    constants.device = "/cpu:0"
    if FLAGS.use_gpu:
        constants.device = "/gpu:0"

    initial_learning_rate = FLAGS.init_lr
    self.stop_requested = False

    # prepare session
    self.sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=False, allow_soft_placement=True))

    self.global_network = Network(name="core_%s" % constants.task_name)
    self.training_threads = []

    learning_rate_input = tf.placeholder("float")
    grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                  decay=FLAGS.rmsp_alpha,
                                  momentum=0.0,
                                  epsilon=FLAGS.rmsp_epsilon,
                                  clip_norm=FLAGS.grad_norm_clip,
                                  device=constants.device)

    print("CREATING AGENTS")
    for i in range(FLAGS.threads):
        training_thread = A3CTrainingThread(i, self.global_network,
                                            initial_learning_rate,
                                            learning_rate_input,
                                            grad_applier,
                                            FLAGS.global_t_max,
                                            device=constants.device,
                                            sess=self.sess,
                                            name="agent_%s_%i" %
                                            (constants.task_name, i))
        self.training_threads.append(training_thread)

    init = tf.initialize_all_variables()
    self.sess.run(init)

    # summary for tensorboard
    self.score_input = tf.placeholder(tf.int32)
    tf.scalar_summary("score", self.score_input)
    self.summary_op = tf.merge_all_summaries()
    self.summary_writer = tf.train.SummaryWriter(FLAGS.model_dir,
                                                 self.sess.graph_def)

    # init or load checkpoint with saver
    self.saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(FLAGS.model_dir)
    if checkpoint and checkpoint.model_checkpoint_path:
        self.saver.restore(
            self.sess, checkpoint.model_checkpoint_path
            if FLAGS.checkpoint is None else FLAGS.checkpoint)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
        tokens = checkpoint.model_checkpoint_path.split("-")
        # set global step
        self.global_t = int(tokens[-1])
        print(">>> global step set: ", self.global_t)
        # set wall time
        self.wall_t_fname = FLAGS.model_dir + '/' + 'wall_t.' + str(
            self.global_t)
        with open(self.wall_t_fname, 'r') as f:
            self.wall_t = float(f.read())
    else:
        print("Could not find old checkpoint")
        # set wall time
        self.wall_t = 0.0

    if FLAGS.transfer_model is not None:
        self.global_network.load(self.sess, FLAGS.transfer_model)

    train_threads = []
    for i in range(FLAGS.threads):
        train_threads.append(
            threading.Thread(target=self.train_function, args=(i, )))

    signal.signal(signal.SIGINT, self.signal_handler)

    # set start time
    self.start_time = time.time() - self.wall_t

    for t in train_threads:
        t.start()

    print('Press Ctrl+C to stop')
    signal.pause()

    print('Now saving data. Please wait')

    for t in train_threads:
        t.join()

    self.save()
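# Hypothetical sketch only (save() is called above but not shown here): a likely
# counterpart that persists wall-clock time next to the checkpoint, so that a
# restart can recompute start_time = time.time() - wall_t as done in start().
# The checkpoint prefix name is an assumption.
def save(self):
    wall_t = time.time() - self.start_time
    wall_t_fname = os.path.join(FLAGS.model_dir, 'wall_t.' + str(self.global_t))
    with open(wall_t_fname, 'w') as f:
        f.write(str(wall_t))
    self.saver.save(self.sess,
                    os.path.join(FLAGS.model_dir, 'checkpoint'),
                    global_step=self.global_t)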
learning_rate_input = tf.placeholder("float")

grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                              decay=RMSP_ALPHA,
                              momentum=0.0,
                              epsilon=RMSP_EPSILON,
                              clip_norm=GRAD_NORM_CLIP,
                              device=device)

training_threads = []
for i in range(PARALLEL_SIZE):
    training_thread = A3CTrainingThread(i, global_network, 1.0,
                                        learning_rate_input,
                                        grad_applier,
                                        8000000,
                                        device=device)
    training_threads.append(training_thread)

sess = tf.Session()
init = tf.initialize_all_variables()
sess.run(init)

saver = tf.train.Saver()
checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR)
if checkpoint and checkpoint.model_checkpoint_path:
    saver.restore(sess, checkpoint.model_checkpoint_path)
    print "checkpoint loaded:", checkpoint.model_checkpoint_path
else:
def run_a3c_test(args): """Run A3C testing.""" GYM_ENV_NAME = args.gym_env.replace('-', '_') if args.use_gpu: assert args.cuda_devices != '' os.environ['CUDA_VISIBLE_DEVICES'] = args.cuda_devices else: os.environ['CUDA_VISIBLE_DEVICES'] = '' import tensorflow as tf if not os.path.exists('results/a3c'): os.makedirs('results/a3c') if args.folder is not None: folder = args.folder else: folder = 'results/a3c/{}'.format(GYM_ENV_NAME) end_str = '' if args.use_mnih_2015: end_str += '_mnih2015' if args.use_lstm: end_str += '_lstm' if args.unclipped_reward: end_str += '_rawreward' elif args.log_scale_reward: end_str += '_logreward' if args.transformed_bellman: end_str += '_transformedbell' if args.use_transfer: end_str += '_transfer' if args.not_transfer_conv2: end_str += '_noconv2' elif args.not_transfer_conv3 and args.use_mnih_2015: end_str += '_noconv3' elif args.not_transfer_fc1: end_str += '_nofc1' elif args.not_transfer_fc2: end_str += '_nofc2' if args.finetune_upper_layers_only: end_str += '_tune_upperlayers' if args.train_with_demo_num_steps > 0 \ or args.train_with_demo_num_epochs > 0: end_str += '_pretrain_ina3c' if args.use_demo_threads: end_str += '_demothreads' if args.load_pretrained_model: if args.use_pretrained_model_as_advice: end_str += '_modelasadvice' if args.use_pretrained_model_as_reward_shaping: end_str += '_modelasshaping' if args.padding == 'SAME': end_str += '_same' folder += end_str folder = pathlib.Path(folder) demo_memory_cam = None demo_cam_human = False if args.load_demo_cam: if args.demo_memory_folder is not None: demo_memory_folder = args.demo_memory_folder else: demo_memory_folder = 'collected_demo/{}'.format(GYM_ENV_NAME) demo_memory_folder = pathlib.Path(demo_memory_folder) if args.demo_cam_id is not None: demo_cam_human = True demo_cam, _, total_rewards_cam, _ = load_memory( name=None, demo_memory_folder=demo_memory_folder, demo_ids=args.demo_cam_id, imgs_normalized=False) demo_cam = demo_cam[int(args.demo_cam_id)] logger.info("loaded demo {} for testing CAM".format( args.demo_cam_id)) else: demo_cam_folder = pathlib.Path(args.demo_cam_folder) demo_cam = ReplayMemory() demo_cam.load(name='test_cam', folder=demo_cam_folder) logger.info("loaded demo {} for testing CAM".format( str(demo_cam_folder / 'test_cam'))) demo_memory_cam = np.zeros( (len(demo_cam), demo_cam.height, demo_cam.width, demo_cam.phi_length), dtype=np.float32) for i in range(len(demo_cam)): s0, _, _, _, _, _, t1, _ = demo_cam[i] demo_memory_cam[i] = np.copy(s0) del demo_cam device = "/cpu:0" gpu_options = None if args.use_gpu: device = "/gpu:"+os.environ["CUDA_VISIBLE_DEVICES"] gpu_options = tf.GPUOptions( per_process_gpu_memory_fraction=args.gpu_fraction) initial_learning_rate = args.initial_learn_rate logger.info('Initial Learning Rate={}'.format(initial_learning_rate)) time.sleep(2) global_t = 0 stop_requested = False game_state = GameState(env_id=args.gym_env) action_size = game_state.env.action_space.n config = tf.ConfigProto( gpu_options=gpu_options, log_device_placement=False, allow_soft_placement=True) input_shape = (84, 84, 4) if args.padding == 'VALID' else (88, 88, 4) if args.use_lstm: GameACLSTMNetwork.use_mnih_2015 = args.use_mnih_2015 global_network = GameACLSTMNetwork(action_size, -1, device) else: GameACFFNetwork.use_mnih_2015 = args.use_mnih_2015 global_network = GameACFFNetwork( action_size, -1, device, padding=args.padding, in_shape=input_shape) learning_rate_input = tf.placeholder(tf.float32, shape=(), name="opt_lr") grad_applier = tf.train.RMSPropOptimizer( 
learning_rate=learning_rate_input, decay=args.rmsp_alpha, epsilon=args.rmsp_epsilon) A3CTrainingThread.log_interval = args.log_interval A3CTrainingThread.performance_log_interval = args.performance_log_interval A3CTrainingThread.local_t_max = args.local_t_max A3CTrainingThread.demo_t_max = args.demo_t_max A3CTrainingThread.use_lstm = args.use_lstm A3CTrainingThread.action_size = action_size A3CTrainingThread.entropy_beta = args.entropy_beta A3CTrainingThread.demo_entropy_beta = args.demo_entropy_beta A3CTrainingThread.gamma = args.gamma A3CTrainingThread.use_mnih_2015 = args.use_mnih_2015 A3CTrainingThread.env_id = args.gym_env A3CTrainingThread.finetune_upper_layers_only = \ args.finetune_upper_layers_only A3CTrainingThread.transformed_bellman = args.transformed_bellman A3CTrainingThread.clip_norm = args.grad_norm_clip A3CTrainingThread.use_grad_cam = args.use_grad_cam if args.unclipped_reward: A3CTrainingThread.reward_type = "RAW" elif args.log_scale_reward: A3CTrainingThread.reward_type = "LOG" else: A3CTrainingThread.reward_type = "CLIP" if args.use_lstm: local_network = GameACLSTMNetwork(action_size, 0, device) else: local_network = GameACFFNetwork( action_size, 0, device, padding=args.padding, in_shape=input_shape) testing_thread = A3CTrainingThread( 0, global_network, local_network, initial_learning_rate, learning_rate_input, grad_applier, 0, device=device) # prepare session sess = tf.Session(config=config) if args.use_transfer: if args.transfer_folder is not None: transfer_folder = args.transfer_folder else: transfer_folder = 'results/pretrain_models/{}'.format(GYM_ENV_NAME) end_str = '' if args.use_mnih_2015: end_str += '_mnih2015' end_str += '_l2beta1E-04_batchprop' # TODO: make this an argument transfer_folder += end_str transfer_folder = pathlib.Path(transfer_folder) transfer_folder /= 'transfer_model' if args.not_transfer_conv2: transfer_var_list = [ global_network.W_conv1, global_network.b_conv1, ] elif (args.not_transfer_conv3 and args.use_mnih_2015): transfer_var_list = [ global_network.W_conv1, global_network.b_conv1, global_network.W_conv2, global_network.b_conv2, ] elif args.not_transfer_fc1: transfer_var_list = [ global_network.W_conv1, global_network.b_conv1, global_network.W_conv2, global_network.b_conv2, ] if args.use_mnih_2015: transfer_var_list += [ global_network.W_conv3, global_network.b_conv3, ] elif args.not_transfer_fc2: transfer_var_list = [ global_network.W_conv1, global_network.b_conv1, global_network.W_conv2, global_network.b_conv2, global_network.W_fc1, global_network.b_fc1, ] if args.use_mnih_2015: transfer_var_list += [ global_network.W_conv3, global_network.b_conv3, ] else: transfer_var_list = [ global_network.W_conv1, global_network.b_conv1, global_network.W_conv2, global_network.b_conv2, global_network.W_fc1, global_network.b_fc1, global_network.W_fc2, global_network.b_fc2, ] if args.use_mnih_2015: transfer_var_list += [ global_network.W_conv3, global_network.b_conv3, ] global_network.load_transfer_model( sess, folder=transfer_folder, not_transfer_fc2=args.not_transfer_fc2, not_transfer_fc1=args.not_transfer_fc1, not_transfer_conv3=(args.not_transfer_conv3 and args.use_mnih_2015), not_transfer_conv2=args.not_transfer_conv2, var_list=transfer_var_list, ) def initialize_uninitialized(sess): global_vars = tf.global_variables() is_not_initialized = sess.run( [tf.is_variable_initialized(var) for var in global_vars]) not_initialized_vars = [ v for (v, f) in zip(global_vars, is_not_initialized) if not f] if len(not_initialized_vars): 
sess.run(tf.variables_initializer(not_initialized_vars)) if args.use_transfer: initialize_uninitialized(sess) else: sess.run(tf.global_variables_initializer()) # init or load checkpoint with saver root_saver = tf.train.Saver(max_to_keep=1) checkpoint = tf.train.get_checkpoint_state(str(folder)) if checkpoint and checkpoint.model_checkpoint_path: root_saver.restore(sess, checkpoint.model_checkpoint_path) logger.info("checkpoint loaded:{}".format( checkpoint.model_checkpoint_path)) tokens = checkpoint.model_checkpoint_path.split("-") # set global step global_t = int(tokens[-1]) logger.info(">>> global step set: {}".format(global_t)) else: logger.warning("Could not find old checkpoint") def test_function(): nonlocal global_t if args.use_transfer: from_folder = str(transfer_folder).split('/')[-2] else: from_folder = str(folder).split('/')[-1] from_folder = pathlib.Path(from_folder) save_folder = 'results/test_model/a3c' / from_folder prepare_dir(str(save_folder), empty=False) prepare_dir(str(save_folder / 'frames'), empty=False) # Evaluate model before training if not stop_requested: testing_thread.testing_model( sess, args.eval_max_steps, global_t, save_folder, demo_memory_cam=demo_memory_cam, demo_cam_human=demo_cam_human) def signal_handler(signal, frame): nonlocal stop_requested logger.info('You pressed Ctrl+C!') stop_requested = True if stop_requested and global_t == 0: sys.exit(1) test_thread = threading.Thread(target=test_function, args=()) signal.signal(signal.SIGINT, signal_handler) signal.signal(signal.SIGTERM, signal_handler) test_thread.start() print('Press Ctrl+C to stop') test_thread.join() sess.close()
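# Minimal standalone sketch (TF1-style API, variable names are ours) of the
# pattern the initialize_uninitialized() helper above implements: when some
# variables have already been restored (e.g. via load_transfer_model()), a
# plain global initializer would clobber them, so only the variables that
# report "not initialized" are initialized.
import tensorflow as tf

sess = tf.Session()
w_transferred = tf.Variable(tf.ones([2, 2]), name="w_transferred")
w_new = tf.Variable(tf.zeros([2, 2]), name="w_new")

sess.run(w_transferred.initializer)  # stands in for a restored/transferred weight

global_vars = tf.global_variables()
init_flags = sess.run([tf.is_variable_initialized(v) for v in global_vars])
uninitialized = [v for v, ok in zip(global_vars, init_flags) if not ok]
if uninitialized:
    sess.run(tf.variables_initializer(uninitialized))  # touches only w_new here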
import matplotlib.pyplot as plt
import random

import tensorflow as tf

from game_state import GameState
from game_ac_network import GameACNetwork
from a3c_training_thread import A3CTrainingThread
from constants import ACTION_SIZE

PARALLEL_SIZE = 8
CHECKPOINT_DIR = 'checkpoints'

global_network = GameACNetwork(ACTION_SIZE)

training_threads = []
for i in range(PARALLEL_SIZE):
    training_thread = A3CTrainingThread(i, global_network, 1.0, 8000000)
    training_threads.append(training_thread)

sess = tf.Session()
init = tf.initialize_all_variables()
sess.run(init)

saver = tf.train.Saver()
checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR)
if checkpoint and checkpoint.model_checkpoint_path:
    saver.restore(sess, checkpoint.model_checkpoint_path)
    print "checkpoint loaded:", checkpoint.model_checkpoint_path
else:
    print "Could not find old checkpoint"

W_conv1 = sess.run(global_network.W_conv1)
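# Follow-on sketch (ours, not from the snippet): one common way to look at the
# first-layer filters fetched above with matplotlib, assuming W_conv1 has the
# usual (height, width, in_channels, out_channels) layout.
import numpy as np

def plot_filters(w, cols=8):
    n = w.shape[-1]
    rows = int(np.ceil(n / float(cols)))
    fig, axes = plt.subplots(rows, cols, squeeze=False, figsize=(cols, rows))
    for i, ax in enumerate(axes.flat):
        ax.axis('off')
        if i < n:
            ax.imshow(w[:, :, 0, i], cmap='gray')  # first input channel only
    plt.show()

# plot_filters(W_conv1)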
learning_rate_input = tf.placeholder("float")

policy_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                decay=0.99,
                                momentum=0.0,
                                epsilon=RMSP_EPSILON)
value_applier = RMSPropApplier(learning_rate=learning_rate_input,
                               decay=0.99,
                               momentum=0.0,
                               epsilon=RMSP_EPSILON)

training_threads = []
for i in range(PARALLEL_SIZE):
    training_thread = A3CTrainingThread(i, global_network, 1.0,
                                        learning_rate_input,
                                        policy_applier, value_applier,
                                        8000000)
    training_threads.append(training_thread)

sess = tf.Session()
init = tf.initialize_all_variables()
sess.run(init)

saver = tf.train.Saver()
checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR)
if checkpoint and checkpoint.model_checkpoint_path:
    saver.restore(sess, checkpoint.model_checkpoint_path)
    print "checkpoint loaded:", checkpoint.model_checkpoint_path
else:
    print "Could not find old checkpoint"
learning_rate_input = tf.placeholder("float")

policy_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                decay=0.99,
                                momentum=0.0,
                                epsilon=RMSP_EPSILON)
value_applier = RMSPropApplier(learning_rate=learning_rate_input,
                               decay=0.99,
                               momentum=0.0,
                               epsilon=RMSP_EPSILON)

for i in range(PARALLEL_SIZE):
    training_thread = A3CTrainingThread(i, global_network,
                                        initial_learning_rate,
                                        learning_rate_input,
                                        policy_applier, value_applier,
                                        MAX_TIME_STEP)
    training_threads.append(training_thread)

# prepare session
sess = tf.Session(config=tf.ConfigProto(log_device_placement=False))

init = tf.initialize_all_variables()
sess.run(init)

# summary for tensorboard
summary_op = tf.merge_all_summaries()
summary_writer = tf.train.SummaryWriter(LOG_FILE, sess.graph_def)

# init or load checkpoint with saver
saver = tf.train.Saver()
def run_a3c(args): """Run A3C experiment.""" GYM_ENV_NAME = args.gym_env.replace('-', '_') GAME_NAME = args.gym_env.replace('NoFrameskip-v4','') # setup folder name and path to folder folder = pathlib.Path(setup_folder(args, GYM_ENV_NAME)) # setup GPU (if applicable) import tensorflow as tf gpu_options = setup_gpu(tf, args.use_gpu, args.gpu_fraction) ###################################################### # setup default device device = "/cpu:0" global_t = 0 rewards = {'train': {}, 'eval': {}} best_model_reward = -(sys.maxsize) if args.load_pretrained_model: class_rewards = {'class_eval': {}} # setup logging info for analysis, see Section 4.2 of the paper sil_dict = { # count number of SIL updates "sil_ctr":{}, # total number of butter D sampled during SIL "sil_a3c_sampled":{}, # total number of buffer D samples (i.e., generated by A3C workers) used during SIL (i.e., passed max op) "sil_a3c_used":{}, # the return of used samples for buffer D "sil_a3c_used_return":{}, # total number of buffer R sampled during SIL "sil_rollout_sampled":{}, # total number of buffer R samples (i.e., generated by refresher worker) used during SIL (i.e., passed max op) "sil_rollout_used":{}, # the return of used samples for buffer R "sil_rollout_used_return":{}, # number of old samples still used (even after refreshing) "sil_old_used":{} } sil_ctr, sil_a3c_sampled, sil_a3c_used, sil_a3c_used_return = 0, 0, 0, 0 sil_rollout_sampled, sil_rollout_used, sil_rollout_used_return = 0, 0, 0 sil_old_used = 0 rollout_dict = { # total number of rollout performed "rollout_ctr": {}, # total number of successful rollout (i.e., Gnew > G) "rollout_added_ctr":{}, # the return of Gnew "rollout_new_return":{}, # the return of G "rollout_old_return":{} } rollout_ctr, rollout_added_ctr = 0, 0 rollout_new_return = 0 # this records the total, avg = this / rollout_added_ctr rollout_old_return = 0 # this records the total, avg = this / rollout_added_ctr # setup file names reward_fname = folder / '{}-a3c-rewards.pkl'.format(GYM_ENV_NAME) sil_fname = folder / '{}-a3c-dict-sil.pkl'.format(GYM_ENV_NAME) rollout_fname = folder / '{}-a3c-dict-rollout.pkl'.format(GYM_ENV_NAME) if args.load_pretrained_model: class_reward_fname = folder / '{}-class-rewards.pkl'.format(GYM_ENV_NAME) sharedmem_fname = folder / '{}-sharedmem.pkl'.format(GYM_ENV_NAME) sharedmem_params_fname = folder / '{}-sharedmem-params.pkl'.format(GYM_ENV_NAME) sharedmem_trees_fname = folder / '{}-sharedmem-trees.pkl'.format(GYM_ENV_NAME) rolloutmem_fname = folder / '{}-rolloutmem.pkl'.format(GYM_ENV_NAME) rolloutmem_params_fname = folder / '{}-rolloutmem-params.pkl'.format(GYM_ENV_NAME) rolloutmem_trees_fname = folder / '{}-rolloutmem-trees.pkl'.format(GYM_ENV_NAME) # for removing older ckpt, save mem space prev_ckpt_t = -1 stop_req = False game_state = GameState(env_id=args.gym_env) action_size = game_state.env.action_space.n game_state.close() del game_state.env del game_state input_shape = (args.input_shape, args.input_shape, 4) ####################################################### # setup global A3C GameACFFNetwork.use_mnih_2015 = args.use_mnih_2015 global_network = GameACFFNetwork( action_size, -1, device, padding=args.padding, in_shape=input_shape) logger.info('A3C Initial Learning Rate={}'.format(args.initial_learn_rate)) # setup pretrained model global_pretrained_model = None local_pretrained_model = None pretrain_graph = None # if use pretrained model to refresh # then must load pretrained model # otherwise, don't load model if args.use_lider and args.nstep_bc > 0: 
assert args.load_pretrained_model, "refreshing with other policies, must load a pre-trained model (TA or BC)" else: assert not args.load_pretrained_model, "refreshing with the current policy, don't load pre-trained models" if args.load_pretrained_model: pretrain_graph, global_pretrained_model = setup_pretrained_model(tf, args, action_size, input_shape, device="/gpu:0" if args.use_gpu else device) assert global_pretrained_model is not None assert pretrain_graph is not None time.sleep(2.0) # setup experience memory shared_memory = None # => this is BufferD rollout_buffer = None # => this is BufferR if args.use_sil: shared_memory = SILReplayMemory( action_size, max_len=args.memory_length, gamma=args.gamma, clip=False if args.unclipped_reward else True, height=input_shape[0], width=input_shape[1], phi_length=input_shape[2], priority=args.priority_memory, reward_constant=args.reward_constant) if args.use_lider and not args.onebuffer: rollout_buffer = SILReplayMemory( action_size, max_len=args.memory_length, gamma=args.gamma, clip=False if args.unclipped_reward else True, height=input_shape[0], width=input_shape[1], phi_length=input_shape[2], priority=args.priority_memory, reward_constant=args.reward_constant) # log memory information shared_memory.log() if args.use_lider and not args.onebuffer: rollout_buffer.log() ############## Setup Thread Workers BEGIN ################ # 17 total number of threads for all experiments assert args.parallel_size ==17, "use 17 workers for all experiments" startIndex = 0 all_workers = [] # a3c and sil learning rate and optimizer learning_rate_input = tf.placeholder(tf.float32, shape=(), name="opt_lr") grad_applier = tf.train.RMSPropOptimizer( learning_rate=learning_rate_input, decay=args.rmsp_alpha, epsilon=args.rmsp_epsilon) setup_common_worker(CommonWorker, args, action_size) # setup SIL worker sil_worker = None if args.use_sil: _device = "/gpu:0" if args.use_gpu else device sil_network = GameACFFNetwork( action_size, startIndex, device=_device, padding=args.padding, in_shape=input_shape) sil_worker = SILTrainingThread(startIndex, global_network, sil_network, args.initial_learn_rate, learning_rate_input, grad_applier, device=_device, batch_size=args.batch_size, use_rollout=args.use_lider, one_buffer=args.onebuffer, sampleR=args.sampleR) all_workers.append(sil_worker) startIndex += 1 # setup refresh worker refresh_worker = None if args.use_lider: _device = "/gpu:0" if args.use_gpu else device refresh_network = GameACFFNetwork( action_size, startIndex, device=_device, padding=args.padding, in_shape=input_shape) refresh_local_pretrained_model = None # if refreshing with other polies if args.nstep_bc > 0: refresh_local_pretrained_model = PretrainedModelNetwork( pretrain_graph, action_size, startIndex, padding=args.padding, in_shape=input_shape, sae=False, tied_weights=False, use_denoising=False, noise_factor=0.3, loss_function='mse', use_slv=False, device=_device) refresh_worker = RefreshThread( thread_index=startIndex, action_size=action_size, env_id=args.gym_env, global_a3c=global_network, local_a3c=refresh_network, update_in_rollout=args.update_in_rollout, nstep_bc=args.nstep_bc, global_pretrained_model=global_pretrained_model, local_pretrained_model=refresh_local_pretrained_model, transformed_bellman = args.transformed_bellman, device=_device, entropy_beta=args.entropy_beta, clip_norm=args.grad_norm_clip, grad_applier=grad_applier, initial_learn_rate=args.initial_learn_rate, learning_rate_input=learning_rate_input) all_workers.append(refresh_worker) startIndex 
+= 1 # setup a3c workers setup_a3c_worker(A3CTrainingThread, args, startIndex) for i in range(startIndex, args.parallel_size): local_network = GameACFFNetwork( action_size, i, device="/cpu:0", padding=args.padding, in_shape=input_shape) a3c_worker = A3CTrainingThread( i, global_network, local_network, args.initial_learn_rate, learning_rate_input, grad_applier, device="/cpu:0", no_op_max=30) all_workers.append(a3c_worker) ############## Setup Thread Workers END ################ # setup config for tensorflow config = tf.ConfigProto( gpu_options=gpu_options, log_device_placement=False, allow_soft_placement=True) # prepare sessions sess = tf.Session(config=config) pretrain_sess = None if global_pretrained_model: pretrain_sess = tf.Session(config=config, graph=pretrain_graph) # initial pretrained model if pretrain_sess: assert args.pretrained_model_folder is not None global_pretrained_model.load( pretrain_sess, args.pretrained_model_folder) sess.run(tf.global_variables_initializer()) if global_pretrained_model: initialize_uninitialized(tf, pretrain_sess, global_pretrained_model) if local_pretrained_model: initialize_uninitialized(tf, pretrain_sess, local_pretrained_model) # summary writer for tensorboard summ_file = args.save_to+'log/a3c/{}/'.format(GYM_ENV_NAME) + str(folder)[58:] # str(folder)[12:] summary_writer = tf.summary.FileWriter(summ_file, sess.graph) # init or load checkpoint with saver root_saver = tf.train.Saver(max_to_keep=1) saver = tf.train.Saver(max_to_keep=1) best_saver = tf.train.Saver(max_to_keep=1) checkpoint = tf.train.get_checkpoint_state(str(folder)+'/model_checkpoints') if checkpoint and checkpoint.model_checkpoint_path: root_saver.restore(sess, checkpoint.model_checkpoint_path) logger.info("checkpoint loaded:{}".format( checkpoint.model_checkpoint_path)) tokens = checkpoint.model_checkpoint_path.split("-") # set global step global_t = int(tokens[-1]) logger.info(">>> global step set: {}".format(global_t)) tmp_t = (global_t // args.eval_freq) * args.eval_freq logger.info(">>> tmp_t: {}".format(tmp_t)) # set wall time wall_t = 0. 
# set up reward files best_reward_file = folder / 'model_best/best_model_reward' with best_reward_file.open('r') as f: best_model_reward = float(f.read()) # restore rewards rewards = restore_dict(reward_fname, global_t) logger.info(">>> restored: rewards") # restore loggings sil_dict = restore_dict(sil_fname, global_t) sil_ctr = sil_dict['sil_ctr'][tmp_t] sil_a3c_sampled = sil_dict['sil_a3c_sampled'][tmp_t] sil_a3c_used = sil_dict['sil_a3c_used'][tmp_t] sil_a3c_used_return = sil_dict['sil_a3c_used_return'][tmp_t] sil_rollout_sampled = sil_dict['sil_rollout_sampled'][tmp_t] sil_rollout_used = sil_dict['sil_rollout_used'][tmp_t] sil_rollout_used_return = sil_dict['sil_rollout_used_return'][tmp_t] sil_old_used = sil_dict['sil_old_used'][tmp_t] logger.info(">>> restored: sil_dict") rollout_dict = restore_dict(rollout_fname, global_t) rollout_ctr = rollout_dict['rollout_ctr'][tmp_t] rollout_added_ctr = rollout_dict['rollout_added_ctr'][tmp_t] rollout_new_return = rollout_dict['rollout_new_return'][tmp_t] rollout_old_return = rollout_dict['rollout_old_return'][tmp_t] logger.info(">>> restored: rollout_dict") if args.load_pretrained_model: class_reward_file = folder / '{}-class-rewards.pkl'.format(GYM_ENV_NAME) class_rewards = restore_dict(class_reward_file, global_t) # restore replay buffers (if saved) if args.checkpoint_buffer: # restore buffer D if args.use_sil and args.priority_memory: shared_memory = restore_buffer(sharedmem_fname, shared_memory, global_t) shared_memory = restore_buffer_trees(sharedmem_trees_fname, shared_memory, global_t) shared_memory = restore_buffer_params(sharedmem_params_fname, shared_memory, global_t) logger.info(">>> restored: shared_memory (Buffer D)") shared_memory.log() # restore buffer R if args.use_lider and not args.onebuffer: rollout_buffer = restore_buffer(rolloutmem_fname, rollout_buffer, global_t) rollout_buffer = restore_buffer_trees(rolloutmem_trees_fname, rollout_buffer, global_t) rollout_buffer = restore_buffer_params(rolloutmem_params_fname, rollout_buffer, global_t) logger.info(">>> restored: rollout_buffer (Buffer R)") rollout_buffer.log() # if all restores okay, remove old ckpt to save storage space prev_ckpt_t = global_t else: logger.warning("Could not find old checkpoint") wall_t = 0.0 prepare_dir(folder, empty=True) prepare_dir(folder / 'model_checkpoints', empty=True) prepare_dir(folder / 'model_best', empty=True) prepare_dir(folder / 'frames', empty=True) lock = threading.Lock() # next saving global_t def next_t(current_t, freq): return np.ceil((current_t + 0.00001) / freq) * freq next_global_t = next_t(global_t, args.eval_freq) next_save_t = next_t( global_t, args.eval_freq*args.checkpoint_freq) step_t = 0 def train_function(parallel_idx, th_ctr, ep_queue, net_updates): nonlocal global_t, step_t, rewards, class_rewards, lock, \ next_save_t, next_global_t, prev_ckpt_t nonlocal shared_memory, rollout_buffer nonlocal sil_dict, sil_ctr, sil_a3c_sampled, sil_a3c_used, sil_a3c_used_return, \ sil_rollout_sampled, sil_rollout_used, sil_rollout_used_return, \ sil_old_used nonlocal rollout_dict, rollout_ctr, rollout_added_ctr, \ rollout_new_return, rollout_old_return parallel_worker = all_workers[parallel_idx] parallel_worker.set_summary_writer(summary_writer) with lock: # Evaluate model before training if not stop_req and global_t == 0 and step_t == 0: rewards['eval'][step_t] = parallel_worker.testing( sess, args.eval_max_steps, global_t, folder, worker=all_workers[-1]) # testing pretrained TA or BC in game if args.load_pretrained_model: assert 
pretrain_sess is not None assert global_pretrained_model is not None class_rewards['class_eval'][step_t] = \ parallel_worker.test_loaded_classifier(global_t=global_t, max_eps=50, # testing 50 episodes sess=pretrain_sess, worker=all_workers[-1], model=global_pretrained_model) # log pretrained model performance class_eval_file = pathlib.Path(args.pretrained_model_folder[:21]+\ str(GAME_NAME)+"/"+str(GAME_NAME)+'-model-eval.txt') class_std = np.std(class_rewards['class_eval'][step_t][-1]) class_mean = np.mean(class_rewards['class_eval'][step_t][-1]) with class_eval_file.open('w') as f: f.write("class_mean: \n" + str(class_mean) + "\n") f.write("class_std: \n" + str(class_std) + "\n") f.write("class_rewards: \n" + str(class_rewards['class_eval'][step_t][-1]) + "\n") checkpt_file = folder / 'model_checkpoints' checkpt_file /= '{}_checkpoint'.format(GYM_ENV_NAME) saver.save(sess, str(checkpt_file), global_step=global_t) save_best_model(rewards['eval'][global_t][0]) # saving worker info to dicts for analysis sil_dict['sil_ctr'][step_t] = sil_ctr sil_dict['sil_a3c_sampled'][step_t] = sil_a3c_sampled sil_dict['sil_a3c_used'][step_t] = sil_a3c_used sil_dict['sil_a3c_used_return'][step_t] = sil_a3c_used_return sil_dict['sil_rollout_sampled'][step_t] = sil_rollout_sampled sil_dict['sil_rollout_used'][step_t] = sil_rollout_used sil_dict['sil_rollout_used_return'][step_t] = sil_rollout_used_return sil_dict['sil_old_used'][step_t] = sil_old_used rollout_dict['rollout_ctr'][step_t] = rollout_ctr rollout_dict['rollout_added_ctr'][step_t] = rollout_added_ctr rollout_dict['rollout_new_return'][step_t] = rollout_new_return rollout_dict['rollout_old_return'][step_t] = rollout_old_return # dump pickle dump_pickle([rewards, sil_dict, rollout_dict], [reward_fname, sil_fname, rollout_fname], global_t) if args.load_pretrained_model: dump_pickle([class_rewards], [class_reward_fname], global_t) logger.info('Dump pickle at step {}'.format(global_t)) # save replay buffer (only works under priority mem) if args.checkpoint_buffer: if shared_memory is not None and args.priority_memory: params = [shared_memory.buff._next_idx, shared_memory.buff._max_priority] trees = [shared_memory.buff._it_sum._value, shared_memory.buff._it_min._value] dump_pickle([shared_memory.buff._storage, params, trees], [sharedmem_fname, sharedmem_params_fname, sharedmem_trees_fname], global_t) logger.info('Saving shared_memory') if rollout_buffer is not None and args.priority_memory: params = [rollout_buffer.buff._next_idx, rollout_buffer.buff._max_priority] trees = [rollout_buffer.buff._it_sum._value, rollout_buffer.buff._it_min._value] dump_pickle([rollout_buffer.buff._storage, params, trees], [rolloutmem_fname, rolloutmem_params_fname, rolloutmem_trees_fname], global_t) logger.info('Saving rollout_buffer') prev_ckpt_t = global_t step_t = 1 # set start_time start_time = time.time() - wall_t parallel_worker.set_start_time(start_time) if parallel_worker.is_sil_thread: sil_interval = 0 # bigger number => slower SIL updates m_repeat = 4 min_mem = args.batch_size * m_repeat sil_train_flag = len(shared_memory) >= min_mem while True: if stop_req: return if global_t >= (args.max_time_step * args.max_time_step_fraction): return if parallel_worker.is_sil_thread: # before sil starts, init local count local_sil_ctr = 0 local_sil_a3c_sampled, local_sil_a3c_used, local_sil_a3c_used_return = 0, 0, 0 local_sil_rollout_sampled, local_sil_rollout_used, local_sil_rollout_used_return = 0, 0, 0 local_sil_old_used = 0 if net_updates.qsize() >= sil_interval \ and 
len(shared_memory) >= min_mem: sil_train_flag = True if sil_train_flag: sil_train_flag = False th_ctr.get() train_out = parallel_worker.sil_train( sess, global_t, shared_memory, m_repeat, rollout_buffer=rollout_buffer) local_sil_ctr, local_sil_a3c_sampled, local_sil_a3c_used, \ local_sil_a3c_used_return, \ local_sil_rollout_sampled, local_sil_rollout_used, \ local_sil_rollout_used_return, \ local_sil_old_used = train_out th_ctr.put(1) with net_updates.mutex: net_updates.queue.clear() if args.use_lider: parallel_worker.record_sil(sil_ctr=sil_ctr, total_used=(sil_a3c_used + sil_rollout_used), num_a3c_used=sil_a3c_used, a3c_used_return=sil_a3c_used_return/(sil_a3c_used+1),#add one in case divide by zero rollout_used=sil_rollout_used, rollout_used_return=sil_rollout_used_return/(sil_rollout_used+1), old_used=sil_old_used, global_t=global_t) if sil_ctr % 200 == 0 and sil_ctr > 0: rollout_buffsize = 0 if not args.onebuffer: rollout_buffsize = len(rollout_buffer) log_data = (sil_ctr, len(shared_memory), rollout_buffsize, sil_a3c_used+sil_rollout_used, args.batch_size*sil_ctr, sil_a3c_used, sil_a3c_used_return/(sil_a3c_used+1), sil_rollout_used, sil_rollout_used_return/(sil_rollout_used+1), sil_old_used) logger.info("SIL: sil_ctr={0:}" " sil_memory_size={1:}" " rollout_buffer_size={2:}" " total_sample_used={3:}/{4:}" " a3c_used={5:}" " a3c_used_return_avg={6:.2f}" " rollout_used={7:}" " rollout_used_return_avg={8:.2f}" " old_used={9:}".format(*log_data)) else: parallel_worker.record_sil(sil_ctr=sil_ctr, total_used=(sil_a3c_used + sil_rollout_used), num_a3c_used=sil_a3c_used, rollout_used=sil_rollout_used, global_t=global_t) if sil_ctr % 200 == 0 and sil_ctr > 0: log_data = (sil_ctr, sil_a3c_used+sil_rollout_used, args.batch_size*sil_ctr, sil_a3c_used, len(shared_memory)) logger.info("SIL: sil_ctr={0:}" " total_sample_used={1:}/{2:}" " a3c_used={3:}" " sil_memory_size={4:}".format(*log_data)) # Adding episodes to SIL memory is centralize to ensure # sampling and updating of priorities does not become a problem # since we add new episodes to SIL at once and during # SIL training it is guaranteed that SIL memory is untouched. 
max = args.parallel_size while not ep_queue.empty(): data = ep_queue.get() parallel_worker.episode.set_data(*data) shared_memory.extend(parallel_worker.episode) parallel_worker.episode.reset() max -= 1 if max <= 0: # This ensures that SIL has a chance to train break diff_global_t = 0 # centralized rollout counting local_rollout_ctr, local_rollout_added_ctr = 0, 0 local_rollout_new_return, local_rollout_old_return = 0, 0 elif parallel_worker.is_refresh_thread: # before refresh starts, init local count diff_global_t = 0 local_rollout_ctr, local_rollout_added_ctr = 0, 0 local_rollout_new_return, local_rollout_old_return = 0, 0 if len(shared_memory) >= 1: th_ctr.get() # randomly sample a state from buffer D sample = shared_memory.sample_one_random() # after sample, flip refreshed to True # TODO: fix this so that only *succesful* refresh is flipped to True # currently counting *all* refresh as True assert sample[-1] == True train_out = parallel_worker.rollout(sess, folder, pretrain_sess, global_t, sample, args.addall, args.max_ep_step, args.nstep_bc, args.update_in_rollout) diff_global_t, episode_end, part_end, local_rollout_ctr, \ local_rollout_added_ctr, add, local_rollout_new_return, \ local_rollout_old_return = train_out th_ctr.put(1) if rollout_ctr % 20 == 0 and rollout_ctr > 0: log_msg = "ROLLOUT: rollout_ctr={} added_rollout_ct={} worker={}".format( rollout_ctr, rollout_added_ctr, parallel_worker.thread_idx) logger.info(log_msg) logger.info("ROLLOUT Gnew: {}, G: {}".format(local_rollout_new_return, local_rollout_old_return)) # should always part_end, i.e., end of episode # and only add if new return is better (if not LiDER-AddAll) if part_end and add: if not args.onebuffer: # directly put into Buffer R rollout_buffer.extend(parallel_worker.episode) else: # Buffer D add sample is centralized when OneBuffer ep_queue.put(parallel_worker.episode.get_data()) parallel_worker.episode.reset() # centralized SIL counting local_sil_ctr = 0 local_sil_a3c_sampled, local_sil_a3c_used, local_sil_a3c_used_return = 0, 0, 0 local_sil_rollout_sampled, local_sil_rollout_used, local_sil_rollout_used_return = 0, 0, 0 local_sil_old_used = 0 # a3c training thread worker else: th_ctr.get() train_out = parallel_worker.train(sess, global_t, rewards) diff_global_t, episode_end, part_end = train_out th_ctr.put(1) if args.use_sil: net_updates.put(1) if part_end: ep_queue.put(parallel_worker.episode.get_data()) parallel_worker.episode.reset() # centralized SIL counting local_sil_ctr = 0 local_sil_a3c_sampled, local_sil_a3c_used, local_sil_a3c_used_return = 0, 0, 0 local_sil_rollout_sampled, local_sil_rollout_used, local_sil_rollout_used_return = 0, 0, 0 local_sil_old_used = 0 # centralized rollout counting local_rollout_ctr, local_rollout_added_ctr = 0, 0 local_rollout_new_return, local_rollout_old_return = 0, 0 # ensure only one thread is updating global_t at a time with lock: global_t += diff_global_t # centralize increasing count for SIL and Rollout sil_ctr += local_sil_ctr sil_a3c_sampled += local_sil_a3c_sampled sil_a3c_used += local_sil_a3c_used sil_a3c_used_return += local_sil_a3c_used_return sil_rollout_sampled += local_sil_rollout_sampled sil_rollout_used += local_sil_rollout_used sil_rollout_used_return += local_sil_rollout_used_return sil_old_used += local_sil_old_used rollout_ctr += local_rollout_ctr rollout_added_ctr += local_rollout_added_ctr rollout_new_return += local_rollout_new_return rollout_old_return += local_rollout_old_return # if during a thread's update, global_t has reached a evaluation 
interval if global_t > next_global_t: next_global_t = next_t(global_t, args.eval_freq) step_t = int(next_global_t - args.eval_freq) # wait for all threads to be done before testing while not stop_req and th_ctr.qsize() < len(all_workers): time.sleep(0.001) step_t = int(next_global_t - args.eval_freq) # Evaluate for 125,000 steps rewards['eval'][step_t] = parallel_worker.testing( sess, args.eval_max_steps, step_t, folder, worker=all_workers[-1]) save_best_model(rewards['eval'][step_t][0]) last_reward = rewards['eval'][step_t][0] # saving worker info to dicts # SIL sil_dict['sil_ctr'][step_t] = sil_ctr sil_dict['sil_a3c_sampled'][step_t] = sil_a3c_sampled sil_dict['sil_a3c_used'][step_t] = sil_a3c_used sil_dict['sil_a3c_used_return'][step_t] = sil_a3c_used_return sil_dict['sil_rollout_sampled'][step_t] = sil_rollout_sampled sil_dict['sil_rollout_used'][step_t] = sil_rollout_used sil_dict['sil_rollout_used_return'][step_t] = sil_rollout_used_return sil_dict['sil_old_used'][step_t] = sil_old_used # ROLLOUT rollout_dict['rollout_ctr'][step_t] = rollout_ctr rollout_dict['rollout_added_ctr'][step_t] = rollout_added_ctr rollout_dict['rollout_new_return'][step_t] = rollout_new_return rollout_dict['rollout_old_return'][step_t] = rollout_old_return # save ckpt after done with eval if global_t > next_save_t: next_save_t = next_t(global_t, args.eval_freq*args.checkpoint_freq) # dump pickle dump_pickle([rewards, sil_dict, rollout_dict], [reward_fname, sil_fname, rollout_fname], global_t) if args.load_pretrained_model: dump_pickle([class_rewards], [class_reward_fname], global_t) logger.info('Dump pickle at step {}'.format(global_t)) # save replay buffer (only works for priority mem for now) if args.checkpoint_buffer: if shared_memory is not None and args.priority_memory: params = [shared_memory.buff._next_idx, shared_memory.buff._max_priority] trees = [shared_memory.buff._it_sum._value, shared_memory.buff._it_min._value] dump_pickle([shared_memory.buff._storage, params, trees], [sharedmem_fname, sharedmem_params_fname, sharedmem_trees_fname], global_t) logger.info('Saved shared_memory') if rollout_buffer is not None and args.priority_memory: params = [rollout_buffer.buff._next_idx, rollout_buffer.buff._max_priority] trees = [rollout_buffer.buff._it_sum._value, rollout_buffer.buff._it_min._value] dump_pickle([rollout_buffer.buff._storage, params, trees], [rolloutmem_fname, rolloutmem_params_fname, rolloutmem_trees_fname], global_t) logger.info('Saved rollout_buffer') # save a3c after saving buffer -- in case saving buffer OOM # so that at least we can revert back to the previous ckpt checkpt_file = folder / 'model_checkpoints' checkpt_file /= '{}_checkpoint'.format(GYM_ENV_NAME) saver.save(sess, str(checkpt_file), global_step=global_t, write_meta_graph=False) logger.info('Saved model ckpt') # if everything saves okay, clean up previous ckpt to save space remove_pickle([reward_fname, sil_fname, rollout_fname], prev_ckpt_t) if args.load_pretrained_model: remove_pickle([class_reward_fname], prev_ckpt_t) remove_pickle([sharedmem_fname, sharedmem_params_fname, sharedmem_trees_fname], prev_ckpt_t) if rollout_buffer is not None and args.priority_memory: remove_pickle([rolloutmem_fname, rolloutmem_params_fname, rolloutmem_trees_fname], prev_ckpt_t) logger.info('Removed ckpt from step {}'.format(prev_ckpt_t)) prev_ckpt_t = global_t def signal_handler(signal, frame): nonlocal stop_req logger.info('You pressed Ctrl+C!') stop_req = True if stop_req and global_t == 0: sys.exit(1) def save_best_model(test_reward): 
nonlocal best_model_reward if test_reward > best_model_reward: best_model_reward = test_reward best_reward_file = folder / 'model_best/best_model_reward' with best_reward_file.open('w') as f: f.write(str(best_model_reward)) best_checkpt_file = folder / 'model_best' best_checkpt_file /= '{}_checkpoint'.format(GYM_ENV_NAME) best_saver.save(sess, str(best_checkpt_file)) train_threads = [] th_ctr = Queue() for i in range(args.parallel_size): th_ctr.put(1) episodes_queue = None net_updates = None if args.use_sil: episodes_queue = Queue() net_updates = Queue() for i in range(args.parallel_size): worker_thread = Thread( target=train_function, args=(i, th_ctr, episodes_queue, net_updates,)) train_threads.append(worker_thread) signal.signal(signal.SIGINT, signal_handler) signal.signal(signal.SIGTERM, signal_handler) # set start time start_time = time.time() - wall_t for t in train_threads: t.start() print('Press Ctrl+C to stop') for t in train_threads: t.join() logger.info('Now saving data. Please wait') # write wall time wall_t = time.time() - start_time wall_t_fname = folder / 'wall_t.{}'.format(global_t) with wall_t_fname.open('w') as f: f.write(str(wall_t)) # save final model checkpoint_file = str(folder / '{}_checkpoint_a3c'.format(GYM_ENV_NAME)) root_saver.save(sess, checkpoint_file, global_step=global_t) dump_final_pickle([rewards, sil_dict, rollout_dict], [reward_fname, sil_fname, rollout_fname]) logger.info('Data saved!') # if everything saves okay & is done training (not because of pressed Ctrl+C), # clean up previous ckpt to save space if global_t >= (args.max_time_step * args.max_time_step_fraction): remove_pickle([reward_fname, sil_fname, rollout_fname], prev_ckpt_t) if args.load_pretrained_model: remove_pickle([class_reward_fname], prev_ckpt_t) remove_pickle([sharedmem_fname, sharedmem_params_fname, sharedmem_trees_fname], prev_ckpt_t) if rollout_buffer is not None and args.priority_memory: remove_pickle([rolloutmem_fname, rolloutmem_params_fname, rolloutmem_trees_fname], prev_ckpt_t) logger.info('Done training, removed ckpt from step {}'.format(prev_ckpt_t)) sess.close() if pretrain_sess: pretrain_sess.close()
learning_rate_input = tf.placeholder("float")

grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                              decay=RMSP_ALPHA,
                              momentum=0.0,
                              epsilon=RMSP_EPSILON,
                              clip_norm=GRAD_NORM_CLIP,
                              device=device)

for i in range(args.threads):
    training_thread = A3CTrainingThread(
        i, global_network, args.initial_learning_rate, learning_rate_input,
        grad_applier, args.max_time_step, device=device,
        environment=make_env(i + 1))  # gym.make(args.gym_env))
    training_threads.append(training_thread)
    # os._exit(0)

############################################################################
# summary for tensorboard
from summaries import setup_summaries
############################################################################

# prepare session
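# Minimal sketch (anneal_lr and apply_gradients_op are assumed names, not taken
# from these snippets) of how the shared learning_rate_input placeholder is
# typically fed: the rate decays linearly with the global step and every worker
# feeds it on each apply-gradients call, so all workers follow one schedule.
def anneal_lr(initial_lr, global_t, max_time_step):
    lr = initial_lr * (max_time_step - global_t) / max_time_step
    return max(lr, 0.0)

# sess.run(apply_gradients_op,
#          feed_dict={learning_rate_input: anneal_lr(args.initial_learning_rate,
#                                                    global_t,
#                                                    args.max_time_step)})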
device=device)

# set up a synchronization mutex
mutex = threading.Lock()
# set up a synchronization condition
condition = threading.Condition()
# Shared variable for inter-thread communication: it stores jobs that have just
# finished their previous operation, i.e., it notifies the machine responsible
# for the next operation that the job is now waiting to be processed.
arrived_jobs = list()
for i in range(MACHINE_SIZE):
    arrived_jobs.append(list())

terminal_count = [0]

for i in range(PARALLEL_SIZE):
    training_thread = A3CTrainingThread(i, initial_learning_rate,
                                        learning_rate_input,
                                        grad_applier, MAX_TIME_EPISODE,
                                        device=device,
                                        arrived_jobs=arrived_jobs,
                                        condition=condition)
    training_threads.append(training_thread)

# prepare session
sess = tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                        allow_soft_placement=True))
init = tf.global_variables_initializer()
sess.run(init)

# summary for tensorboard
score_input = tf.placeholder(tf.int32)
tf.summary.scalar("score", score_input)
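# Minimal sketch (hypothetical helper names) of the hand-off described by the
# comments above: a machine's worker waits on the shared Condition until a job
# that finished its previous operation is appended to that machine's
# arrived_jobs slot, and whoever finishes an operation notifies the waiters.
import threading

condition = threading.Condition()
arrived_jobs = [[] for _ in range(3)]  # one inbox per machine

def finish_operation(job, next_machine):
    with condition:
        arrived_jobs[next_machine].append(job)
        condition.notify_all()  # wake machines waiting for new work

def wait_for_job(machine_id):
    with condition:
        while not arrived_jobs[machine_id]:
            condition.wait()  # lock released while sleeping, re-acquired on wake
        return arrived_jobs[machine_id].pop(0)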
def get_game(thread_index):
    from game_state_env import GameStateGymEnv
    import gym
    env = gym.make('Catcher-v0')
    return GameStateGymEnv(env)

for i in range(PARALLEL_SIZE):
    training_thread = A3CTrainingThread(i, global_network,
                                        initial_learning_rate,
                                        learning_rate_input,
                                        grad_applier, MAX_TIME_STEP,
                                        device=device,
                                        game_function=get_game,
                                        local_network=make_network)
    training_threads.append(training_thread)

# prepare session
sess = tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                        allow_soft_placement=True))
init = tf.initialize_all_variables()
sess.run(init)

# summary for tensorboard
score_input = tf.placeholder(tf.int32)
def run_a3c(args): """ python3 run_experiment.py --gym-env=PongNoFrameskip-v4 --parallel-size=16 --initial-learn-rate=7e-4 --use-lstm --use-mnih-2015 python3 run_experiment.py --gym-env=PongNoFrameskip-v4 --parallel-size=16 --initial-learn-rate=7e-4 --use-lstm --use-mnih-2015 --use-transfer --not-transfer-fc2 --transfer-folder=<> python3 run_experiment.py --gym-env=PongNoFrameskip-v4 --parallel-size=16 --initial-learn-rate=7e-4 --use-lstm --use-mnih-2015 --use-transfer --not-transfer-fc2 --transfer-folder=<> --load-pretrained-model --onevsall-mtl --pretrained-model-folder=<> --use-pretrained-model-as-advice --use-pretrained-model-as-reward-shaping """ from game_ac_network import GameACFFNetwork, GameACLSTMNetwork from a3c_training_thread import A3CTrainingThread if args.use_gpu: assert args.cuda_devices != '' os.environ['CUDA_VISIBLE_DEVICES'] = args.cuda_devices else: os.environ['CUDA_VISIBLE_DEVICES'] = '' import tensorflow as tf def log_uniform(lo, hi, rate): log_lo = math.log(lo) log_hi = math.log(hi) v = log_lo * (1 - rate) + log_hi * rate return math.exp(v) if not os.path.exists('results/a3c'): os.makedirs('results/a3c') if args.folder is not None: folder = 'results/a3c/{}_{}'.format(args.gym_env.replace('-', '_'), args.folder) else: folder = 'results/a3c/{}'.format(args.gym_env.replace('-', '_')) end_str = '' if args.use_mnih_2015: end_str += '_mnih2015' if args.use_lstm: end_str += '_lstm' if args.unclipped_reward: end_str += '_rawreward' elif args.log_scale_reward: end_str += '_logreward' if args.transformed_bellman: end_str += '_transformedbell' if args.use_transfer: end_str += '_transfer' if args.not_transfer_conv2: end_str += '_noconv2' elif args.not_transfer_conv3 and args.use_mnih_2015: end_str += '_noconv3' elif args.not_transfer_fc1: end_str += '_nofc1' elif args.not_transfer_fc2: end_str += '_nofc2' if args.finetune_upper_layers_only: end_str += '_tune_upperlayers' if args.train_with_demo_num_steps > 0 or args.train_with_demo_num_epochs > 0: end_str += '_pretrain_ina3c' if args.use_demo_threads: end_str += '_demothreads' if args.load_pretrained_model: if args.use_pretrained_model_as_advice: end_str += '_modelasadvice' if args.use_pretrained_model_as_reward_shaping: end_str += '_modelasshaping' folder += end_str if args.append_experiment_num is not None: folder += '_' + args.append_experiment_num if False: from common.util import LogFormatter fh = logging.FileHandler('{}/a3c.log'.format(folder), mode='w') fh.setLevel(logging.DEBUG) formatter = LogFormatter( '%(asctime)s - %(name)s - %(levelname)s - %(message)s') fh.setFormatter(formatter) logger.addHandler(fh) demo_memory = None num_demos = 0 max_reward = 0. 
if args.load_memory or args.load_demo_cam: if args.demo_memory_folder is not None: demo_memory_folder = args.demo_memory_folder else: demo_memory_folder = 'collected_demo/{}'.format( args.gym_env.replace('-', '_')) if args.load_memory: # FIXME: use new load_memory function demo_memory, actions_ctr, max_reward = load_memory( args.gym_env, demo_memory_folder, imgs_normalized=True) #, create_symmetry=True) action_freq = [ actions_ctr[a] for a in range(demo_memory[0].num_actions) ] num_demos = len(demo_memory) demo_memory_cam = None if args.load_demo_cam: demo_cam, _, total_rewards_cam, _ = load_memory( name=None, demo_memory_folder=demo_memory_folder, demo_ids=args.demo_cam_id, imgs_normalized=False) demo_cam = demo_cam[int(args.demo_cam_id)] demo_memory_cam = np.zeros((len(demo_cam), demo_cam.height, demo_cam.width, demo_cam.phi_length), dtype=np.float32) for i in range(len(demo_cam)): s0 = (demo_cam[i])[0] demo_memory_cam[i] = np.copy(s0) del demo_cam logger.info("loaded demo {} for testing CAM".format(args.demo_cam_id)) device = "/cpu:0" gpu_options = None if args.use_gpu: device = "/gpu:" + os.environ["CUDA_VISIBLE_DEVICES"] gpu_options = tf.GPUOptions( per_process_gpu_memory_fraction=args.gpu_fraction) initial_learning_rate = args.initial_learn_rate logger.info('Initial Learning Rate={}'.format(initial_learning_rate)) time.sleep(2) global_t = 0 pretrain_global_t = 0 pretrain_epoch = 0 rewards = {'train': {}, 'eval': {}} best_model_reward = -(sys.maxsize) stop_requested = False game_state = GameState(env_id=args.gym_env) action_size = game_state.env.action_space.n game_state.close() del game_state.env del game_state config = tf.ConfigProto(gpu_options=gpu_options, log_device_placement=False, allow_soft_placement=True) pretrained_model = None pretrained_model_sess = None if args.load_pretrained_model: if args.onevsall_mtl: from game_class_network import MTLBinaryClassNetwork as PretrainedModelNetwork elif args.onevsall_mtl_linear: from game_class_network import MTLMultivariateNetwork as PretrainedModelNetwork else: from game_class_network import MultiClassNetwork as PretrainedModelNetwork logger.error("Not supported yet!") assert False if args.pretrained_model_folder is not None: pretrained_model_folder = args.pretrained_model_folder else: pretrained_model_folder = '{}_classifier_use_mnih_onevsall_mtl'.format( args.gym_env.replace('-', '_')) PretrainedModelNetwork.use_mnih_2015 = args.use_mnih_2015 pretrained_model = PretrainedModelNetwork(action_size, -1, device) pretrained_model_sess = tf.Session(config=config, graph=pretrained_model.graph) pretrained_model.load( pretrained_model_sess, '{}/{}_checkpoint'.format(pretrained_model_folder, args.gym_env.replace('-', '_'))) if args.use_lstm: GameACLSTMNetwork.use_mnih_2015 = args.use_mnih_2015 global_network = GameACLSTMNetwork(action_size, -1, device) else: GameACFFNetwork.use_mnih_2015 = args.use_mnih_2015 global_network = GameACFFNetwork(action_size, -1, device) training_threads = [] learning_rate_input = tf.placeholder(tf.float32, shape=(), name="opt_lr") grad_applier = tf.train.RMSPropOptimizer(learning_rate=learning_rate_input, decay=args.rmsp_alpha, epsilon=args.rmsp_epsilon) A3CTrainingThread.log_interval = args.log_interval A3CTrainingThread.performance_log_interval = args.performance_log_interval A3CTrainingThread.local_t_max = args.local_t_max A3CTrainingThread.demo_t_max = args.demo_t_max A3CTrainingThread.use_lstm = args.use_lstm A3CTrainingThread.action_size = action_size A3CTrainingThread.entropy_beta = args.entropy_beta 
A3CTrainingThread.demo_entropy_beta = args.demo_entropy_beta A3CTrainingThread.gamma = args.gamma A3CTrainingThread.use_mnih_2015 = args.use_mnih_2015 A3CTrainingThread.env_id = args.gym_env A3CTrainingThread.finetune_upper_layers_only = args.finetune_upper_layers_only A3CTrainingThread.transformed_bellman = args.transformed_bellman A3CTrainingThread.clip_norm = args.grad_norm_clip A3CTrainingThread.use_grad_cam = args.use_grad_cam if args.unclipped_reward: A3CTrainingThread.reward_type = "RAW" elif args.log_scale_reward: A3CTrainingThread.reward_type = "LOG" else: A3CTrainingThread.reward_type = "CLIP" n_shapers = args.parallel_size #int(args.parallel_size * .25) mod = args.parallel_size // n_shapers for i in range(args.parallel_size): is_reward_shape = False is_advice = False if i % mod == 0: is_reward_shape = args.use_pretrained_model_as_reward_shaping is_advice = args.use_pretrained_model_as_advice training_thread = A3CTrainingThread( i, global_network, initial_learning_rate, learning_rate_input, grad_applier, args.max_time_step, device=device, pretrained_model=pretrained_model, pretrained_model_sess=pretrained_model_sess, advice=is_advice, reward_shaping=is_reward_shape) training_threads.append(training_thread) # prepare session sess = tf.Session(config=config) if args.use_transfer: if args.transfer_folder is not None: transfer_folder = args.transfer_folder else: transfer_folder = 'results/pretrain_models/{}'.format( args.gym_env.replace('-', '_')) end_str = '' if args.use_mnih_2015: end_str += '_mnih2015' end_str += '_l2beta1E-04_batchprop' #TODO: make this an argument transfer_folder += end_str transfer_folder += '/transfer_model' if args.not_transfer_conv2: transfer_var_list = [ global_network.W_conv1, global_network.b_conv1 ] elif (args.not_transfer_conv3 and args.use_mnih_2015): transfer_var_list = [ global_network.W_conv1, global_network.b_conv1, global_network.W_conv2, global_network.b_conv2 ] elif args.not_transfer_fc1: transfer_var_list = [ global_network.W_conv1, global_network.b_conv1, global_network.W_conv2, global_network.b_conv2, ] if args.use_mnih_2015: transfer_var_list += [ global_network.W_conv3, global_network.b_conv3 ] elif args.not_transfer_fc2: transfer_var_list = [ global_network.W_conv1, global_network.b_conv1, global_network.W_conv2, global_network.b_conv2, global_network.W_fc1, global_network.b_fc1 ] if args.use_mnih_2015: transfer_var_list += [ global_network.W_conv3, global_network.b_conv3 ] else: transfer_var_list = [ global_network.W_conv1, global_network.b_conv1, global_network.W_conv2, global_network.b_conv2, global_network.W_fc1, global_network.b_fc1, global_network.W_fc2, global_network.b_fc2 ] if args.use_mnih_2015: transfer_var_list += [ global_network.W_conv3, global_network.b_conv3 ] global_network.load_transfer_model( sess, folder=transfer_folder, not_transfer_fc2=args.not_transfer_fc2, not_transfer_fc1=args.not_transfer_fc1, not_transfer_conv3=(args.not_transfer_conv3 and args.use_mnih_2015), not_transfer_conv2=args.not_transfer_conv2, var_list=transfer_var_list) def initialize_uninitialized(sess): global_vars = tf.global_variables() is_not_initialized = sess.run( [tf.is_variable_initialized(var) for var in global_vars]) not_initialized_vars = [ v for (v, f) in zip(global_vars, is_not_initialized) if not f ] if len(not_initialized_vars): sess.run(tf.variables_initializer(not_initialized_vars)) if args.use_transfer: initialize_uninitialized(sess) else: sess.run(tf.global_variables_initializer()) # summary writer for tensorboard summary_op = 
tf.summary.merge_all() summary_writer = tf.summary.FileWriter( 'results/log/a3c/{}/'.format(args.gym_env.replace('-', '_')) + folder[12:], sess.graph) # init or load checkpoint with saver root_saver = tf.train.Saver(max_to_keep=1) saver = tf.train.Saver(max_to_keep=6) best_saver = tf.train.Saver(max_to_keep=1) checkpoint = tf.train.get_checkpoint_state(folder) if checkpoint and checkpoint.model_checkpoint_path: root_saver.restore(sess, checkpoint.model_checkpoint_path) logger.info("checkpoint loaded:{}".format( checkpoint.model_checkpoint_path)) tokens = checkpoint.model_checkpoint_path.split("-") # set global step global_t = int(tokens[-1]) logger.info(">>> global step set: {}".format(global_t)) # set wall time wall_t_fname = folder + '/' + 'wall_t.' + str(global_t) with open(wall_t_fname, 'r') as f: wall_t = float(f.read()) with open(folder + '/pretrain_global_t', 'r') as f: pretrain_global_t = int(f.read()) with open(folder + '/model_best/best_model_reward', 'r') as f_best_model_reward: best_model_reward = float(f_best_model_reward.read()) rewards = pickle.load( open( folder + '/' + args.gym_env.replace('-', '_') + '-a3c-rewards.pkl', 'rb')) else: logger.warning("Could not find old checkpoint") # set wall time wall_t = 0.0 prepare_dir(folder, empty=True) prepare_dir(folder + '/model_checkpoints', empty=True) prepare_dir(folder + '/model_best', empty=True) prepare_dir(folder + '/frames', empty=True) lock = threading.Lock() test_lock = False if global_t == 0: test_lock = True last_temp_global_t = global_t ispretrain_markers = [False] * args.parallel_size num_demo_thread = 0 ctr_demo_thread = 0 def train_function(parallel_index): nonlocal global_t, pretrain_global_t, pretrain_epoch, \ rewards, test_lock, lock, \ last_temp_global_t, ispretrain_markers, num_demo_thread, \ ctr_demo_thread training_thread = training_threads[parallel_index] training_thread.set_summary_writer(summary_writer) # set all threads as demo threads training_thread.is_demo_thread = args.load_memory and args.use_demo_threads if training_thread.is_demo_thread or args.train_with_demo_num_steps > 0 or args.train_with_demo_num_epochs: training_thread.pretrain_init(demo_memory) if global_t == 0 and ( args.train_with_demo_num_steps > 0 or args.train_with_demo_num_epochs > 0) and parallel_index < 2: ispretrain_markers[parallel_index] = True training_thread.replay_mem_reset() # Pretraining with demo memory logger.info("t_idx={} pretrain starting".format(parallel_index)) while ispretrain_markers[parallel_index]: if stop_requested: return if pretrain_global_t > args.train_with_demo_num_steps and pretrain_epoch > args.train_with_demo_num_epochs: # At end of pretraining, reset state training_thread.replay_mem_reset() training_thread.episode_reward = 0 training_thread.local_t = 0 if args.use_lstm: training_thread.local_network.reset_state() ispretrain_markers[parallel_index] = False logger.info( "t_idx={} pretrain ended".format(parallel_index)) break diff_pretrain_global_t, _ = training_thread.demo_process( sess, pretrain_global_t) for _ in range(diff_pretrain_global_t): pretrain_global_t += 1 if pretrain_global_t % 10000 == 0: logger.debug( "pretrain_global_t={}".format(pretrain_global_t)) pretrain_epoch += 1 if pretrain_epoch % 1000 == 0: logger.debug("pretrain_epoch={}".format(pretrain_epoch)) # Waits for all threads to finish pretraining while not stop_requested and any(ispretrain_markers): time.sleep(0.01) # Evaluate model before training if not stop_requested and global_t == 0: with lock: if parallel_index == 0: test_reward, 
test_steps, test_episodes = training_threads[ 0].testing(sess, args.eval_max_steps, global_t, folder, demo_memory_cam=demo_memory_cam) rewards['eval'][global_t] = (test_reward, test_steps, test_episodes) saver.save( sess, folder + '/model_checkpoints/' + '{}_checkpoint'.format(args.gym_env.replace('-', '_')), global_step=global_t) save_best_model(test_reward) test_lock = False # all threads wait until evaluation finishes while not stop_requested and test_lock: time.sleep(0.01) # set start_time start_time = time.time() - wall_t training_thread.set_start_time(start_time) episode_end = True use_demo_thread = False while True: if stop_requested: return if global_t >= (args.max_time_step * args.max_time_step_fraction): return if args.use_demo_threads and global_t < args.max_steps_threads_as_demo and episode_end and num_demo_thread < 16: #if num_demo_thread < 2: demo_rate = 1.0 * (args.max_steps_threads_as_demo - global_t) / args.max_steps_threads_as_demo if demo_rate < 0.0333: demo_rate = 0.0333 if np.random.random() <= demo_rate and num_demo_thread < 16: ctr_demo_thread += 1 training_thread.replay_mem_reset(D_idx=ctr_demo_thread % num_demos) num_demo_thread += 1 logger.info( "idx={} as demo thread started ({}/16) rate={}".format( parallel_index, num_demo_thread, demo_rate)) use_demo_thread = True if use_demo_thread: diff_global_t, episode_end = training_thread.demo_process( sess, global_t) if episode_end: num_demo_thread -= 1 use_demo_thread = False logger.info("idx={} demo thread concluded ({}/16)".format( parallel_index, num_demo_thread)) else: diff_global_t, episode_end = training_thread.process( sess, global_t, rewards) for _ in range(diff_global_t): global_t += 1 if global_t % args.eval_freq == 0: temp_global_t = global_t lock.acquire() try: # catch multiple threads getting in at the same time if last_temp_global_t == temp_global_t: logger.info("Threading race problem averted!") continue test_lock = True test_reward, test_steps, n_episodes = training_thread.testing( sess, args.eval_max_steps, temp_global_t, folder, demo_memory_cam=demo_memory_cam) rewards['eval'][temp_global_t] = (test_reward, test_steps, n_episodes) if temp_global_t % ( (args.max_time_step * args.max_time_step_fraction) // 5) == 0: saver.save(sess, folder + '/model_checkpoints/' + '{}_checkpoint'.format( args.gym_env.replace('-', '_')), global_step=temp_global_t, write_meta_graph=False) if test_reward > best_model_reward: save_best_model(test_reward) test_lock = False last_temp_global_t = temp_global_t finally: lock.release() if global_t % ( (args.max_time_step * args.max_time_step_fraction) // 5) == 0: saver.save( sess, folder + '/model_checkpoints/' + '{}_checkpoint'.format(args.gym_env.replace('-', '_')), global_step=global_t, write_meta_graph=False) # all threads wait until evaluation finishes while not stop_requested and test_lock: time.sleep(0.01) def signal_handler(signal, frame): nonlocal stop_requested logger.info('You pressed Ctrl+C!') stop_requested = True if stop_requested and global_t == 0: sys.exit(1) def save_best_model(test_reward): nonlocal best_model_reward best_model_reward = test_reward with open(folder + '/model_best/best_model_reward', 'w') as f_best_model_reward: f_best_model_reward.write(str(best_model_reward)) best_saver.save( sess, folder + '/model_best/' + '{}_checkpoint'.format(args.gym_env.replace('-', '_'))) train_threads = [] for i in range(args.parallel_size): train_threads.append( threading.Thread(target=train_function, args=(i, ))) signal.signal(signal.SIGINT, signal_handler) 
signal.signal(signal.SIGTERM, signal_handler)

# set start time
start_time = time.time() - wall_t

for t in train_threads:
    t.start()

logger.info('Press Ctrl+C to stop')

for t in train_threads:
    t.join()

logger.info('Now saving data. Please wait')

# write wall time
wall_t = time.time() - start_time
wall_t_fname = folder + '/' + 'wall_t.' + str(global_t)
with open(wall_t_fname, 'w') as f:
    f.write(str(wall_t))
with open(folder + '/pretrain_global_t', 'w') as f:
    f.write(str(pretrain_global_t))

root_saver.save(
    sess,
    folder + '/{}_checkpoint_a3c'.format(args.gym_env.replace('-', '_')),
    global_step=global_t)

# write and close the rewards pickle explicitly instead of leaving the file handle open
with open(
        folder + '/' + args.gym_env.replace('-', '_') + '-a3c-rewards.pkl',
        'wb') as f:
    pickle.dump(rewards, f, pickle.HIGHEST_PROTOCOL)

logger.info('Data saved!')
sess.close()
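# The reward_type flag set on A3CTrainingThread earlier in this script
# ("RAW" / "LOG" / "CLIP") selects how environment rewards are rescaled inside
# the thread class, which is defined elsewhere.  A minimal sketch, under that
# assumption, of what the three modes conventionally mean (pass-through,
# signed log scaling, clipping to [-1, 1]):
import numpy as np

def transform_reward(reward, reward_type="CLIP"):
    if reward_type == "RAW":          # --unclipped_reward
        return float(reward)
    if reward_type == "LOG":          # --log_scale_reward
        return float(np.sign(reward) * np.log(1.0 + np.abs(reward)))
    return float(np.clip(reward, -1.0, 1.0))  # default: clip to [-1, 1]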
training_threads = [] learning_rate_input = tf.placeholder("float") grad_applier = RMSPropApplier(learning_rate=learning_rate_input, decay=options.rmsp_alpha, momentum=0.0, epsilon=options.rmsp_epsilon, clip_norm=options.grad_norm_clip, device=device) for i in range(options.parallel_size): training_thread = A3CTrainingThread(i, global_network, initial_learning_rate, learning_rate_input, grad_applier, options.max_time_step, device=device, options=options) training_threads.append(training_thread) # prepare session sess = tf.Session(config=tf.ConfigProto(log_device_placement=False, gpu_options={'allow_growth': True}, allow_soft_placement=True)) init = tf.global_variables_initializer() sess.run(init) # summary for tensorboard score_input = tf.placeholder(tf.int32)
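# gpu_options is passed above as a plain dict, which relies on ConfigProto
# building the GPUOptions message from it.  If that shorthand misbehaves with
# an older protobuf, an explicit, equivalent session config (TF 1.x) is:
import tensorflow as tf

explicit_config = tf.ConfigProto(
    log_device_placement=False,
    gpu_options=tf.GPUOptions(allow_growth=True),
    allow_soft_placement=True)
# sess = tf.Session(config=explicit_config)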
def train(): #initial learning rate initial_learning_rate = log_uniform(INITIAL_ALPHA_LOW, INITIAL_ALPHA_HIGH, INITIAL_ALPHA_LOG_RATE) # parameter server and worker information ps_hosts = np.zeros(FLAGS.ps_hosts_num, dtype=object) worker_hosts = np.zeros(FLAGS.worker_hosts_num, dtype=object) port_num = FLAGS.st_port_num for i in range(FLAGS.ps_hosts_num): ps_hosts[i] = str(FLAGS.hostname) + ":" + str(port_num) port_num += 1 for i in range(FLAGS.worker_hosts_num): worker_hosts[i] = str(FLAGS.hostname) + ":" + str(port_num) port_num += 1 ps_hosts = list(ps_hosts) worker_hosts = list(worker_hosts) # Create a cluster from the parameter server and worker hosts. cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts}) # Create and start a server for the local task. server = tf.train.Server(cluster, job_name=FLAGS.job_name, task_index=FLAGS.task_index) if FLAGS.job_name == "ps": server.join() elif FLAGS.job_name == "worker": device = tf.train.replica_device_setter( worker_device="/job:worker/task:%d" % FLAGS.task_index, cluster=cluster) learning_rate_input = tf.placeholder("float") grad_applier = RMSPropApplier(learning_rate=learning_rate_input, decay=RMSP_ALPHA, momentum=0.0, epsilon=RMSP_EPSILON, clip_norm=GRAD_NORM_CLIP, device=device) tf.set_random_seed(1) #There are no global network training_thread = A3CTrainingThread(0, "", initial_learning_rate, learning_rate_input, grad_applier, MAX_TIME_STEP, device=device, FLAGS=FLAGS, task_index=FLAGS.task_index) # prepare session with tf.device( tf.train.replica_device_setter( worker_device="/job:worker/task:%d" % FLAGS.task_index, cluster=cluster)): # flag for task flag = tf.get_variable('flag', [], initializer=tf.constant_initializer(0), trainable=False) flag_ph = tf.placeholder(flag.dtype, shape=flag.get_shape()) flag_ops = flag.assign(flag_ph) # global step global_step = tf.get_variable( 'global_step', [], initializer=tf.constant_initializer(0), trainable=False) global_step_ph = tf.placeholder(global_step.dtype, shape=global_step.get_shape()) global_step_ops = global_step.assign(global_step_ph) # score for tensorboard and score_set for genetic algorithm score = tf.get_variable('score', [], initializer=tf.constant_initializer(-21), trainable=False) score_ph = tf.placeholder(score.dtype, shape=score.get_shape()) score_ops = score.assign(score_ph) score_set = np.zeros(FLAGS.worker_hosts_num, dtype=object) score_set_ph = np.zeros(FLAGS.worker_hosts_num, dtype=object) score_set_ops = np.zeros(FLAGS.worker_hosts_num, dtype=object) for i in range(FLAGS.worker_hosts_num): score_set[i] = tf.get_variable( 'score' + str(i), [], initializer=tf.constant_initializer(-1000), trainable=False) score_set_ph[i] = tf.placeholder( score_set[i].dtype, shape=score_set[i].get_shape()) score_set_ops[i] = score_set[i].assign(score_set_ph[i]) # fixed path of earlier task fixed_path_tf = np.zeros((FLAGS.L, FLAGS.M), dtype=object) fixed_path_ph = np.zeros((FLAGS.L, FLAGS.M), dtype=object) fixed_path_ops = np.zeros((FLAGS.L, FLAGS.M), dtype=object) for i in range(FLAGS.L): for j in range(FLAGS.M): fixed_path_tf[i, j] = tf.get_variable( 'fixed_path' + str(i) + "-" + str(j), [], initializer=tf.constant_initializer(0), trainable=False) fixed_path_ph[i, j] = tf.placeholder( fixed_path_tf[i, j].dtype, shape=fixed_path_tf[i, j].get_shape()) fixed_path_ops[i, j] = fixed_path_tf[i, j].assign( fixed_path_ph[i, j]) # parameters on PathNet vars_ = training_thread.local_network.get_vars() vars_ph = np.zeros(len(vars_), dtype=object) vars_ops = np.zeros(len(vars_), 
dtype=object) for i in range(len(vars_)): vars_ph[i] = tf.placeholder(vars_[i].dtype, shape=vars_[i].get_shape()) vars_ops[i] = vars_[i].assign(vars_ph[i]) # initialization init_op = tf.global_variables_initializer() # summary for tensorboard tf.summary.scalar("score", score) summary_op = tf.summary.merge_all() saver = tf.train.Saver() sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0), global_step=global_step, logdir=FLAGS.log_dir, summary_op=summary_op, saver=saver, init_op=init_op) with sv.managed_session(server.target) as sess: if (FLAGS.task_index != (FLAGS.worker_hosts_num - 1)): for task in range(2): while True: if (sess.run([flag])[0] == (task + 1)): break time.sleep(2) # Set fixed_path fixed_path = np.zeros((FLAGS.L, FLAGS.M), dtype=float) for i in range(FLAGS.L): for j in range(FLAGS.M): if (sess.run([fixed_path_tf[i, j]])[0] == 1): fixed_path[i, j] = 1.0 training_thread.local_network.set_fixed_path(fixed_path) # set start_time wall_t = 0.0 start_time = time.time() - wall_t training_thread.set_start_time(start_time) while True: if sess.run([global_step])[0] > (MAX_TIME_STEP * (task + 1)): break diff_global_t = training_thread.process( sess, sess.run([global_step])[0], "", summary_op, "", score_ph, score_ops, "", FLAGS, score_set_ph[FLAGS.task_index], score_set_ops[FLAGS.task_index], score_set[FLAGS.task_index]) sess.run( global_step_ops, { global_step_ph: sess.run([global_step])[0] + diff_global_t }) else: fixed_path = np.zeros((FLAGS.L, FLAGS.M), dtype=float) vars_backup = np.zeros(len(vars_), dtype=object) vars_backup = sess.run(vars_) winner_idx = 0 for task in range(2): # Generating randomly geopath geopath_set = np.zeros(FLAGS.worker_hosts_num - 1, dtype=object) for i in range(FLAGS.worker_hosts_num - 1): geopath_set[i] = pathnet.get_geopath( FLAGS.L, FLAGS.M, FLAGS.N) tmp = np.zeros((FLAGS.L, FLAGS.M), dtype=float) for j in range(FLAGS.L): for k in range(FLAGS.M): if ((geopath_set[i][j, k] == 1.0) or (fixed_path[j, k] == 1.0)): tmp[j, k] = 1.0 pathnet.geopath_insert( sess, training_thread.local_network. geopath_update_placeholders_set[i], training_thread.local_network. geopath_update_ops_set[i], tmp, FLAGS.L, FLAGS.M) print("Geopath Setting Done") sess.run(flag_ops, {flag_ph: (task + 1)}) print("=============Task" + str(task + 1) + "============") score_subset = np.zeros(FLAGS.B, dtype=float) score_set_print = np.zeros(FLAGS.worker_hosts_num, dtype=float) rand_idx = range(FLAGS.worker_hosts_num - 1) np.random.shuffle(rand_idx) rand_idx = rand_idx[:FLAGS.B] while True: if sess.run([global_step])[0] > (MAX_TIME_STEP * (task + 1)): break flag_sum = 0 for i in range(FLAGS.worker_hosts_num - 1): score_set_print[i] = sess.run([score_set[i]])[0] print(score_set_print) for i in range(len(rand_idx)): score_subset[i] = sess.run( [score_set[rand_idx[i]]])[0] if (score_subset[i] == -1000): flag_sum = 1 break if (flag_sum == 0): winner_idx = rand_idx[np.argmax(score_subset)] print( str(sess.run([global_step])[0]) + " Step Score: " + str(sess.run([score_set[winner_idx]])[0])) for i in rand_idx: if (i != winner_idx): geopath_set[i] = np.copy( geopath_set[winner_idx]) geopath_set[i] = pathnet.mutation( geopath_set[i], FLAGS.L, FLAGS.M, FLAGS.N) tmp = np.zeros((FLAGS.L, FLAGS.M), dtype=float) for j in range(FLAGS.L): for k in range(FLAGS.M): if ((geopath_set[i][j, k] == 1.0) or (fixed_path[j, k] == 1.0)): tmp[j, k] = 1.0 pathnet.geopath_insert( sess, training_thread.local_network. geopath_update_placeholders_set[i], training_thread.local_network. 
geopath_update_ops_set[i], tmp, FLAGS.L, FLAGS.M) sess.run(score_set_ops[i], {score_set_ph[i]: -1000}) rand_idx = range(FLAGS.worker_hosts_num - 1) np.random.shuffle(rand_idx) rand_idx = rand_idx[:FLAGS.B] else: time.sleep(5) # fixed_path setting fixed_path = geopath_set[winner_idx] for i in range(FLAGS.L): for j in range(FLAGS.M): if (fixed_path[i, j] == 1.0): sess.run(fixed_path_ops[i, j], {fixed_path_ph[i, j]: 1}) training_thread.local_network.set_fixed_path(fixed_path) # initialization of parameters except fixed_path vars_idx = training_thread.local_network.get_vars_idx() for i in range(len(vars_idx)): if (vars_idx[i] == 1.0): sess.run(vars_ops[i], {vars_ph[i]: vars_backup[i]}) sv.stop() print("Done")
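# log_uniform() is defined elsewhere in this code base; train() above only
# calls it to pick the initial learning rate.  A minimal sketch of the usual
# implementation (log-space interpolation between a low and a high rate):
import math

def log_uniform(lo, hi, rate):
    log_lo = math.log(lo)
    log_hi = math.log(hi)
    return math.exp(log_lo * (1.0 - rate) + log_hi * rate)

# e.g. log_uniform(1e-4, 1e-2, 0.5) -> 1e-3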
def train(): #initial learning rate pinitial_learning_rate = log_uniform(PINITIAL_ALPHA_LOW, PINITIAL_ALPHA_HIGH, INITIAL_ALPHA_LOG_RATE) vinitial_learning_rate = log_uniform(VINITIAL_ALPHA_LOW, VINITIAL_ALPHA_HIGH, INITIAL_ALPHA_LOG_RATE) # parameter server and worker information ps_hosts = np.zeros(FLAGS.ps_hosts_num, dtype=object) worker_hosts = np.zeros(FLAGS.worker_hosts_num, dtype=object) port_num = FLAGS.st_port_num for i in range(FLAGS.ps_hosts_num): ps_hosts[i] = str(FLAGS.hostname) + ":" + str(port_num) port_num += 1 for i in range(FLAGS.worker_hosts_num): worker_hosts[i] = str(FLAGS.hostname) + ":" + str(port_num) port_num += 1 ps_hosts = list(ps_hosts) worker_hosts = list(worker_hosts) # Create a cluster from the parameter server and worker hosts. cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts}) # Create and start a server for the local task. server = tf.train.Server(cluster, job_name=FLAGS.job_name, task_index=FLAGS.task_index) if FLAGS.job_name == "ps": server.join() elif FLAGS.job_name == "worker": device = tf.train.replica_device_setter( worker_device="/job:worker/task:%d" % FLAGS.task_index, cluster=cluster) plearning_rate_input = tf.placeholder("float") vlearning_rate_input = tf.placeholder("float") pgrad_applier = RMSPropApplier(learning_rate=plearning_rate_input, decay=RMSP_ALPHA, momentum=0.0, epsilon=RMSP_EPSILON, clip_norm=GRAD_NORM_CLIP, device=device) vgrad_applier = RMSPropApplier(learning_rate=vlearning_rate_input, decay=RMSP_ALPHA, momentum=0.0, epsilon=RMSP_EPSILON, clip_norm=GRAD_NORM_CLIP, device=device) tf.set_random_seed(1) #There are no global network training_thread = A3CTrainingThread(0, "", pinitial_learning_rate, plearning_rate_input, pgrad_applier, vinitial_learning_rate, vlearning_rate_input, vgrad_applier, MAX_TIME_STEP, device=device, task_index=FLAGS.task_index) # prepare session with tf.device( tf.train.replica_device_setter( worker_device="/job:worker/task:%d" % FLAGS.task_index, cluster=cluster)): global_step = tf.get_variable( 'global_step', [], initializer=tf.constant_initializer(0), trainable=False) global_step_ph = tf.placeholder(global_step.dtype, shape=global_step.get_shape()) global_step_ops = global_step.assign(global_step_ph) score = tf.get_variable('score', [], initializer=tf.constant_initializer(-21), trainable=False) score_ph = tf.placeholder(score.dtype, shape=score.get_shape()) score_ops = score.assign(score_ph) init_op = tf.global_variables_initializer() # summary for tensorboard tf.summary.scalar("score", score) summary_op = tf.summary.merge_all() saver = tf.train.Saver() sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0), global_step=global_step, logdir=LOG_FILE, summary_op=summary_op, saver=saver, init_op=init_op) with sv.managed_session(server.target) as sess: # set start_time wall_t = 0.0 start_time = time.time() - wall_t training_thread.set_start_time(start_time) local_t = 0 while True: if sess.run([global_step])[0] > MAX_TIME_STEP: break diff_global_t = training_thread.process( sess, sess.run([global_step])[0], "", summary_op, "", score_ph, score_ops) sess.run(global_step_ops, { global_step_ph: sess.run([global_step])[0] + diff_global_t }) local_t += diff_global_t sv.stop() print("Done")
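# The global_step update above is a read-modify-write (fetch the value, add
# diff_global_t, assign it back), which can drop increments when several
# workers race on the same shared variable.  A sketch of an atomic
# alternative, assuming the same float global_step variable as above:
import tensorflow as tf

increment_ph = tf.placeholder(tf.float32, shape=(), name="increment")
global_step_inc = tf.assign_add(global_step, increment_ph, use_locking=True)
# inside the training loop:
#   sess.run(global_step_inc, {increment_ph: diff_global_t})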
training_threads = [] learning_rate_input = tf.placeholder("float") grad_applier = RMSPropApplier(learning_rate=learning_rate_input, decay=RMSP_ALPHA, momentum=0.0, epsilon=RMSP_EPSILON, clip_norm=GRAD_NORM_CLIP, device=device) for i in range(PARALLEL_SIZE): training_thread = A3CTrainingThread(i, global_network, initial_learning_rate, learning_rate_input, grad_applier, MAX_TIME_STEP, device=device) training_threads.append(training_thread) # prepare session sess = tf.Session(config=tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)) init = tf.global_variables_initializer() sess.run(init) # summary for tensorboard score_input = tf.placeholder(tf.int32) tf.summary.scalar("score", score_input)
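# How the score placeholder above is typically wired into TensorBoard with the
# TF 1.x summary API; the log directory and the per-episode score come from
# the surrounding script, so LOG_FILE here is only a placeholder name:
summary_op = tf.summary.merge_all()
summary_writer = tf.summary.FileWriter(LOG_FILE, sess.graph)

def record_score(sess, writer, summary_op, score_input, score, global_t):
    # evaluate the merged summary with this episode's score and write it out
    summary_str = sess.run(summary_op, feed_dict={score_input: score})
    writer.add_summary(summary_str, global_t)
    writer.flush()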
def train(): #initial learning rate initial_learning_rate = log_uniform(INITIAL_ALPHA_LOW, INITIAL_ALPHA_HIGH, INITIAL_ALPHA_LOG_RATE) # parameter server and worker information ps_hosts = np.zeros(FLAGS.ps_hosts_num,dtype=object); worker_hosts = np.zeros(FLAGS.worker_hosts_num,dtype=object); port_num=FLAGS.st_port_num; for i in range(FLAGS.ps_hosts_num): ps_hosts[i]=str(FLAGS.hostname)+":"+str(port_num); port_num+=1; for i in range(FLAGS.worker_hosts_num): worker_hosts[i]=str(FLAGS.hostname)+":"+str(port_num); port_num+=1; ps_hosts=list(ps_hosts); worker_hosts=list(worker_hosts); # Create a cluster from the parameter server and worker hosts. cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts}) # Create and start a server for the local task. server = tf.train.Server(cluster, job_name=FLAGS.job_name, task_index=FLAGS.task_index) if FLAGS.job_name == "ps": server.join(); elif FLAGS.job_name == "worker": # gpu_assignment = FLAGS.task_index % NUM_GPUS # print("Assigning worker #%d to GPU #%d" % (FLAGS.task_index, gpu_assignment)) # device=tf.train.replica_device_setter( # worker_device="/job:worker/task:%d/gpu:%d" % (FLAGS.task_index, gpu_assignment), # cluster=cluster); device=tf.train.replica_device_setter( worker_device="/job:worker/task:%d" % FLAGS.task_index, cluster=cluster); learning_rate_input = tf.placeholder("float") grad_applier = RMSPropApplier(learning_rate = learning_rate_input, decay = RMSP_ALPHA, momentum = 0.0, epsilon = RMSP_EPSILON, clip_norm = GRAD_NORM_CLIP, device = device) tf.set_random_seed(1); #There are no global network #lock = multiprocessing.Lock() #wrapper = ToDiscrete('constant-7') #env = wrapper(gym.make('gym_doom/DoomBasic-v0')) #env.close() training_thread = A3CTrainingThread(0,"",0,initial_learning_rate,learning_rate_input,grad_applier,MAX_TIME_STEP,device=device,FLAGS=FLAGS,task_index=FLAGS.task_index) # prepare session with tf.device(device): # flag for task flag = tf.get_variable('flag',[],initializer=tf.constant_initializer(0),trainable=False); flag_ph=tf.placeholder(flag.dtype,shape=flag.get_shape()); flag_ops=flag.assign(flag_ph); # global step global_step = tf.get_variable('global_step',[],initializer=tf.constant_initializer(0),trainable=False); global_step_ph=tf.placeholder(global_step.dtype,shape=global_step.get_shape()); global_step_ops=global_step.assign(global_step_ph); # score for tensorboard and score_set for genetic algorithm score = tf.get_variable('score',[],initializer=tf.constant_initializer(-21),trainable=False); score_ph=tf.placeholder(score.dtype,shape=score.get_shape()); score_ops=score.assign(score_ph); score_set=np.zeros(FLAGS.worker_hosts_num,dtype=object); score_set_ph=np.zeros(FLAGS.worker_hosts_num,dtype=object); score_set_ops=np.zeros(FLAGS.worker_hosts_num,dtype=object); for i in range(FLAGS.worker_hosts_num): score_set[i] = tf.get_variable('score'+str(i),[],initializer=tf.constant_initializer(-1000),trainable=False); score_set_ph[i]=tf.placeholder(score_set[i].dtype,shape=score_set[i].get_shape()); score_set_ops[i]=score_set[i].assign(score_set_ph[i]); # fixed path of earlier task fixed_path_tf=np.zeros((FLAGS.L,FLAGS.M),dtype=object); fixed_path_ph=np.zeros((FLAGS.L,FLAGS.M),dtype=object); fixed_path_ops=np.zeros((FLAGS.L,FLAGS.M),dtype=object); for i in range(FLAGS.L): for j in range(FLAGS.M): fixed_path_tf[i,j]=tf.get_variable('fixed_path'+str(i)+"-"+str(j),[],initializer=tf.constant_initializer(0),trainable=False); 
fixed_path_ph[i,j]=tf.placeholder(fixed_path_tf[i,j].dtype,shape=fixed_path_tf[i,j].get_shape()); fixed_path_ops[i,j]=fixed_path_tf[i,j].assign(fixed_path_ph[i,j]); # parameters on PathNet vars_=training_thread.local_network.get_vars(); vars_ph=np.zeros(len(vars_),dtype=object); vars_ops=np.zeros(len(vars_),dtype=object); for i in range(len(vars_)): vars_ph[i]=tf.placeholder(vars_[i].dtype,shape=vars_[i].get_shape()); vars_ops[i]=vars_[i].assign(vars_ph[i]); # initialization init_op=tf.global_variables_initializer(); # summary for tensorboard tf.summary.scalar("score", score); summary_op = tf.summary.merge_all() saver = tf.train.Saver(); sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0), global_step=global_step, logdir=FLAGS.log_dir, summary_op=summary_op, saver=saver, init_op=init_op) try: os.mkdir("./data/graphs") except: pass # config = tf.ConfigProto( # device_count = {'GPU': 0} # ) # config = tf.ConfigProto() # config.gpu_options.allow_growth = True # config.gpu_options.per_process_gpu_memory_fraction = 0.1 with sv.managed_session(server.target) as sess: if(FLAGS.task_index!=(FLAGS.worker_hosts_num-1)): for task in range(2): training_thread.set_training_stage(task) while sess.run([flag])[0] != (task+1): time.sleep(2) # Set fixed_path fixed_path=np.zeros((FLAGS.L,FLAGS.M),dtype=float); for i in range(FLAGS.L): for j in range(FLAGS.M): if(sess.run([fixed_path_tf[i,j]])[0]==1): fixed_path[i,j]=1.0; training_thread.local_network.set_fixed_path(fixed_path); # set start_time wall_t=0.0; start_time = time.time() - wall_t training_thread.set_start_time(start_time) while True: if sess.run([global_step])[0] > (MAX_TIME_STEP*(task+1)): break diff_global_t = training_thread.process(sess, sess.run([global_step])[0], "", summary_op, "",score_ph,score_ops,"",FLAGS,score_set_ph[FLAGS.task_index],score_set_ops[FLAGS.task_index]) sess.run(global_step_ops,{global_step_ph:sess.run([global_step])[0]+diff_global_t}); else: fixed_path=np.zeros((FLAGS.L,FLAGS.M),dtype=float) vars_backup=np.zeros(len(vars_),dtype=object) vars_backup=sess.run(vars_) winner_idx=0 vis = visualize.GraphVisualize([FLAGS.M] * FLAGS.L, True) for task in range(2): # Generating randomly geopath geopath_set=np.zeros(FLAGS.worker_hosts_num-1,dtype=object); for i in range(FLAGS.worker_hosts_num-1): geopath_set[i]=pathnet.get_geopath(FLAGS.L,FLAGS.M,FLAGS.N); tmp=np.zeros((FLAGS.L,FLAGS.M),dtype=float); for j in range(FLAGS.L): for k in range(FLAGS.M): if((geopath_set[i][j,k]==1.0)or(fixed_path[j,k]==1.0)): tmp[j,k]=1.0; pathnet.geopath_insert(sess,training_thread.local_network.geopath_update_placeholders_set[i],training_thread.local_network.geopath_update_ops_set[i],tmp,FLAGS.L,FLAGS.M); print("Geopath Setting Done"); sess.run(flag_ops,{flag_ph:(task+1)}); print("=============Task "+str(task+1)+"============"); score_subset=np.zeros(FLAGS.B,dtype=float); score_set_print=np.zeros(FLAGS.worker_hosts_num,dtype=float); rand_idx=np.arange(FLAGS.worker_hosts_num-1); np.random.shuffle(rand_idx); rand_idx=rand_idx[:FLAGS.B]; while sess.run([global_step])[0] <= (MAX_TIME_STEP*(task+1)): # if (sess.run([global_step])[0]) % 1000 == 0: # print("Saving summary...") # tf.logging.info('Running Summary operation on the chief.') # summary_str = sess.run(summary_op) # sv.summary_computed(sess, summary_str) # tf.logging.info('Finished running Summary operation.') # # # Determine the next time for running the summary. 
decodePath = lambda p: [np.where(l==1.0)[0] for l in p] flag_sum=0; for i in range(FLAGS.worker_hosts_num-1): score_set_print[i]=sess.run([score_set[i]])[0]; for i in range(len(rand_idx)): score_subset[i]=sess.run([score_set[rand_idx[i]]])[0]; if(score_subset[i]==-1000): flag_sum=1; break; if(flag_sum==0): vispaths = [np.array(decodePath(p)) for p in geopath_set] vis.show(vispaths, 'm') winner_idx=rand_idx[np.argmax(score_subset)]; print(str(sess.run([global_step])[0])+" Step Score: "+str(sess.run([score_set[winner_idx]])[0])); for i in rand_idx: if(i!=winner_idx): geopath_set[i]=np.copy(geopath_set[winner_idx]); geopath_set[i]=pathnet.mutation(geopath_set[i],FLAGS.L,FLAGS.M,FLAGS.N); tmp=np.zeros((FLAGS.L,FLAGS.M),dtype=float); for j in range(FLAGS.L): for k in range(FLAGS.M): if((geopath_set[i][j,k]==1.0)or(fixed_path[j,k]==1.0)): tmp[j,k]=1.0; pathnet.geopath_insert(sess,training_thread.local_network.geopath_update_placeholders_set[i],training_thread.local_network.geopath_update_ops_set[i],tmp,FLAGS.L,FLAGS.M); sess.run(score_set_ops[i],{score_set_ph[i]:-1000}) rand_idx=np.arange(FLAGS.worker_hosts_num-1) np.random.shuffle(rand_idx) rand_idx=rand_idx[:FLAGS.B] else: time.sleep(2); # fixed_path setting fixed_path=geopath_set[winner_idx] vis.set_fixed(decodePath(fixed_path), 'r' if task == 0 else 'g') vis.show(vispaths, 'm') print('fix') for i in range(FLAGS.L): for j in range(FLAGS.M): if(fixed_path[i,j]==1.0): sess.run(fixed_path_ops[i,j],{fixed_path_ph[i,j]:1}); training_thread.local_network.set_fixed_path(fixed_path); # backup fixed vars # FIXED_VARS_BACKUP = training_thread.local_network.get_fixed_vars(); # FIXED_VARS_IDX_BACKUP = training_thread.local_network.get_fixed_vars_idx(); # initialization of parameters except fixed_path vars_idx=training_thread.local_network.get_vars_idx(); for i in range(len(vars_idx)): if(vars_idx[i]==1.0): sess.run(vars_ops[i],{vars_ph[i]:vars_backup[i]}); vis.waitForButtonPress() sv.stop();
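# The evolutionary loop above is a small tournament: sample B of the worker
# paths, keep the best-scoring one, overwrite the others with a mutated copy
# of the winner, and reset their scores so they get re-evaluated.  A
# self-contained numpy sketch of that selection/mutation step; scores is
# assumed to be a numpy array of per-worker scores, and mutate() is only a
# stand-in for pathnet.mutation(), which perturbs active modules at random:
import numpy as np

def mutate(path, flip_prob=0.1):
    # stand-in for pathnet.mutation(): randomly toggle module activations
    flips = np.random.random(path.shape) < flip_prob
    return np.where(flips, 1.0 - path, path)

def tournament_step(geopath_set, scores, B):
    idx = np.random.permutation(len(geopath_set))[:B]
    winner = idx[np.argmax(scores[idx])]
    for i in idx:
        if i != winner:
            geopath_set[i] = mutate(np.copy(geopath_set[winner]))
            scores[i] = -1000.0  # mark as "needs re-evaluation"
    return winner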