def main(args):
    args.batch_size = None
    logger.debug('CONFIGURATION: {}'.format(args))

    """ Set up the graph, the agents, and run the agents in parallel. """
    if args.env == 'GYM':
        from environments import atari_environment
        num_actions = atari_environment.get_num_actions(args.game)
        input_shape = atari_environment.get_input_shape(args.game)
    else:
        num_actions = get_num_actions(args.rom_path, args.game)

    args.summ_base_dir = '/tmp/summary_logs/{}/{}'.format(
        args.game, time.strftime('%m.%d/%H.%M'))
    logger.info('logging summaries to {}'.format(args.summ_base_dir))

    Learner, Network = ALGORITHMS[args.alg_type]
    network = Network({
        'name': 'shared_vars_network',
        'input_shape': input_shape,
        'num_act': num_actions,
        'args': args
    })

    #initialize shared variables
    args.learning_vars = SharedVars(network)
    args.opt_state = SharedVars(
        network, opt_type=args.opt_type, lr=args.initial_lr) if args.opt_mode == 'shared' else None
    args.batch_opt_state = SharedVars(
        network, opt_type=args.opt_type, lr=args.initial_lr) if args.opt_mode == 'shared' else None

    #value-based methods additionally share a target network
    if args.alg_type in ['q', 'sarsa', 'dueling', 'dqn-cts']:
        args.target_vars = SharedVars(network)
        args.target_update_flags = SharedFlags(args.num_actor_learners)

    args.barrier = Barrier(args.num_actor_learners)
    args.episode_counter = SharedCounter(0)
    args.global_step = SharedCounter(0)
    args.num_actions = num_actions

    #spin up processes and block
    visualize = args.visualize
    if args.visualize == 2:
        args.visualize = 0

    actor_learners = []
    experience_queue = Queue()
    for i in xrange(args.num_actor_learners):
        #with visualize == 2, only the last actor-learner renders the game
        if (visualize == 2) and (i == args.num_actor_learners - 1):
            args.visualize = 1

        args.actor_id = i

        rng = np.random.RandomState(int(time.time()))
        args.random_seed = rng.randint(1000)

        #pass in gpu name to learner here and wrap each learner in device context
        args.queue = experience_queue  #only used by TRPO
        args.input_shape = input_shape
        actor_learners.append(Learner(args))
        actor_learners[-1].start()

    for t in actor_learners:
        t.join()

    logger.info('All training threads finished!')
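# The shared-memory helpers used above (SharedVars, SharedFlags, SharedCounter,
# Barrier) are defined elsewhere in the repo and are not shown here. Purely as a
# hedged illustration of the idea -- the real classes may differ -- a process-safe
# counter and per-actor flag array can be built on multiprocessing primitives
# roughly like this (names with a `Sketch` suffix are hypothetical, not the
# repo's API):
import ctypes
import multiprocessing as mp


class SharedCounterSketch(object):
    """Counter shared across actor-learner processes (e.g. a global step)."""

    def __init__(self, initval=0):
        self.val = mp.Value(ctypes.c_long, initval)

    def increment(self, n=1):
        with self.val.get_lock():
            self.val.value += n
            return self.val.value

    def value(self):
        with self.val.get_lock():
            return self.val.value


class SharedFlagsSketch(object):
    """One flag per actor, e.g. to signal that the target network was updated."""

    def __init__(self, num_actors):
        self.updated = mp.Array(ctypes.c_int, num_actors)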
def main(args):
    args.batch_size = None
    logger.debug('CONFIGURATION: {}'.format(args))

    """ Set up the graph, the agents, and run the agents in parallel. """
    if args.env == 'GYM':
        from environments import atari_environment
        num_actions, action_space, _ = atari_environment.get_actions(args.game)
        input_shape = atari_environment.get_input_shape(args.game)
    elif args.env == 'DOOM':
        from environments.vizdoom_env import VizDoomEnv
        env = VizDoomEnv(args.doom_cfg, args.game, args.is_train)
        num_actions, action_space = env.get_actions()
        input_shape = env.get_input_shape()
    else:
        num_actions = get_num_actions(args.rom_path, args.game)

    args.action_space = action_space
    args.summ_base_dir = '/tmp/summary_logs/{}/{}'.format(
        args.game, time.strftime('%m.%d/%H.%M'))
    logger.info('logging summaries to {}'.format(args.summ_base_dir))

    Learner, Network = ALGORITHMS[args.alg_type]
    network = Network({
        'name': 'shared_vars_network',
        'input_shape': input_shape,
        'num_act': num_actions,
        'args': args
    })
    args.network = Network

    #initialize shared variables
    args.learning_vars = SharedVars(network.params)
    args.opt_state = SharedVars(
        network.params, opt_type=args.opt_type, lr=args.initial_lr) if args.opt_mode == 'shared' else None
    args.batch_opt_state = SharedVars(
        network.params, opt_type=args.opt_type, lr=args.initial_lr) if args.opt_mode == 'shared' else None

    #TODO: need to refactor so TRPO+GAE doesn't need special treatment
    if args.alg_type in ['trpo', 'trpo-continuous']:
        if args.arch == 'FC':
            #add timestep feature
            vf_input_shape = [input_shape[0] + 1]
        else:
            vf_input_shape = input_shape

        baseline_network = PolicyValueNetwork({
            'name': 'shared_value_network',
            'input_shape': vf_input_shape,
            'num_act': num_actions,
            'args': args
        }, use_policy_head=False)
        args.baseline_vars = SharedVars(baseline_network.params)
        args.vf_input_shape = vf_input_shape

    if args.alg_type in ['q', 'sarsa', 'dueling', 'dqn-cts']:
        args.target_vars = SharedVars(network.params)
        args.target_update_flags = SharedFlags(args.num_actor_learners)

    if args.alg_type == 'dqn-cts':
        args.density_model_update_flags = SharedFlags(args.num_actor_learners)

    tf.reset_default_graph()

    args.barrier = Barrier(args.num_actor_learners)
    args.global_step = SharedCounter(0)
    args.num_actions = num_actions

    #CUDA_VISIBLE_DEVICES is a comma-separated list of device ids
    cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES')
    num_gpus = 0
    if cuda_visible_devices:
        num_gpus = len(cuda_visible_devices.split(','))

    #spin up processes and block
    visualize = args.visualize
    if args.visualize == 2:
        args.visualize = 0

    actor_learners = []
    task_queue = Queue()
    experience_queue = Queue()

    seed = args.seed or np.random.randint(2**32)
    np.random.seed(seed)
    tf.set_random_seed(seed)

    for i in xrange(args.num_actor_learners):
        #with visualize == 2, only the last actor-learner renders the game
        if (visualize == 2) and (i == args.num_actor_learners - 1):
            args.visualize = 1

        args.actor_id = i
        args.device = '/gpu:{}'.format(i % num_gpus) if num_gpus else '/cpu:0'
        args.random_seed = seed + i

        #only used by TRPO
        args.task_queue = task_queue
        args.experience_queue = experience_queue

        args.input_shape = input_shape
        actor_learners.append(Learner(args))
        actor_learners[-1].start()

    try:
        for t in actor_learners:
            t.join()
    except KeyboardInterrupt:
        #Terminate with extreme prejudice
        for t in actor_learners:
            t.terminate()

    logger.info('All training threads finished!')
    logger.info('Use seed={} to reproduce'.format(seed))
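# This version of main() reads a number of attributes off `args` (env, game,
# alg_type, num_actor_learners, opt_type, opt_mode, initial_lr, arch, visualize,
# seed, ...). The real argument parser lives elsewhere in the repo; the sketch
# below is only a guess at a compatible front-end, and every flag name in it is
# an assumption rather than the repo's actual CLI.
def parse_args_sketch(argv=None):
    """Hypothetical argparse front-end that would satisfy the main() above."""
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('game', help='gym id or ROM name of the game to train on')
    parser.add_argument('--env', default='GYM')
    parser.add_argument('--alg_type', default='a3c')
    parser.add_argument('--num_actor_learners', type=int, default=8)
    parser.add_argument('--opt_type', default='rmsprop')
    parser.add_argument('--opt_mode', default='shared')
    parser.add_argument('--initial_lr', type=float, default=7e-4)
    parser.add_argument('--arch', default='NIPS')
    parser.add_argument('--visualize', type=int, default=0)
    parser.add_argument('--seed', type=int, default=None)
    return parser.parse_args(argv)

# Usage (hypothetical): main(parse_args_sketch())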
def main(args):
    args.batch_size = None
    logger.debug('CONFIGURATION: {}'.format(args))

    """ Set up the graph, the agents, and run the agents in parallel. """
    if args.env == 'GYM':
        from environments import atari_environment
        num_actions = atari_environment.get_num_actions(args.game)
        input_shape = atari_environment.get_input_shape(args.game)
    else:
        num_actions = get_num_actions(args.rom_path, args.game)

    args.summ_base_dir = '/tmp/summary_logs/{}/{}'.format(
        args.game, time.strftime('%m.%d/%H.%M'))
    logger.info('logging summaries to {}'.format(args.summ_base_dir))

    #map each algorithm name to its (actor-learner, network) pair
    algorithms = {
        'q': (NStepQLearner, QNetwork),
        'sarsa': (OneStepSARSALearner, QNetwork),
        'dueling': (DuelingLearner, DuelingNetwork),
        'a3c': (A3CLearner, PolicyValueNetwork),
        'a3c-lstm': (A3CLSTMLearner, PolicyValueNetwork),
        'a3c-sequence-decoder': (ActionSequenceA3CLearner, SequencePolicyVNetwork),
        'pgq': (PGQLearner, PolicyValueNetwork),
        'pgq-lstm': (PGQLSTMLearner, PolicyValueNetwork),
        'trpo': (TRPOLearner, PolicyNetwork),
        'cem': (CEMLearner, PolicyNetwork),
        'q-cts': (PseudoCountQLearner, QNetwork),
        'a3c-cts': (PseudoCountA3CLearner, PolicyValueNetwork),
        'a3c-repeat': (ARA3CLearner, PolicyRepeatNetwork),
    }
    assert args.alg_type in algorithms, 'alg_type `{}` not implemented'.format(args.alg_type)
    Learner, Network = algorithms[args.alg_type]

    network = Network({
        'name': 'shared_vars_network',
        'input_shape': input_shape,
        'num_act': num_actions,
        'args': args
    })

    args.learning_vars = SharedVars(num_actions, args.alg_type, network)
    args.opt_state = SharedVars(
        num_actions, args.alg_type, network,
        opt_type=args.opt_type, lr=args.initial_lr) if args.opt_mode == 'shared' else None
    args.batch_opt_state = SharedVars(
        num_actions, args.alg_type, network,
        opt_type=args.opt_type, lr=args.initial_lr) if args.opt_mode == 'shared' else None

    if args.alg_type in ['q', 'sarsa', 'dueling', 'q-cts']:
        args.target_vars = SharedVars(num_actions, args.alg_type, network)
        args.target_update_flags = SharedFlags(args.num_actor_learners)

    args.barrier = Barrier(args.num_actor_learners)
    args.episode_counter = SharedCounter(0)
    args.global_step = SharedCounter(0)
    args.num_actions = num_actions

    visualize = args.visualize
    if args.visualize == 2:
        args.visualize = 0

    actor_learners = []
    experience_queue = Queue()
    for i in xrange(args.num_actor_learners):
        #with visualize == 2, only the last actor-learner renders the game
        if (visualize == 2) and (i == args.num_actor_learners - 1):
            args.visualize = 1

        args.actor_id = i

        rng = np.random.RandomState(int(time.time()))
        args.random_seed = rng.randint(1000)

        #pass in gpu name to learner here and wrap each learner in device context
        args.queue = experience_queue  #only used by TRPO
        args.input_shape = input_shape
        actor_learners.append(Learner(args))
        actor_learners[-1].start()

    for t in actor_learners:
        t.join()

    logger.info('All training threads finished!')
def main(args):
    args.batch_size = None
    logger.debug('CONFIGURATION: {}'.format(args))

    """ Set up the graph, the agents, and run the agents in parallel. """
    if args.env == 'GYM':
        from environments import atari_environment
        num_actions, action_space, _ = atari_environment.get_actions(args.game)
        input_shape = atari_environment.get_input_shape(args.game)
    else:
        num_actions = get_num_actions(args.rom_path, args.game)

    args.action_space = action_space
    args.summ_base_dir = '/tmp/summary_logs/{}/{}'.format(
        args.game, time.strftime('%m.%d/%H.%M'))
    logger.info('logging summaries to {}'.format(args.summ_base_dir))

    Learner, Network = ALGORITHMS[args.alg_type]
    #print("Learner is: {}".format(Learner))

    if args.alg_type != 'AE':
        network = Network({
            'name': 'shared_vars_network',
            'input_shape': input_shape,
            'num_act': num_actions,
            'args': args
        })
        args.network = Network
    else:
        #AE uses two networks: a lower-level and an upper-level one
        network_lower = Network({
            'name': 'shared_vars_network_lower',
            'input_shape': input_shape,
            'num_act': num_actions,
            'args': args
        })
        args.network_lower = Network

        network_upper = Network({
            'name': 'shared_vars_network_upper',
            'input_shape': input_shape,
            'num_act': num_actions,
            'args': args
        })
        args.network_upper = Network

    ## initialize visdom server
    args.visdom = visdom.Visdom(port=args.display_port, env='AE DQN')

    #initialize shared variables
    #TODO: only the network_lower params are being used; check whether the upper ones are also needed
    if args.alg_type != 'AE':
        args.learning_vars = SharedVars(network.params)  #size, step and optimizer
        args.opt_state = SharedVars(
            network.params, opt_type=args.opt_type, lr=args.initial_lr
        ) if args.opt_mode == 'shared' else None
        args.batch_opt_state = SharedVars(
            network.params, opt_type=args.opt_type, lr=args.initial_lr
        ) if args.opt_mode == 'shared' else None
    else:
        #args.learning_vars = SharedVars(network_lower.params)  #size, step and optimizer
        args.learning_vars_lower = SharedVars(network_lower.params)  #size, step and optimizer
        args.learning_vars_upper = SharedVars(network_upper.params)  #size, step and optimizer
        args.opt_state_lower = SharedVars(
            network_lower.params, opt_type=args.opt_type, lr=args.initial_lr
        )
        args.opt_state_upper = SharedVars(
            network_upper.params, opt_type=args.opt_type, lr=args.initial_lr
        ) if args.opt_mode == 'shared' else None
        args.batch_opt_state_lower = SharedVars(
            network_lower.params, opt_type=args.opt_type, lr=args.initial_lr
        )
        args.batch_opt_state_upper = SharedVars(
            network_upper.params, opt_type=args.opt_type, lr=args.initial_lr
        ) if args.opt_mode == 'shared' else None

    #TODO: need to refactor so TRPO+GAE doesn't need special treatment
    if args.alg_type in ['trpo', 'trpo-continuous']:
        if args.arch == 'FC':
            #add timestep feature
            vf_input_shape = [input_shape[0] + 1]
        else:
            vf_input_shape = input_shape

        baseline_network = PolicyValueNetwork({
            'name': 'shared_value_network',
            'input_shape': vf_input_shape,
            'num_act': num_actions,
            'args': args
        }, use_policy_head=False)
        args.baseline_vars = SharedVars(baseline_network.params)
        args.vf_input_shape = vf_input_shape

    if args.alg_type in ['q', 'sarsa', 'dueling', 'dqn-cts']:
        args.target_vars = SharedVars(network.params)
        args.target_update_flags = SharedFlags(args.num_actor_learners)

    if args.alg_type in ['dqn-cts', 'a3c-cts', 'a3c-lstm-cts']:
        #TODO: check density_model_update_flags
        args.density_model_update_flags = SharedFlags(args.num_actor_learners)

    if args.alg_type in ['AE']:
        #print("we are in main args.alg_type in [AE]")
        args.target_vars_lower = SharedVars(network_lower.params)
        args.target_vars_upper = SharedVars(network_upper.params)
        args.target_update_flags = SharedFlags(args.num_actor_learners)
        args.density_model_update_flags = SharedFlags(args.num_actor_learners)

    tf.reset_default_graph()

    args.barrier = Barrier(args.num_actor_learners)
    args.global_step = SharedCounter(0)
    #args.shared_visualizer = Visualizer(args.num_actor_learners)  ## TODO: make it shared between the processes
    args.num_actions = num_actions

    #CUDA_VISIBLE_DEVICES is a comma-separated list of device ids
    cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES')
    num_gpus = 0
    if cuda_visible_devices:
        num_gpus = len(cuda_visible_devices.split(','))

    #spin up processes and block
    # if (args.visualize == 2): args.visualize = 0
    actor_learners = []
    task_queue = Queue()
    experience_queue = Queue()

    seed = args.seed or np.random.randint(2**32)
    np.random.seed(seed)
    tf.set_random_seed(seed)

    visualize = args.visualize
    for i in range(args.num_actor_learners):
        #with visualize == 2, only the last actor-learner renders the game
        if (visualize == 2) and (i == args.num_actor_learners - 1):
            args.visualize = 1
        else:
            args.visualize = 0

        args.actor_id = i
        args.device = '/gpu:{}'.format(i % num_gpus) if num_gpus else '/cpu:0'
        args.random_seed = seed + i

        #only used by TRPO
        args.task_queue = task_queue
        args.experience_queue = experience_queue

        args.input_shape = input_shape
        actor_learners.append(Learner(args))
        actor_learners[-1].start()

        if i == 1:
            setup_kill_signal_handler(actor_learners[-1])

    try:
        for t in actor_learners:
            #dump each learner's plot data to its own csv file
            file_name = 'myfile_' + str(t)
            with open(file_name, 'w') as csv_file:
                wr = csv.writer(csv_file, quoting=csv.QUOTE_ALL)
                wr.writerow(t.vis.plot_data['X'])
                wr.writerow(t.vis.plot_data['Y'])
            print('[%s]' % ', '.join(map(str, t.vis.plot_data['X'])))
            t.join()
    except KeyboardInterrupt:
        #Terminate with extreme prejudice
        for t in actor_learners:
            t.terminate()

    logger.info('All training threads finished!')
    logger.info('Use seed={} to reproduce'.format(seed))
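# setup_kill_signal_handler() is referenced in the loop above but not defined in
# this file. A minimal sketch of what it might do -- assuming the intent is to
# forward termination signals to a learner process so Ctrl+C / kill shuts
# training down cleanly -- is given below; the real implementation may differ.
import signal


def setup_kill_signal_handler_sketch(learner_process):
    """Hypothetical: terminate the given learner process on SIGINT/SIGTERM."""

    def _handler(signum, frame):
        logger.info('Received signal {}, terminating learner'.format(signum))
        learner_process.terminate()

    signal.signal(signal.SIGINT, _handler)
    signal.signal(signal.SIGTERM, _handler)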