import argparse
import random

import numpy as np
import tensorflow as tf

# GazeboWorld, ReplayMemory, DQNAgent, create_model, create_duel_q_network,
# get_fixed_samples and the constants REPLAYMEMORY_SIZE, TARGET_UPDATE_FREQENCY,
# NUM_FIXED_SAMPLES and NUM_BURN_IN are assumed to be defined elsewhere in this project.


def main():
    parser = argparse.ArgumentParser(description='Train using Gazebo Simulations')
    parser.add_argument('--seed', default=10, type=int, help='Random seed')
    parser.add_argument('--input_shape', default=(80, 100), help='Input shape')
    parser.add_argument('--gamma', default=0.99, type=float, help='Discount factor')
    parser.add_argument('--epsilon', default=0.1, type=float,
                        help='Exploration probability in epsilon-greedy')
    parser.add_argument('--learning_rate', default=0.00001, type=float, help='Learning rate')
    parser.add_argument('--window_size', default=4, type=int,
                        help='Number of frames to feed to the Q-network')
    parser.add_argument('--num_time', default=4, type=int, help='Number of steps in RNN')
    parser.add_argument('--num_actions', default=7, type=int, help='Number of actions')
    parser.add_argument('--batch_size', default=64, type=int,
                        help='Batch size of the training part')
    parser.add_argument('--num_iteration', default=500000, type=int,
                        help='Number of iterations to train')
    parser.add_argument('--eval_every', default=0.01, type=float,
                        help='What fraction of num_iteration to run between evaluations')
    args = parser.parse_args()

    # Seed every source of randomness for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    batch_environment = GazeboWorld()
    print('Environment initialized')

    replay_memory = ReplayMemory(REPLAYMEMORY_SIZE, args.window_size, args.input_shape)

    # Build the online (trainable) and target (frozen) dueling Q-networks.
    online_model, online_params = create_model(
        args.window_size, args.input_shape, args.num_actions,
        'online_model', create_duel_q_network, trainable=True)
    target_model, target_params = create_model(
        args.window_size, args.input_shape, args.num_actions,
        'target_model', create_duel_q_network, trainable=False)
    # Ops that copy the online weights into the target network.
    update_target_params_ops = [t.assign(s) for s, t in zip(online_params, target_params)]

    agent = DQNAgent(online_model, target_model, replay_memory, args.num_actions,
                     args.gamma, TARGET_UPDATE_FREQENCY, update_target_params_ops,
                     args.batch_size, args.learning_rate)

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.8)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    with sess.as_default():
        # Saving and loading networks.
        trainables = tf.trainable_variables()
        trainable_saver = tf.train.Saver(trainables, max_to_keep=1)
        sess.run(tf.global_variables_initializer())
        checkpoint = tf.train.get_checkpoint_state("saved_networks")
        print('checkpoint:', checkpoint)
        if checkpoint and checkpoint.model_checkpoint_path:
            trainable_saver.restore(sess, checkpoint.model_checkpoint_path)
            print("Successfully loaded:", checkpoint.model_checkpoint_path)
        else:
            print("Could not find old network weights")

        # Make target_model equal to online_model.
        sess.run(update_target_params_ops)

        print('Prepare fixed samples for mean max Q.')
        fixed_samples = get_fixed_samples(batch_environment, args.num_actions,
                                          NUM_FIXED_SAMPLES)

        # Initialize the replay buffer with random experience.
        print('Burn in replay_memory.')
        agent.fit(sess, batch_environment, NUM_BURN_IN, do_train=False)

        # Start training, evaluating every fit_iteration steps.
        fit_iteration = int(args.num_iteration * args.eval_every)
        for i in range(0, args.num_iteration, fit_iteration):
            # Evaluate:
            reward_mean, reward_var, reward_max, reward_min, reward = agent.evaluate(
                sess, batch_environment)
            mean_max_Q1, mean_max_Q2 = agent.get_mean_max_Q(sess, fixed_samples)
            print("%d, %f, %f, %f, %f, %f, %f" % (i, mean_max_Q1, mean_max_Q2,
                                                  reward_mean, reward_var,
                                                  reward_max, reward_min))
            # Train:
            agent.fit(sess, batch_environment, fit_iteration, do_train=True)
            trainable_saver.save(sess, 'saved_networks/', global_step=i)

        # Final evaluation after training.
        reward_mean, reward_var, reward_max, reward_min, reward = agent.evaluate(
            sess, batch_environment)
        mean_max_Q1, mean_max_Q2 = agent.get_mean_max_Q(sess, fixed_samples)
        print("%d, %f, %f, %f, %f, %f, %f" % (i, mean_max_Q1, mean_max_Q2,
                                              reward_mean, reward_var,
                                              reward_max, reward_min))
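# A minimal entry point and example invocation for the Gazebo training script above.
# The __main__ guard is not shown in the original, and the filename train_gazebo.py is
# only an assumption for illustration:
#   python train_gazebo.py --seed 10 --batch_size 64 --num_iteration 500000
if __name__ == '__main__':
    main()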
import argparse
import random

import gym
import numpy as np
import tensorflow as tf

# BatchEnvironment, ReplayMemory, PriorityExperienceReplay, DQNAgent, the
# create_*_network / create_*_model builders, get_fixed_samples and the constants
# (REPLAYMEMORY_SIZE, NUM_FRAME_PER_ACTION, MAX_EPISODE_LENGTH, UPDATE_FREQUENCY,
# TARGET_UPDATE_FREQENCY, RMSP_DECAY, RMSP_MOMENTUM, RMSP_EPSILON, NUM_FIXED_SAMPLES,
# NUM_BURN_IN, NUM_EVALUATE_EPSIODE) are assumed to be defined elsewhere in this project.


def main():
    parser = argparse.ArgumentParser(description='Run DQN on Atari Space Invaders')
    parser.add_argument('--env', default='SpaceInvaders-v0', help='Atari env name')
    parser.add_argument('--seed', default=10703, type=int, help='Random seed')
    parser.add_argument('--input_shape', default=(84, 84), help='Input shape')
    parser.add_argument('--gamma', default=0.99, type=float, help='Discount factor')
    parser.add_argument('--epsilon', default=0.1, type=float,
                        help='Exploration probability in epsilon-greedy')
    parser.add_argument('--learning_rate', default=0.00025, type=float,
                        help='Training learning rate.')
    parser.add_argument('--window_size', default=4, type=int,
                        help='Number of frames to feed to the Q-network')
    parser.add_argument('--batch_size', default=32, type=int,
                        help='Batch size of the training part')
    parser.add_argument('--num_process', default=3, type=int,
                        help='Number of parallel environments')
    parser.add_argument('--num_iteration', default=20000000, type=int,
                        help='Number of iterations to train')
    parser.add_argument('--eval_every', default=0.001, type=float,
                        help='What fraction of num_iteration to run between evaluations.')
    parser.add_argument('--is_duel', default=1, type=int,
                        help='Whether to use dueling DQN, 0 means no, 1 means yes.')
    parser.add_argument('--is_double', default=1, type=int,
                        help='Whether to use double DQN, 0 means no, 1 means yes.')
    parser.add_argument('--is_per', default=1, type=int,
                        help='Whether to use PriorityExperienceReplay, 0 means no, 1 means yes.')
    parser.add_argument('--is_distributional', default=1, type=int,
                        help='Whether to use distributional DQN, 0 means no, 1 means yes.')
    parser.add_argument('--num_step', default=1, type=int,
                        help='Number of steps for multi-step DQN, 3 is recommended')
    parser.add_argument('--is_noisy', default=1, type=int,
                        help='Whether to use NoisyNet, 0 means no, 1 means yes.')
    args = parser.parse_args()
    args.input_shape = tuple(args.input_shape)

    print('Environment: %s.' % (args.env, ))
    env = gym.make(args.env)
    num_actions = env.action_space.n
    print('number_actions: %d.' % (num_actions, ))
    env.close()

    # Seed every source of randomness for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    batch_environment = BatchEnvironment(args.env, args.num_process, args.window_size,
                                         args.input_shape, NUM_FRAME_PER_ACTION,
                                         MAX_EPISODE_LENGTH)

    # Choose prioritized or uniform experience replay.
    if args.is_per == 1:
        replay_memory = PriorityExperienceReplay(REPLAYMEMORY_SIZE, args.window_size,
                                                 args.input_shape)
    else:
        replay_memory = ReplayMemory(REPLAYMEMORY_SIZE, args.window_size,
                                     args.input_shape)

    # Pick the network and model builders according to the chosen DQN variants.
    create_network_fn = create_deep_q_network if args.is_duel == 0 else create_duel_q_network
    create_model_fn = create_model if args.is_distributional == 0 else create_distributional_model
    noisy = True if args.is_noisy == 1 else False

    online_model, online_params = create_model_fn(
        args.window_size, args.input_shape, num_actions, 'online_model',
        create_network_fn, trainable=True, noisy=noisy)
    target_model, target_params = create_model_fn(
        args.window_size, args.input_shape, num_actions, 'target_model',
        create_network_fn, trainable=False, noisy=noisy)
    # Ops that copy the online weights into the target network.
    update_target_params_ops = [t.assign(s) for s, t in zip(online_params, target_params)]

    agent = DQNAgent(online_model, target_model, replay_memory, num_actions, args.gamma,
                     UPDATE_FREQUENCY, TARGET_UPDATE_FREQENCY, update_target_params_ops,
                     args.batch_size, args.is_double, args.is_per, args.is_distributional,
                     args.num_step, args.is_noisy, args.learning_rate,
                     RMSP_DECAY, RMSP_MOMENTUM, RMSP_EPSILON)

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.4)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    with sess.as_default():
        sess.run(tf.global_variables_initializer())

        # Make target_model equal to online_model.
        sess.run(update_target_params_ops)

        print('Prepare fixed samples for mean max Q.')
        fixed_samples = get_fixed_samples(batch_environment, num_actions,
                                          NUM_FIXED_SAMPLES)

        print('Burn in replay_memory.')
        agent.fit(sess, batch_environment, NUM_BURN_IN, do_train=False)

        # Begin to train, evaluating every fit_iteration steps.
        fit_iteration = int(args.num_iteration * args.eval_every)
        for i in range(0, args.num_iteration, fit_iteration):
            # Evaluate:
            reward_mean, reward_var = agent.evaluate(sess, batch_environment,
                                                     NUM_EVALUATE_EPSIODE)
            mean_max_Q = agent.get_mean_max_Q(sess, fixed_samples)
            print("%d, %f, %f, %f" % (i, mean_max_Q, reward_mean, reward_var))
            # Train:
            agent.fit(sess, batch_environment, fit_iteration, do_train=True)

    batch_environment.close()
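# A minimal entry point and example invocation for the Atari DQN training script above.
# The __main__ guard is not shown in the original, and the filename train_atari.py is
# only an assumption for illustration; the flags simply toggle the Rainbow-style
# components exposed by the parser:
#   python train_atari.py --env SpaceInvaders-v0 --is_duel 1 --is_double 1 --is_per 1 \
#       --is_distributional 1 --num_step 3 --is_noisy 1
if __name__ == '__main__':
    main()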