Example 1
def main():
    # Relies on the module-level imports used below (argparse, random,
    # numpy as np, tensorflow as tf -- TensorFlow 1.x API) and on the
    # project's own helpers and constants: GazeboWorld, ReplayMemory,
    # DQNAgent, create_model, create_duel_q_network, get_fixed_samples,
    # REPLAYMEMORY_SIZE, TARGET_UPDATE_FREQENCY, NUM_FIXED_SAMPLES and
    # NUM_BURN_IN.
    parser = argparse.ArgumentParser(
        description='Train using Gazebo Simulations')
    parser.add_argument('--seed', default=10, type=int, help='Random seed')
    parser.add_argument('--input_shape',
                        default=(80, 100),
                        nargs=2,
                        type=int,
                        help='Input shape')
    parser.add_argument('--gamma',
                        default=0.99,
                        type=float,
                        help='Discount factor')
    parser.add_argument('--epsilon',
                        default=0.1,
                        type=float,
                        help='Exploration probability in epsilon-greedy')
    parser.add_argument('--learning_rate',
                        default=0.00001,
                        type=float,
                        help='Learning rate')
    parser.add_argument('--window_size',
                        default=4,
                        type=int,
                        help='Number of frames to feed to the Q-network')
    parser.add_argument('--num_time',
                        default=4,
                        type=int,
                        help='Number of steps in RNN')
    parser.add_argument('--num_actions',
                        default=7,
                        type=int,
                        help='Number of actions')
    parser.add_argument('--batch_size',
                        default=64,
                        type=int,
                        help='Batch size of the training part')
    parser.add_argument('--num_iteration',
                        default=500000,
                        type=int,
                        help='Number of iterations to train')
    parser.add_argument(
        '--eval_every',
        default=0.01,
        type=float,
        help='What fraction of num_iteration to run between evaluations')

    args = parser.parse_args()
    random.seed(args.seed)
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    batch_environment = GazeboWorld()
    print('Environment initialized')

    replay_memory = ReplayMemory(REPLAYMEMORY_SIZE, args.window_size,
                                 args.input_shape)
    online_model, online_params = create_model(args.window_size,
                                               args.input_shape,
                                               args.num_actions,
                                               'online_model',
                                               create_duel_q_network,
                                               trainable=True)
    target_model, target_params = create_model(args.window_size,
                                               args.input_shape,
                                               args.num_actions,
                                               'target_model',
                                               create_duel_q_network,
                                               trainable=False)
    update_target_params_ops = [
        t.assign(s) for s, t in zip(online_params, target_params)
    ]

    agent = DQNAgent(online_model, target_model, replay_memory,
                     args.num_actions, args.gamma, TARGET_UPDATE_FREQENCY,
                     update_target_params_ops, args.batch_size,
                     args.learning_rate)

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.8)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    with sess.as_default():
        # saving and loading networks
        trainables = tf.trainable_variables()
        trainable_saver = tf.train.Saver(trainables, max_to_keep=1)
        sess.run(tf.global_variables_initializer())
        checkpoint = tf.train.get_checkpoint_state("saved_networks")
        print('checkpoint:', checkpoint)
        if checkpoint and checkpoint.model_checkpoint_path:
            trainable_saver.restore(sess, checkpoint.model_checkpoint_path)
            print("Successfully loaded:", checkpoint.model_checkpoint_path)
        else:
            print("Could not find old network weights")
        # make target_model equal to online_model
        sess.run(update_target_params_ops)

        print('Prepare fixed samples for mean max Q.')
        fixed_samples = get_fixed_samples(batch_environment, args.num_actions,
                                          NUM_FIXED_SAMPLES)

        # initialize replay buffer
        print('Burn in replay_memory.')
        agent.fit(sess, batch_environment, NUM_BURN_IN, do_train=False)

        # start training:
        # With the defaults this is 500000 * 0.01 = 5000 iterations of
        # training between consecutive evaluations/checkpoints.
        fit_iteration = int(args.num_iteration * args.eval_every)
        for i in range(0, args.num_iteration, fit_iteration):
            # evaluate:
            reward_mean, reward_var, reward_max, reward_min, _ = agent.evaluate(
                sess, batch_environment)
            mean_max_Q1, mean_max_Q2 = agent.get_mean_max_Q(
                sess, fixed_samples)
            print("%d, %f, %f, %f, %f, %f, %f" %
                  (i, mean_max_Q1, mean_max_Q2, reward_mean, reward_var,
                   reward_max, reward_min))
            # train:
            agent.fit(sess, batch_environment, fit_iteration, do_train=True)
            trainable_saver.save(sess, 'saved_networks/', global_step=i)

        reward_mean, reward_var, reward_max, reward_min, _ = agent.evaluate(
            sess, batch_environment)
        mean_max_Q1, mean_max_Q2 = agent.get_mean_max_Q(sess, fixed_samples)
        print("%d, %f, %f, %f, %f, %f, %f" %
              (i, mean_max_Q1, mean_max_Q2, reward_mean, reward_var,
               reward_max, reward_min))
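
Both examples sync the frozen target network by running update_target_params_ops, a list of assign ops that copy each online variable onto its target twin. Below is a minimal self-contained sketch of that hard-update pattern, assuming the same TensorFlow 1.x API the examples use; the one-variable "networks" and names here are illustrative stand-ins, not the project's models.

import tensorflow as tf  # TensorFlow 1.x API, as in the examples above

with tf.variable_scope('online_model'):
    w_online = tf.get_variable('w',
                               initializer=tf.constant([1.0, 2.0]),
                               trainable=True)
with tf.variable_scope('target_model'):
    w_target = tf.get_variable('w',
                               initializer=tf.constant([0.0, 0.0]),
                               trainable=False)

# Hard update: overwrite each target variable with its online counterpart.
update_target_params_ops = [
    t.assign(s) for s, t in zip([w_online], [w_target])
]

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(w_target))            # [0. 0.]
    sess.run(update_target_params_ops)   # copy online -> target
    print(sess.run(w_target))            # [1. 2.]

Zipping online_params against target_params works in the full examples presumably because create_model returns both variable lists built in the same order.
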
Example 2
def main():
    # Same pattern as Example 1, with gym in addition to argparse, random,
    # numpy as np and tensorflow as tf (TensorFlow 1.x API); the helpers
    # (BatchEnvironment, PriorityExperienceReplay, ReplayMemory, DQNAgent,
    # the create_* factories, get_fixed_samples) and the upper-case
    # constants come from the surrounding module.
    parser = argparse.ArgumentParser(
        description='Run DQN on Atari Space Invaders')
    parser.add_argument('--env',
                        default='SpaceInvaders-v0',
                        help='Atari env name')
    parser.add_argument('--seed', default=10703, type=int, help='Random seed')
    parser.add_argument('--input_shape',
                        default=(84, 84),
                        nargs=2,
                        type=int,
                        help='Input shape')
    parser.add_argument('--gamma',
                        default=0.99,
                        type=float,
                        help='Discount factor')
    parser.add_argument('--epsilon',
                        default=0.1,
                        type=float,
                        help='Exploration probability in epsilon-greedy')
    parser.add_argument('--learning_rate',
                        default=0.00025,
                        type=float,
                        help='Training learning rate.')
    parser.add_argument('--window_size',
                        default=4,
                        type=int,
                        help='Number of frames to feed to the Q-network')
    parser.add_argument('--batch_size',
                        default=32,
                        type=int,
                        help='Batch size of the training part')
    parser.add_argument('--num_process',
                        default=3,
                        type=int,
                        help='Number of parallel environments')
    parser.add_argument('--num_iteration',
                        default=20000000,
                        type=int,
                        help='Number of iterations to train')
    parser.add_argument(
        '--eval_every',
        default=0.001,
        type=float,
        help='What fraction of num_iteration to run between evaluations.')
    parser.add_argument('--is_duel',
                        default=1,
                        type=int,
                        help='Whether use duel DQN, 0 means no, 1 means yes.')
    parser.add_argument(
        '--is_double',
        default=1,
        type=int,
        help='Whether use double DQN, 0 means no, 1 means yes.')
    parser.add_argument(
        '--is_per',
        default=1,
        type=int,
        help='Whether use PriorityExperienceReplay, 0 means no, 1 means yes.')
    parser.add_argument(
        '--is_distributional',
        default=1,
        type=int,
        help='Whether use distributional DQN, 0 means no, 1 means yes.')
    parser.add_argument('--num_step',
                        default=1,
                        type=int,
                        help='Number of steps for multi-step DQN; 3 is recommended')
    parser.add_argument('--is_noisy',
                        default=1,
                        type=int,
                        help='Whether use NoisyNet, 0 means no, 1 means yes.')

    args = parser.parse_args()
    args.input_shape = tuple(args.input_shape)
    print('Environment: %s.' % (args.env, ))
    env = gym.make(args.env)
    num_actions = env.action_space.n
    print('number_actions: %d.' % (num_actions, ))
    env.close()

    random.seed(args.seed)
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    batch_environment = BatchEnvironment(args.env, args.num_process,
                                         args.window_size, args.input_shape,
                                         NUM_FRAME_PER_ACTION,
                                         MAX_EPISODE_LENGTH)

    if args.is_per == 1:
        replay_memory = PriorityExperienceReplay(REPLAYMEMORY_SIZE,
                                                 args.window_size,
                                                 args.input_shape)
    else:
        replay_memory = ReplayMemory(REPLAYMEMORY_SIZE, args.window_size,
                                     args.input_shape)

    create_network_fn = (create_deep_q_network
                         if args.is_duel == 0 else create_duel_q_network)
    create_model_fn = (create_model if args.is_distributional == 0 else
                       create_distributional_model)
    noisy = args.is_noisy == 1
    online_model, online_params = create_model_fn(args.window_size,
                                                  args.input_shape,
                                                  num_actions,
                                                  'online_model',
                                                  create_network_fn,
                                                  trainable=True,
                                                  noisy=noisy)
    target_model, target_params = create_model_fn(args.window_size,
                                                  args.input_shape,
                                                  num_actions,
                                                  'target_model',
                                                  create_network_fn,
                                                  trainable=False,
                                                  noisy=noisy)
    update_target_params_ops = [
        t.assign(s) for s, t in zip(online_params, target_params)
    ]

    agent = DQNAgent(online_model, target_model, replay_memory, num_actions,
                     args.gamma, UPDATE_FREQUENCY, TARGET_UPDATE_FREQENCY,
                     update_target_params_ops, args.batch_size, args.is_double,
                     args.is_per, args.is_distributional, args.num_step,
                     args.is_noisy, args.learning_rate, RMSP_DECAY,
                     RMSP_MOMENTUM, RMSP_EPSILON)

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.4)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    with sess.as_default():
        sess.run(tf.global_variables_initializer())
        # make target_model equal to online_model
        sess.run(update_target_params_ops)

        print('Prepare fixed samples for mean max Q.')
        fixed_samples = get_fixed_samples(batch_environment, num_actions,
                                          NUM_FIXED_SAMPLES)

        print('Burn in replay_memory.')
        agent.fit(sess, batch_environment, NUM_BURN_IN, do_train=False)

        # Begin to train:
        # With the defaults this is 20000000 * 0.001 = 20000 iterations of
        # training between consecutive evaluations.
        fit_iteration = int(args.num_iteration * args.eval_every)

        for i in range(0, args.num_iteration, fit_iteration):
            # Evaluate:
            reward_mean, reward_var = agent.evaluate(sess, batch_environment,
                                                     NUM_EVALUATE_EPSIODE)
            mean_max_Q = agent.get_mean_max_Q(sess, fixed_samples)
            print("%d, %f, %f, %f" % (i, mean_max_Q, reward_mean, reward_var))
            # Train:
            agent.fit(sess, batch_environment, fit_iteration, do_train=True)

    batch_environment.close()
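
A note on the --is_duel, --is_double, --is_per, --is_distributional and --is_noisy switches: they are declared as type=int rather than type=bool, likely because argparse applies type to the raw command-line string and bool('0') is True, so a type=bool flag could never be switched off from the shell. A small demonstration of the pitfall; the flag names here are illustrative only.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--bool_flag', default=True, type=bool)  # the pitfall
parser.add_argument('--int_flag', default=1, type=int)       # pattern used above

args = parser.parse_args(['--bool_flag', '0', '--int_flag', '0'])
print(args.bool_flag)  # True  -- bool('0') is truthy; the flag cannot be disabled
print(args.int_flag)   # 0     -- int('0') parses as expected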