Example #1
def main():

    print("Creating model...")
    model = create_model()
    model.summary()

    print("Creating environment...")
    environment = gym.make("CartPole-v0")
    environment._max_episode_steps = 500

    print("Creating agent...")
    if agent_type == "dqn":
        agent = DQNAgent(name="cartpole-dqn",
                         model=model,
                         environment=environment,
                         observation_frames=1,
                         observation_transformation=observation_transformation,
                         reward_transformation=reward_transformation,
                         gamma=0.95,
                         final_epsilon=0.01,
                         initial_epsilon=1.0,
                         number_of_iterations=1000000,
                         replay_memory_size=2000,
                         minibatch_size=32)
    elif agent_type == "ddqn":
        agent = DDQNAgent(
            name="cartpole-ddqn",
            model=model,
            environment=environment,
            observation_frames=1,
            observation_transformation=observation_transformation,
            reward_transformation=reward_transformation,
            gamma=0.95,
            final_epsilon=0.01,
            initial_epsilon=1.0,
            number_of_iterations=1000000,
            replay_memory_size=2000,
            minibatch_size=32,
            model_copy_interval=100)
    agent.enable_rewards_tracking(rewards_running_means_length=10000)
    agent.enable_episodes_tracking(episodes_running_means_length=10000)
    agent.enable_maxq_tracking(maxq_running_means_length=10000)
    agent.enable_model_saving(model_save_frequency=100000)
    agent.enable_tensorboard_for_tracking()

    print("Training ...")
    agent.fit(verbose=True, headless="render" not in sys.argv)
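
# Example #1 assumes several pieces that are not shown: the `gym` and `sys`
# imports, the `agent_type` string, `create_model()`, and the observation/reward
# transformation callables expected by this DQNAgent implementation (for CartPole
# they could simply be identity functions). As a rough, hypothetical sketch, a
# CartPole Q-network builder using tf.keras might look like this; the exact
# interface the agent expects may differ.
import tensorflow as tf


def create_model(state_size=4, number_of_actions=2):
    """Small MLP mapping the 4-dimensional CartPole state to one Q-value per action."""
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(24, activation="relu", input_shape=(state_size,)),
        tf.keras.layers.Dense(24, activation="relu"),
        tf.keras.layers.Dense(number_of_actions, activation="linear"),
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), loss="mse")
    return model
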
Example #2
def main():

    print("Creating environment...")
    environment = gym_tetris.make('Tetris-v0')

    print("Creating model...")
    model = modelutils.create_model(number_of_actions)
    model.summary()

    print("Creating agent...")
    if agent_type == "dqn":
        agent = DQNAgent(
            name="tetris-dqn",
            environment=environment,
            model=model,
            observation_transformation=utils.resize_and_bgr2gray,
            observation_frames=4,
            number_of_iterations=1000000,
            gamma=0.95,
            final_epsilon=0.01,
            initial_epsilon=1.0,
            replay_memory_size=2000,
            minibatch_size=32
        )
    elif agent_type == "ddqn":
        agent = DDQNAgent(
            name="tetris-ddqn",
            environment=environment,
            model=model,
            observation_transformation=utils.resize_and_bgr2gray,
            observation_frames=4,
            number_of_iterations=1000000,
            gamma=0.95,
            final_epsilon=0.01,
            initial_epsilon=1.0,
            replay_memory_size=2000,
            minibatch_size=32,
            model_copy_interval=100
        )
    agent.enable_rewards_tracking(rewards_running_means_length=10000)
    agent.enable_episodes_tracking(episodes_running_means_length=100)
    agent.enable_maxq_tracking(maxq_running_means_length=10000)
    agent.enable_model_saving(model_save_frequency=10000)
    agent.enable_plots_saving(plots_save_frequency=10000)

    print("Training ...")
    agent.fit(verbose=True, headless="headless" in sys.argv, render_states=True)
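
# Example #2 assumes `gym_tetris`, `number_of_actions`, and two project helpers:
# modelutils.create_model and utils.resize_and_bgr2gray. A minimal sketch of the
# latter (frame preprocessing), assuming OpenCV and a BGR screen buffer as input;
# the 84x84 output size is an assumption, not taken from the project.
import cv2
import numpy as np


def resize_and_bgr2gray(observation, size=(84, 84)):
    """Convert a BGR frame to grayscale, downscale it, and normalize to [0, 1]."""
    gray = cv2.cvtColor(observation, cv2.COLOR_BGR2GRAY)
    resized = cv2.resize(gray, size, interpolation=cv2.INTER_AREA)
    return resized.astype(np.float32) / 255.0
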
Example #3
def main():  # noqa: D103
    parser = argparse.ArgumentParser(description="Run DQN on iLOCuS")
    parser.add_argument("--network_name",
                        default="deep_q_network",
                        type=str,
                        help="Type of model to use")
    parser.add_argument("--batch_size",
                        default=32,
                        type=int,
                        help="Batch size")
    parser.add_argument("--map_shape",
                        default=(15, 15),
                        type=tuple,
                        help="map size")
    parser.add_argument("--num_actions",
                        default=4,
                        type=int,
                        help="level of pricing")

    parser.add_argument("--gamma",
                        default=0.8,
                        type=float,
                        help="Discount factor")
    parser.add_argument("--alpha",
                        default=0.0001,
                        type=float,
                        help="Learning rate")
    parser.add_argument("--epsilon",
                        default=0.5,
                        type=float,
                        help="Exploration probability for epsilon-greedy")
    parser.add_argument("--target_update_freq",
                        default=10000,
                        type=int,
                        help="Frequency for copying weights to target network")
    parser.add_argument(
        "--num_iterations",
        default=5000000,
        type=int,
        help="Number of overal interactions to the environment")
    parser.add_argument("--max_episode_length",
                        default=200000,
                        type=int,
                        help="Terminate earlier for one episode")
    parser.add_argument("--train_freq",
                        default=4,
                        type=int,
                        help="Frequency for training")
    parser.add_argument("--num-burn-in",
                        default=10000,
                        type=int,
                        help="number of memory before train")

    parser.add_argument("-o",
                        "--output",
                        default="ilocus-v0",
                        type=str,
                        help="Directory to save data to")
    parser.add_argument("--seed", default=0, type=int, help="Random seed")
    parser.add_argument("--train",
                        default=True,
                        type=bool,
                        help="Train/Evaluate, set True if train the model")
    parser.add_argument("--model_path",
                        default="atari-v0",
                        type=str,
                        help="specify model path to evaluation")
    parser.add_argument("--max_grad",
                        default=1.0,
                        type=float,
                        help="Parameter for huber loss")
    parser.add_argument("--log_dir",
                        default="log",
                        type=str,
                        help="specify log folder to save evaluate result")
    parser.add_argument(
        "--flip_coin",
        default=False,
        type=str,
        help="specify whether or not choosing double q learning")
    parser.add_argument("--eval_num",
                        default=100,
                        type=int,
                        help="number of evaluation to run")
    parser.add_argument("--save_freq",
                        default=100000,
                        type=int,
                        help="model save frequency")

    # memory related args
    parser.add_argument("--buffer_size",
                        default=100000,
                        type=int,
                        help="reply memory buffer size")
    parser.add_argument(
        "--look_back_steps",
        default=4,
        type=int,
        help="how many previous pricing tables will be fed into RL")

    args = parser.parse_args()
    print("\nParameters:")
    for arg in vars(args):
        print(arg, getattr(args, arg))

    # Initialize the policy shared by training and evaluation
    policy = LinearDecayGreedyEpsilonPolicy(args.epsilon, 0.1, 1000000,
                                            args.num_actions)

    if not args.train:
        '''Evaluate the model'''
        # check model path
        if args.model_path == '':
            print("Model path must be set when evaluating")
            exit(1)

        # log file used to save evaluation results
        # (note: args.model_num is referenced here but is not added as an argument above)
        log_file = os.path.join(args.log_dir, args.network_name,
                                str(args.model_num))
        model_dir = os.path.join(args.model_path, args.network_name,
                                 str(args.model_num))

        with tf.Session() as sess:
            # load model
            # with open(model_dir + ".json", 'r') as json_file:
            #     loaded_model_json = json_file.read()
            #     q_network_online = model_from_json(loaded_model_json)
            #     q_network_target = model_from_json(loaded_model_json)
            #
            # sess.run(tf.global_variables_initializer())
            #
            # # load weights into model
            # q_network_online.load_weights(model_dir + ".h5")
            # q_network_target.load_weights(model_dir + ".h5")

            driver_sim = DriverSim()
            env = Environment(driver_sim=driver_sim)

            memory = ReplayMemory(args.buffer_size, args.look_back_steps)
            q_network = create_model(args.look_back_steps, args.map_shape,
                                     args.num_actions)
            dqn_agent = DQNAgent(q_network=q_network,
                                 memory=memory,
                                 policy=policy,
                                 gamma=args.gamma,
                                 target_update_freq=args.target_update_freq,
                                 num_burn_in=args.num_burn_in,
                                 train_freq=args.train_freq,
                                 batch_size=args.batch_size)
        exit(0)
    '''Train the model'''

    with tf.Session() as sess:
        # with tf.device('/cpu:0'):
        print("created model")

        driver_sim = DriverSim()
        env = Environment(driver_sim=driver_sim)
        print("set up environment")

        # # create output dir; os.mkdir raises an error if the dir already exists, to avoid overwriting
        # os.mkdir(args.output + "/" + args.network_name)

        memory = ReplayMemory(args.buffer_size, args.look_back_steps)
        q_network = create_model(args.look_back_steps, args.map_shape,
                                 args.num_actions)
        dqn_agent = DQNAgent(q_network=q_network,
                             memory=memory,
                             policy=policy,
                             gamma=args.gamma,
                             target_update_freq=args.target_update_freq,
                             num_burn_in=args.num_burn_in,
                             train_freq=args.train_freq,
                             batch_size=args.batch_size)
        print("defined dqn agent")

        optimizer = Adam(learning_rate=args.alpha)
        q_network.compile(optimizer, mean_huber_loss)

        sess.run(tf.global_variables_initializer())

        print("initializing environment")
        env.reset()

        print("in fit")
        if os.path.exists(args.output):
            shutil.rmtree(args.output)
        os.mkdir(args.output)
        dqn_agent.fit(env=env,
                      num_iterations=args.num_iterations,
                      output_dir=os.path.join(args.output),
                      max_episode_length=args.max_episode_length)
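
# Example #3 compiles the Q-network with `mean_huber_loss`, which is defined
# elsewhere in that project (the --max_grad flag is its delta parameter). A
# minimal sketch of a mean Huber loss with the quadratic/linear switch at
# max_grad, written against TensorFlow ops; the project's version may differ.
import tensorflow as tf


def mean_huber_loss(y_true, y_pred, max_grad=1.0):
    """Mean Huber loss: quadratic for errors below max_grad, linear beyond it."""
    error = tf.abs(y_true - y_pred)
    quadratic = tf.minimum(error, max_grad)
    linear = error - quadratic
    return tf.reduce_mean(0.5 * tf.square(quadratic) + max_grad * linear)
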
Example #4
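# Fragment: this keyword argument closes a model-construction call that is not included in this excerpt.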
                      load_weights_file=args.weights)
memory = ReplayMemory(maxlen=1000000)
processor = AtariProcessor()

if args.test:
    policy = MaxQPolicy()
    dqn = DQNAgent(env=env,
                   memory=memory,
                   policy=policy,
                   model=model,
                   discount_rate=0.99,
                   processor=processor)
    dqn.play()
else:
    policy = EpsilonPolicy(epsilon_max=1.0,
                           epsilon_min=0.1,
                           decay_steps=1250000)
    dqn = DQNAgent(env=env,
                   memory=memory,
                   policy=policy,
                   batch_size=32,
                   model=model,
                   discount_rate=0.99,
                   processor=processor,
                   weights_filename='./pacman.h5')
    dqn.fit(num_steps=4000000,
            start_train=1000,
            learn_every=4,
            update_target_model=5000,
            save_every=1000)
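
# Examples #3, #4, and #8 all rely on an epsilon-greedy policy with a decaying
# epsilon (LinearDecayGreedyEpsilonPolicy / EpsilonPolicy), supplied by their
# respective projects. A minimal, hypothetical sketch of the idea with a linear
# schedule; the real classes' constructor and method names may differ.
import numpy as np


class LinearEpsilonGreedy:
    def __init__(self, epsilon_max=1.0, epsilon_min=0.1, decay_steps=1000000):
        self.epsilon_max = epsilon_max
        self.epsilon_min = epsilon_min
        self.decay_steps = decay_steps

    def epsilon(self, step):
        # Linearly anneal epsilon from epsilon_max down to epsilon_min.
        fraction = min(step / self.decay_steps, 1.0)
        return self.epsilon_max + fraction * (self.epsilon_min - self.epsilon_max)

    def select_action(self, q_values, step):
        # Explore with probability epsilon, otherwise act greedily on the Q-values.
        if np.random.rand() < self.epsilon(step):
            return np.random.randint(len(q_values))
        return int(np.argmax(q_values))
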
Example #5
def main():
    parser = argparse.ArgumentParser(
        description='Train using Gazebo Simulations')
    parser.add_argument('--seed', default=10, type=int, help='Random seed')
    parser.add_argument('--input_shape', default=(80, 100), help='Input shape')
    parser.add_argument('--gamma', default=0.99, help='Discount factor')
    parser.add_argument('--epsilon',
                        default=0.1,
                        help='Exploration probability in epsilon-greedy')
    parser.add_argument('--learning_rate',
                        default=0.00001,
                        help='learning rate')
    parser.add_argument('--window_size',
                        default=4,
                        type=int,
                        help='Number of frames to feed to the Q-network')
    parser.add_argument('--num_time',
                        default=4,
                        type=int,
                        help='Number of steps in RNN')
    parser.add_argument('--num_actions',
                        default=7,
                        type=int,
                        help='Number of actions')
    parser.add_argument('--batch_size',
                        default=64,
                        type=int,
                        help='Batch size of the training part')
    parser.add_argument('--num_iteration',
                        default=500000,
                        type=int,
                        help='number of iterations to train')
    parser.add_argument(
        '--eval_every',
        default=0.01,
        type=float,
        help='What fraction of num_iteration to run between evaluations')

    args = parser.parse_args()
    random.seed(args.seed)
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    batch_environment = GazeboWorld()
    print('Environment initialized')

    replay_memory = ReplayMemory(REPLAYMEMORY_SIZE, args.window_size,
                                 args.input_shape)
    online_model, online_params = create_model(args.window_size,
                                               args.input_shape,
                                               args.num_actions,
                                               'online_model',
                                               create_duel_q_network,
                                               trainable=True)
    target_model, target_params = create_model(args.window_size,
                                               args.input_shape,
                                               args.num_actions,
                                               'target_model',
                                               create_duel_q_network,
                                               trainable=False)
    update_target_params_ops = [
        t.assign(s) for s, t in zip(online_params, target_params)
    ]

    agent = DQNAgent(online_model, target_model, replay_memory,
                     args.num_actions, args.gamma, TARGET_UPDATE_FREQENCY,
                     update_target_params_ops, args.batch_size,
                     args.learning_rate)

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.8)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    with sess.as_default():
        # saving and loading networks
        trainables = tf.trainable_variables()
        trainable_saver = tf.train.Saver(trainables, max_to_keep=1)
        sess.run(tf.global_variables_initializer())
        checkpoint = tf.train.get_checkpoint_state("saved_networks")
        print('checkpoint:', checkpoint)
        if checkpoint and checkpoint.model_checkpoint_path:
            trainable_saver.restore(sess, checkpoint.model_checkpoint_path)
            print("Successfully loaded:", checkpoint.model_checkpoint_path)
        else:
            print("Could not find old network weights")
        # make target_model equal to online_model
        sess.run(update_target_params_ops)

        print('Prepare fixed samples for mean max Q.')
        fixed_samples = get_fixed_samples(batch_environment, args.num_actions,
                                          NUM_FIXED_SAMPLES)

        # initialize replay buffer
        print('Burn in replay_memory.')
        agent.fit(sess, batch_environment, NUM_BURN_IN, do_train=False)

        # start training:
        fit_iteration = int(args.num_iteration * args.eval_every)
        for i in range(0, args.num_iteration, fit_iteration):
            # evaluate:
            reward_mean, reward_var, reward_max, reward_min, reward = agent.evaluate(
                sess, batch_environment)
            mean_max_Q1, mean_max_Q2 = agent.get_mean_max_Q(
                sess, fixed_samples)
            print("%d, %f, %f, %f, %f, %f, %f" %
                  (i, mean_max_Q1, mean_max_Q2, reward_mean, reward_var,
                   reward_max, reward_min))
            # train:
            agent.fit(sess, batch_environment, fit_iteration, do_train=True)
            trainable_saver.save(sess, 'saved_networks/', global_step=i)

        reward_mean, reward_var, reward_max, reward_min, reward = agent.evaluate(
            sess, batch_environment)
        mean_max_Q1, mean_max_Q2 = agent.get_mean_max_Q(sess, fixed_samples)
        print("%d, %f, %f, %f, %f, %f, %f" %
              (i, mean_max_Q1, mean_max_Q2, reward_mean, reward_var,
               reward_max, reward_min))
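
# Example #5 builds its online and target networks with create_duel_q_network,
# defined in the surrounding project. The defining piece of a dueling
# architecture is how the value and advantage streams are recombined; a rough
# sketch in the same TF1-style API used above (layer sizes and names are assumptions).
import tensorflow as tf


def duel_q_head(features, num_actions):
    """Combine value and advantage streams into Q-values: Q = V + (A - mean(A))."""
    value = tf.layers.dense(features, 1, name="value")
    advantage = tf.layers.dense(features, num_actions, name="advantage")
    return value + (advantage - tf.reduce_mean(advantage, axis=1, keepdims=True))
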
Example #6
def main():
    parser = argparse.ArgumentParser(
        description='Run DQN on Atari Space Invaders')
    parser.add_argument('--env',
                        default='SpaceInvaders-v0',
                        help='Atari env name')
    parser.add_argument('--seed', default=10703, type=int, help='Random seed')
    parser.add_argument('--input_shape', default=(84, 84), help='Input shape')
    parser.add_argument('--gamma', default=0.99, help='Discount factor')
    parser.add_argument('--epsilon',
                        default=0.1,
                        help='Exploration probability in epsilon-greedy')
    parser.add_argument('--learning_rate',
                        default=0.00025,
                        help='Training learning rate.')
    parser.add_argument('--window_size',
                        default=4,
                        type=int,
                        help='Number of frames to feed to the Q-network')
    parser.add_argument('--batch_size',
                        default=32,
                        type=int,
                        help='Batch size of the training part')
    parser.add_argument('--num_process',
                        default=3,
                        type=int,
                        help='Number of parallel environment')
    parser.add_argument('--num_iteration',
                        default=20000000,
                        type=int,
                        help='number of iterations to train')
    parser.add_argument(
        '--eval_every',
        default=0.001,
        type=float,
        help='What fraction of num_iteration to run between evaluations.')
    parser.add_argument('--is_duel',
                        default=1,
                        type=int,
                        help='Whether use duel DQN, 0 means no, 1 means yes.')
    parser.add_argument(
        '--is_double',
        default=1,
        type=int,
        help='Whether use double DQN, 0 means no, 1 means yes.')
    parser.add_argument(
        '--is_per',
        default=1,
        type=int,
        help='Whether use PriorityExperienceReplay, 0 means no, 1 means yes.')
    parser.add_argument(
        '--is_distributional',
        default=1,
        type=int,
        help='Whether use distributional DQN, 0 means no, 1 means yes.')
    parser.add_argument('--num_step',
                        default=1,
                        type=int,
                        help='Num Step for multi-step DQN, 3 is recommended')
    parser.add_argument('--is_noisy',
                        default=1,
                        type=int,
                        help='Whether use NoisyNet, 0 means no, 1 means yes.')

    args = parser.parse_args()
    args.input_shape = tuple(args.input_shape)
    print('Environment: %s.' % (args.env, ))
    env = gym.make(args.env)
    num_actions = env.action_space.n
    print('number_actions: %d.' % (num_actions, ))
    env.close()

    random.seed(args.seed)
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    batch_environment = BatchEnvironment(args.env, args.num_process,
                                         args.window_size, args.input_shape,
                                         NUM_FRAME_PER_ACTION,
                                         MAX_EPISODE_LENGTH)

    if args.is_per == 1:
        replay_memory = PriorityExperienceReplay(REPLAYMEMORY_SIZE,
                                                 args.window_size,
                                                 args.input_shape)
    else:
        replay_memory = ReplayMemory(REPLAYMEMORY_SIZE, args.window_size,
                                     args.input_shape)

    create_network_fn = create_deep_q_network if args.is_duel == 0 else create_duel_q_network
    create_model_fn = create_model if args.is_distributional == 0 else create_distributional_model
    noisy = args.is_noisy == 1
    online_model, online_params = create_model_fn(args.window_size,
                                                  args.input_shape,
                                                  num_actions,
                                                  'online_model',
                                                  create_network_fn,
                                                  trainable=True,
                                                  noisy=noisy)
    target_model, target_params = create_model_fn(args.window_size,
                                                  args.input_shape,
                                                  num_actions,
                                                  'target_model',
                                                  create_network_fn,
                                                  trainable=False,
                                                  noisy=noisy)
    update_target_params_ops = [
        t.assign(s) for s, t in zip(online_params, target_params)
    ]

    agent = DQNAgent(online_model, target_model, replay_memory, num_actions,
                     args.gamma, UPDATE_FREQUENCY, TARGET_UPDATE_FREQENCY,
                     update_target_params_ops, args.batch_size, args.is_double,
                     args.is_per, args.is_distributional, args.num_step,
                     args.is_noisy, args.learning_rate, RMSP_DECAY,
                     RMSP_MOMENTUM, RMSP_EPSILON)

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.4)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    with sess.as_default():
        sess.run(tf.global_variables_initializer())
        # make target_model equal to online_model
        sess.run(update_target_params_ops)

        print('Prepare fixed samples for mean max Q.')
        fixed_samples = get_fixed_samples(batch_environment, num_actions,
                                          NUM_FIXED_SAMPLES)

        print('Burn in replay_memory.')
        agent.fit(sess, batch_environment, NUM_BURN_IN, do_train=False)

        # Begin to train:
        fit_iteration = int(args.num_iteration * args.eval_every)

        for i in range(0, args.num_iteration, fit_iteration):
            # Evaluate:
            reward_mean, reward_var = agent.evaluate(sess, batch_environment,
                                                     NUM_EVALUATE_EPSIODE)
            mean_max_Q = agent.get_mean_max_Q(sess, fixed_samples)
            print("%d, %f, %f, %f" % (i, mean_max_Q, reward_mean, reward_var))
            # Train:
            agent.fit(sess, batch_environment, fit_iteration, do_train=True)

    batch_environment.close()
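
# Example #6 toggles several DQN extensions through flags (is_double, is_per,
# is_distributional, num_step, is_noisy) that are handled inside DQNAgent. For
# reference, the double-DQN target enabled by is_double differs from the vanilla
# target only in how the next action is chosen; a NumPy sketch under assumed
# array shapes (1-D rewards/dones, 2-D per-action Q-values).
import numpy as np


def dqn_targets(rewards, dones, q_next_online, q_next_target, gamma, double=True):
    """Bootstrapped targets: double DQN selects the next action with the online
    network but evaluates it with the target network; vanilla DQN maxes over the
    target network directly."""
    if double:
        next_actions = np.argmax(q_next_online, axis=1)
        next_q = q_next_target[np.arange(len(next_actions)), next_actions]
    else:
        next_q = np.max(q_next_target, axis=1)
    return rewards + gamma * (1.0 - dones) * next_q
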
Example #7
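    # Fragment: the Gym environment `env` and the surrounding setup are created earlier, outside this excerpt.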
    env.seed(123)
    nb_actions = env.action_space.n

    model = DQNModel(nb_actions=nb_actions).model
    policy = EpsGreedyPolicy(eps_min=0.1,
                             eps_max=1,
                             eps_test=0.05,
                             nb_steps=1000000)
    memory = Memory(max_len=1000000)
    processor = AtariProcessor()
    dqn = DQNAgent(env,
                   model,
                   policy,
                   memory,
                   processor,
                   gamma=0.99,
                   batch_size=32,
                   target_model_update_steps=10000,
                   nb_episodes_warmup=500)

    dqn.fit(nb_episodes=20000,
            action_repetition=1,
            save_weights=True,
            save_weights_step=1000,
            weights_folder='./',
            visualize=True)

    # file = './weights.h5f'
    # dqn.load_weights(file)
    dqn.test(nb_episodes=10, visualize=True)
Example #8
memory = ReplayMemory(maxlen=1000, game_over_bias=5)
processor = VoidProcessor()

if args.test:
    policy = MaxQPolicy()
    dqn = DQNAgent(env=env,
                   memory=memory,
                   policy=policy,
                   model=model,
                   discount_rate=0.99,
                   processor=processor)
    dqn.play()
else:
    policy = EpsilonPolicy(epsilon_max=1.0,
                           epsilon_min=0.05,
                           decay_steps=10000)
    dqn = DQNAgent(env=env,
                   memory=memory,
                   policy=policy,
                   batch_size=64,
                   model=model,
                   discount_rate=0.99,
                   processor=processor,
                   weights_filename='./cartpole.h5')
    dqn.fit(num_steps=20000,
            start_train=1000,
            learn_every=1,
            update_target_model=100,
            save_every=1000,
            max_episode_score=500)
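
# Examples #4 and #8 share the same project-specific ReplayMemory, Processor,
# and Policy interfaces. The core of a replay memory is a bounded buffer with
# uniform minibatch sampling; a minimal sketch (the game_over_bias weighting
# used above is intentionally omitted).
import random
from collections import deque


class SimpleReplayMemory:
    def __init__(self, maxlen=1000):
        self.buffer = deque(maxlen=maxlen)

    def append(self, state, action, reward, next_state, done):
        # Oldest transitions are dropped automatically once maxlen is reached.
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        # Uniformly sample a minibatch of stored transitions.
        return random.sample(self.buffer, min(batch_size, len(self.buffer)))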