p_phys=all_configs["p_phys"],
    p_meas=all_configs["p_meas"],
    error_model=all_configs["error_model"],
    use_Y=all_configs["use_Y"],
    volume_depth=all_configs["volume_depth"],
    static_decoder=static_decoder)
# -------------------------------------------------------------------------------------------

model = build_convolutional_nn(all_configs["c_layers"],
                               all_configs["ff_layers"],
                               env.observation_space.shape, env.num_actions)
memory = SequentialMemory(limit=all_configs["buffer_size"], window_length=1)
policy = LinearAnnealedPolicy(
    EpsGreedyQPolicy(masked_greedy=all_configs["masked_greedy"]),
    attr='eps',
    value_max=all_configs["max_eps"],
    value_min=all_configs["final_eps"],
    value_test=0.0,
    nb_steps=all_configs["exploration_fraction"])
test_policy = GreedyQPolicy(masked_greedy=True)

# ------------------------------------------------------------------------------------------

dqn = DQNAgent(model=model,
               nb_actions=env.num_actions,
               memory=memory,
               nb_steps_warmup=all_configs["learning_starts"],
               target_model_update=all_configs["target_network_update_freq"],
               policy=policy,
               test_policy=test_policy,
               gamma=all_configs["gamma"],
Example #2
model = Sequential()
model.add(Flatten(input_shape=(WINDOW_LENGTH, 128)))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(nb_actions, activation='linear'))
model.summary()

NUM_STEPS = 10000

memory = SequentialMemory(limit=round(0.75 * NUM_STEPS),
                          window_length=WINDOW_LENGTH)
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                              attr='eps',
                              value_max=1.,
                              value_min=.1,
                              value_test=0.05,
                              nb_steps=round(0.8 * NUM_STEPS))
test_policy = EpsGreedyQPolicy(eps=0.05)
dqn = DQNAgent(model=model,
               nb_actions=nb_actions,
               memory=memory,
               nb_steps_warmup=100,
               processor=ScoreProcessor(),
               target_model_update=1e-2,
               policy=policy,
               test_policy=test_policy)
dqn.compile(Adam(lr=5e-4), metrics=['mae'])

tensorboard = create_logger(ENV_NAME)
checkpoint = create_model_checkpoint(ENV_NAME,
Example #3
def main(params=None):
    """
    performs training and evaluation of params
    :return: model
    """
    if params is None:
        params = {
            'model_type': 'dqn_agent',
            'l1_out': 128,
            'l2_out': 64,
            'gamma': 0.5,
            'target_model_update': 1,
            'delta_clip': 0.01,
            'nb_steps_warmup': 1000
        }
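        # Note: the code below also reads params['enable_double_dqn__'],
        # params['dueling_type__'] and params['name'], which are not part of these
        # defaults, so callers are expected to supply them.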

    model_type = 'dqn_agent'
    env_player = SimpleRLPlayer(battle_format="gen8randombattle")
    # print('env_player',env_player)
    # print('help', help(env_player))
    env_player2 = SimpleRLPlayer(battle_format="gen8randombattle")

    opponent = RandomPlayer(battle_format="gen8randombattle")
    second_opponent = MaxDamagePlayer(battle_format="gen8randombattle")

    # Output dimension
    n_action = len(env_player.action_space)

    # model_params = {
    #     'n_actions': n_action,
    #     'l1_out': 128,
    #     'l2_out': 64,
    #     'model_type': params['model_type']
    # }
    model_params = params
    model_params['n_actions'] = n_action

    model = get_model(model_params)

    # print('first model summary')
    # print(model.summary())
    # model = Sequential()
    # model.add(Dense(128, activation="elu", input_shape=(1, 10)))
    #
    # # Our embedding has shape (1, 10), which affects our hidden layer
    # # dimension and output dimension.
    # # Flattening resolves potential issues that would arise otherwise.
    # model.add(Flatten())
    # model.add(Dense(64, activation="elu"))
    # model.add(Dense(n_action, activation="linear"))

    # elu activation is similar to relu
    # https://ml-cheatsheet.readthedocs.io/en/latest/activation_functions.html#elu
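    # A rough reference sketch (not from the original example):
    #   relu(x) = max(0, x)
    #   elu(x)  = x                   if x > 0
    #             alpha * (e^x - 1)   otherwise, with alpha typically 1.0
    # so ELU keeps small negative outputs instead of a hard zero, which can help
    # gradients flow for negative pre-activations.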

    # determine memory type
    if params['model_type'] in {'dqn_agent', 'sarsa_agent'}:
        # memory = SequentialMemory(limit=10000, window_length=1)
        memory = SequentialMemory(limit=NB_TRAINING_STEPS, window_length=1)
    else:
        memory = EpisodeParameterMemory(limit=10000, window_length=1)

    # Simple epsilon greedy
    # What is linear annealed policy?
    # - this policy gives gradually decreasing thresholds for the epsilon greedy policy
    # - it acts as a wrapper around epsilon greedy to feed in a custom threshold
    pol_steps = NB_TRAINING_STEPS
    policy = LinearAnnealedPolicy(
        EpsGreedyQPolicy(),
        attr="eps",
        value_max=1.0,
        value_min=0.05,
        value_test=0,
        nb_steps=pol_steps,
    )
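    # Sketch of the schedule (assuming keras-rl's linear annealing): during training
    #   eps(step) = max(value_min, value_max - (value_max - value_min) * step / nb_steps)
    # so eps decays linearly from 1.0 to 0.05 over pol_steps steps, and value_test is
    # used instead whenever the agent is run in test mode.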
    # pol_steps = NB_TRAINING_STEPS
    policy_boltz = BoltzmannQPolicy(tau=1)
    # policy = LinearAnnealedPolicy(
    #     BoltzmannQPolicy(),
    #     attr="tau",
    #     value_max=1.0,
    #     value_min=0.05,
    #     value_test=0,
    #     nb_steps=pol_steps,
    # )
    policy = policy_boltz
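    # BoltzmannQPolicy sketch: actions are sampled from a softmax over the Q-values,
    #   p(a) = exp(q[a] / tau) / sum_b exp(q[b] / tau)
    # tau=1 keeps the distribution fairly soft, while tau -> 0 approaches greedy selection.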

    # Defining our DQN
    # model = tf.keras.models.load_model('dqn_v_dqn')

    if params['model_type'] == 'dqn_agent':
        dqn = DQNAgent(
            model=model,
            nb_actions=len(env_player.action_space),
            policy=policy,
            memory=memory,
            nb_steps_warmup=params['nb_steps_warmup'],
            gamma=params['gamma'],
            target_model_update=params['target_model_update'],
            # delta_clip=0.01,
            delta_clip=params['delta_clip'],
            enable_double_dqn=params['enable_double_dqn__'],
            enable_dueling_network=params['enable_double_dqn__'],
            dueling_type=params['dueling_type__'])
        dqn.compile(Adam(lr=0.00025), metrics=["mae"])

    elif params['model_type'] == 'sarsa_agent':
        dqn = SARSAAgent(model=model,
                         nb_actions=len(env_player.action_space),
                         policy=policy,
                         nb_steps_warmup=params['nb_steps_warmup'],
                         gamma=params['gamma'],
                         delta_clip=params['delta_clip'])
        dqn.compile(Adam(lr=0.00025), metrics=["mae"])
    else:
        # CEMAgent
        # https://towardsdatascience.com/cross-entropy-method-for-reinforcement-learning-2b6de2a4f3a0
        dqn = CEMAgent(model=model,
                       nb_actions=len(env_player.action_space),
                       memory=memory,
                       nb_steps_warmup=params['nb_steps_warmup'])
        # different compile function
        dqn.compile()

    # dqn.compile(Adam(lr=0.00025), metrics=["mae"])
    # opponent dqn
    dqn_opponent = DQNAgent(
        model=model,
        nb_actions=len(env_player.action_space),
        policy=policy,
        memory=memory,
        nb_steps_warmup=params['nb_steps_warmup'],
        gamma=params['gamma'],
        target_model_update=params['target_model_update'],
        # delta_clip=0.01,
        delta_clip=params['delta_clip'],
        enable_double_dqn=params['enable_double_dqn__'],
        enable_dueling_network=params['enable_double_dqn__'],
        dueling_type=params['dueling_type__'])
    dqn_opponent.compile(Adam(lr=0.00025), metrics=["mae"])
    # NB_TRAINING_STEPS = NB_TRAINING_STEPS

    # rl_opponent = TrainedRLPlayer(model)
    # Training
    rounds = 4
    n_steps = NB_TRAINING_STEPS // rounds

    for k in range(rounds):
        env_player.play_against(
            env_algorithm=dqn_training,
            opponent=opponent,
            env_algorithm_kwargs={
                "dqn": dqn,
                "nb_steps": n_steps
            },
        )
        env_player.play_against(
            env_algorithm=dqn_training,
            opponent=second_opponent,
            env_algorithm_kwargs={
                "dqn": dqn,
                "nb_steps": n_steps
            },
        )

    name = params["name"] + "_model"
    model.save(name)

    # loaded_model = tf.keras.models.load_model(name)

    # Evaluation
    print("Results against random player:")
    env_player.play_against(
        env_algorithm=dqn_evaluation,
        opponent=opponent,
        env_algorithm_kwargs={
            "dqn": dqn,
            "nb_episodes": NB_EVALUATION_EPISODES
        },
    )

    print("\nResults against max player:")
    env_player.play_against(
        env_algorithm=dqn_evaluation,
        opponent=second_opponent,
        env_algorithm_kwargs={
            "dqn": dqn,
            "nb_episodes": NB_EVALUATION_EPISODES
        },
    )

    return model
Example #4
# no dueling network
model = Sequential()
model.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
model.add(Dense(20))
model.add(Activation('relu'))
model.add(Dense(20))
model.add(Activation('relu'))
model.add(Dense(20))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                              attr='eps',
                              value_max=1.,
                              value_min=.000002,
                              value_test=.05,
                              nb_steps=500000)
noDuelAgent = DQNAgent(model=model,
                       nb_actions=nb_actions,
                       memory=memory,
                       target_model_update=1e-2,
                       policy=policy)
noDuelAgent.compile(Adam(lr=1e-3), metrics=['mae'])
hist = noDuelAgent.fit(env,
                       nb_max_episode_steps=10000,
                       visualize=False,
                       verbose=2,
                       nb_steps=500000)

reward_his_noDuel = hist.history.get('episode_reward')
Example #5
def training_game():
    env = Environment(
        map_name="HallucinIce",
        visualize=True,
        game_steps_per_episode=150,
        agent_interface_format=features.AgentInterfaceFormat(
            feature_dimensions=features.Dimensions(screen=64, minimap=32)))

    input_shape = (_SIZE, _SIZE, 1)
    nb_actions = 12  # Number of actions

    model = neural_network_model(input_shape, nb_actions)
    memory = SequentialMemory(limit=5000, window_length=_WINDOW_LENGTH)

    processor = SC2Proc()

    # Policy

    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                                  attr="eps",
                                  value_max=1,
                                  value_min=0.7,
                                  value_test=.0,
                                  nb_steps=1e6)

    # Agent

    dqn = DQNAgent(model=model,
                   nb_actions=nb_actions,
                   memory=memory,
                   enable_double_dqn=False,
                   nb_steps_warmup=500,
                   target_model_update=1e-2,
                   policy=policy,
                   batch_size=150,
                   processor=processor)

    dqn.compile(Adam(lr=.001), metrics=["mae"])

    # Save the parameters and upload them when needed

    name = "HallucinIce"
    w_file = "dqn_{}_weights.h5f".format(name)
    check_w_file = "train_w" + name + "_weights.h5f"

    if SAVE_MODEL:
        check_w_file = "train_w" + name + "_weights_{step}.h5f"

    log_file = "training_w_{}_log.json".format(name)
    callbacks = [ModelIntervalCheckpoint(check_w_file, interval=1000)]
    callbacks += [FileLogger(log_file, interval=100)]

    if LOAD_MODEL:
        dqn.load_weights(w_file)

    dqn.fit(env,
            callbacks=callbacks,
            nb_steps=1e7,
            action_repetition=2,
            log_interval=1e4,
            verbose=2)

    dqn.save_weights(w_file, overwrite=True)
    dqn.test(env, action_repetition=2, nb_episodes=30, visualize=False)
Example #6
def train(index, policy_nb_steps, fit_nb_steps):

    # Get the environment and extract the number of actions.
    print("Using environment", environment_name)
    environment = gym.make(environment_name)
    environment = CarRacingDiscreteWrapper(environment)
    np.random.seed(666)
    nb_actions = environment.action_space.n

    # Build the model.
    model = build_model((WINDOW_LENGTH, ) + INPUT_SHAPE, nb_actions)
    print(model.summary())

    # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
    # even the metrics!
    memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)
    processor = CarRacingProcessor()

    # Select a policy. We use eps-greedy action selection, which means that a random action is selected
    # with probability eps. We anneal eps from 1.0 to 0.1 over the course of 1M steps. This is done so that
    # the agent initially explores the environment (high eps) and then gradually sticks to what it knows
    # (low eps). We also set a dedicated eps value that is used during testing. Note that we set it to 0.05
    # so that the agent still performs some random actions. This ensures that the agent cannot get stuck.
    policy = LinearAnnealedPolicy(
        EpsGreedyQPolicy(),
        attr='eps',
        value_max=1.,
        value_min=.1,
        value_test=.05,
        #nb_steps=1000000
        nb_steps=policy_nb_steps)

    # The trade-off between exploration and exploitation is difficult and an on-going research topic.
    # If you want, you can experiment with the parameters or use a different policy. Another popular one
    # is Boltzmann-style exploration:
    # policy = BoltzmannQPolicy(tau=1.)
    # Feel free to give it a try!

    dqn = DQNAgent(model=model,
                   nb_actions=nb_actions,
                   policy=policy,
                   memory=memory,
                   processor=processor,
                   nb_steps_warmup=50000,
                   gamma=.99,
                   target_model_update=10000,
                   train_interval=4,
                   delta_clip=1.)
    dqn.compile(optimizers.Adam(lr=.00025), metrics=['mae'])

    weights_filename = 'dqn_{}_{}_weights.h5f'.format(environment_name, index)

    # Okay, now it's time to learn something! We capture the interrupt exception so that training
    # can be prematurely aborted. Notice that now you can use the built-in Keras callbacks!
    checkpoint_weights_filename = 'dqn_' + environment_name + '_weights_{step}.h5f'
    log_filename = 'dqn_{}_log.json'.format(environment_name)
    callbacks = [
        ModelIntervalCheckpoint(checkpoint_weights_filename, interval=250000)
    ]
    callbacks += [TensorboardCallback()]
    callbacks += [FileLogger(log_filename, interval=100)]
    dqn.fit(
        environment,
        callbacks=callbacks,
        #nb_steps=1750000,
        nb_steps=fit_nb_steps,
        log_interval=10000,
        visualize="visualize" in sys.argv)

    # After training is done, we save the final weights one more time.
    dqn.save_weights(weights_filename, overwrite=True)
Example #7
model.add(Dense(nb_actions, activation="linear"))
print(model.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)
processor = AtariProcessor()

# Select a policy. Here a MaxBoltzmannQPolicy is wrapped in a linear annealing schedule: with
# probability eps an action is sampled from a softmax over the Q-values, otherwise the greedy
# action is taken. We anneal eps from 1.0 to 0.05 over the course of 1M steps, so the agent
# initially explores the environment (high eps) and then gradually sticks to what it knows
# (low eps). We also set a dedicated eps value (0.001) that is used during testing, so the agent
# still performs the occasional exploratory action and cannot get stuck.
policy = LinearAnnealedPolicy(MaxBoltzmannQPolicy(),
                              attr='eps',
                              value_max=1.,
                              value_min=.05,
                              value_test=.001,
                              nb_steps=1000000)

# The trade-off between exploration and exploitation is difficult and an on-going research topic.
# If you want, you can experiment with the parameters or use a different policy. Another popular one
# is Boltzmann-style exploration:
# policy = BoltzmannQPolicy(tau=1.)
# Feel free to give it a try!
# policy = LinearAnnealedPolicy(BoltzmannQPolicy(), attr='tau', value_max=10., value_min=.1, value_test=.05, nb_steps=1000000)

dqn = DQNAgent(model=model,
               nb_actions=nb_actions,
               policy=policy,
               memory=memory,
               processor=processor,
Example #8
def main(shape=10,
         winsize=4,
         test=False,
         num_max_test=200,
         visualize_training=False,
         start_steps=0,
         randseed=None,
         human_mode_sleep=0.02):
    INPUT_SHAPE = (shape, shape)
    WINDOW_LENGTH = winsize

    class SnakeProcessor(Processor):
        def process_observation(self, observation):
            # assert observation.ndim == 1, str(observation.shape)  # (height, width, channel)
            assert observation.shape == INPUT_SHAPE
            return observation.astype(
                'uint8')  # saves storage in experience memory

        def process_state_batch(self, batch):
            # We could perform this processing step in `process_observation`. In this case, however,
            # we would need to store a `float32` array instead, which is 4x more memory intensive than
            # an `uint8` array. This matters if we store 1M observations.
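            # Rough arithmetic for illustration: 1e6 observations of a 10x10 uint8 grid is
            # about 100 MB, versus roughly 400 MB if the same observations were stored as float32.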
            processed_batch = batch.astype('float32') / 255.
            return processed_batch

        def process_reward(self, reward):
            return reward

    try:
        randseed = int(randseed)
        print(f"set seed to {randseed}")
    except Exception:
        print(f"failed to intify seed of {randseed}, making it None")
        randseed = None

    env = gym.make('snakenv-v0',
                   gs=shape,
                   seed=randseed,
                   human_mode_sleep=human_mode_sleep)
    np.random.seed(123)
    env.seed(123)

    input_shape = (WINDOW_LENGTH, ) + INPUT_SHAPE
    model = make_model(input_shape, 5)

    memory = SequentialMemory(limit=100000, window_length=WINDOW_LENGTH)
    processor = SnakeProcessor()

    start_policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                                        attr='eps',
                                        value_max=0,
                                        value_min=0,
                                        value_test=0,
                                        nb_steps=500000)
    policy = BoltzmannQPolicy(tau=0.25)

    interval = 20000

    dqn = DQNAgent(model=model,
                   nb_actions=5,
                   policy=policy,
                   memory=memory,
                   processor=processor,
                   nb_steps_warmup=2000,
                   gamma=.99,
                   target_model_update=interval,
                   train_interval=4,
                   delta_clip=1.)

    dqn.compile(Adam(), metrics=['mae'])
    weights_filename = 'dqn_snake_weights.h5f'

    if not test:
        if os.path.exists('starting_weights.h5'):
            print('loadin!')
            model.load_weights('starting_weights.h5')
        # Okay, now it's time to learn something! We capture the interrupt exception so that training
        # can be prematurely aborted. Notice that now you can use the built-in Keras callbacks!
        weights_filename = 'dqn_{}_weights.h5f'.format('snake')
        checkpoint_weights_filename = 'dqn_' + 'snake' + '_weights_{step}.h5f'
        log_filename = 'dqn_{}_log.json'.format('snake')
        callbacks = [
            ModelIntervalCheckpoint(checkpoint_weights_filename,
                                    interval=interval)
        ]
        callbacks += [
            ModelIntervalCheckpoint(weights_filename, interval=interval)
        ]
        callbacks += [FileLogger(log_filename, interval=500)]
        callbacks += [WandbLogger(project="snake-rl")]
        dqn.fit(env,
                callbacks=callbacks,
                nb_steps=10000000,
                log_interval=10000,
                visualize=visualize_training,
                nb_max_start_steps=start_steps)

        # After training is done, we save the final weights one more time.
        # dqn.save_weights(weights_filename, overwrite=True)

        # Finally, evaluate our algorithm for 10 episodes.
        # dqn.test(env, nb_episodes=10, visualize=True, nb_max_episode_steps=100)
    else:
        while True:
            try:
                dqn.load_weights(weights_filename)
            except Exception:
                print("weights not found, waiting")
            dqn.test(env,
                     nb_episodes=10,
                     visualize=visualize_training,
                     nb_max_episode_steps=num_max_test)
            time.sleep(3)
Example #9
class BetaFlapDQN(DQNAgent):
    def __init__(self, inputs, buffer, sess_id, sess, **kwargs):
        self.util = Utility()
        self.sess = sess
        self.sess_id = sess_id

        game = inputs['game']
        agnt = inputs['agent']
        sess = agnt['session']
        eps = sess['episode']
        mod = inputs['model']
        trn = mod['training']
        sv = mod['save']
        mem = inputs['memory']
        '''---Environment Parameters---'''
        self.env_name = game['name']
        self.fps = game['fps']
        self.mode = game['difficulty']
        self.target = game['target']
        self.tick = game['tick']
        '''---Episode Parameters---'''
        self.nb_episodes = sess['max_ep']
        self.nb_max_episode_steps = game['fps'] * 60 * eps['max_time']
        self.nb_steps = self.nb_max_episode_steps * self.nb_episodes
        self.nb_steps_warmup = trn['warmup']
        self.nb_max_start_steps = trn['max_ep_observe']
        self.max_start_steps = trn['warmup']
        self.keep_gif_score = eps['keep_gif_score']
        '''---Agent / Model Parameters---'''
        self.name = agnt['name']
        self.nb_actions = agnt['action_size']
        self.delta_clip = agnt['delta_clip']

        self.training = trn['training']
        self.verbose = trn['verbose']
        self.lr = trn['learn_rate']
        self.eps = trn['initial_epsilon']
        self.value_max = trn['initial_epsilon']
        self.value_min = trn['terminal_epsilon']
        self.anneal = trn['anneal']
        self.shuffle = trn['shuffle']
        self.train_interval = trn['interval']
        self.validate = trn['validate']
        self.split = trn['split']
        self.action_repetition = trn['action_repetition']
        self.epochs = trn['epochs']
        self.epoch = 1

        prec = km.binary_precision()
        re = km.binary_recall()
        f1 = km.binary_f1_score()
        self.metrics = ['accuracy', 'mse', prec, re, f1]
        self.H = mod['filter_size']
        self.alpha = mod['alpha']
        self.gamma = mod['gamma']
        self.momentum = mod['momentum']
        self.decay = mod['decay']
        self.target_model_update = mod['target_update']
        self.type = mod['type']
        self.enable_double_dqn = mod['double_dqn']
        self.enable_dueling_network = mod['dueling_network']
        self.dueling_type = mod['dueling_type']

        self.limit = mem['limit']
        self.batch_size = mem['batch_size']
        self.window_length = mem['state_size']
        self.memory_interval = mem['interval']

        self.ftype = sv['ftype']

        self.vizualize = sv['visualize']
        self.save_full = sv['save_full']
        self.save_weights = sv['save_weights']
        self.save_json = sv['save_json']
        self.save_plot = sv['save_plot']
        self.save_interval = sv['save_n']
        self.log_interval = sv['log_n']
        self.saves = sv['save_path']
        self.save_path = self.util.get_save_dir_struct(self.saves,
                                                       self.env_name)
        self.logs = sv['log_path']
        self.util.display_status('Hyperparameters Successfully Loaded')
        '''Reference/Excerpt:  keras-rl DQN Atari Example
        https://github.com/keras-rl/keras-rl/blob/master/examples/dqn_atari.py
        # Select a policy. 
        # We use eps-greedy action selection, which means that a random action
        # is selected with probability eps. We anneal eps from init to term over 
        # the course of (anneal) steps. This is done so that the agent initially 
        # explores the environment (high eps) and then gradually sticks to 
        # what it knows (low eps). We also set a dedicated eps value that is 
        # used during testing. Note that we set it to 0.05 so that the agent 
        # still performs some random actions. 
        # This ensures that the agent cannot get stuck.
        # '''
        self.custom_model_objects = {
            'S': self.window_length,
            'A': self.nb_actions,
            'H': self.H,
            'lr': self.lr,
            'name': self.name,
            'batch_size': self.batch_size,
            'sess': self.sess,
            #dueling_network=self.enable_dueling_network,
            #dueling_type=self.dueling_type,
        }

        with tf.device(gpu):
            self.policy = LinearAnnealedPolicy(
                inner_policy=EpsGreedyQPolicy(eps=self.value_max),
                attr='eps',
                value_max=self.value_max,
                value_min=self.value_min,
                value_test=self.alpha,
                nb_steps=self.anneal)
            self.test_policy = GreedyQPolicy()

            if mod['optimizer'].lower() == 'adamax':
                self.optimizer = Adamax(lr=self.lr)
            elif mod['optimizer'].lower() == 'adadelta':
                self.optimizer = Adadelta()
            elif mod['optimizer'].lower() == 'rmsprop':
                self.optimizer = RMSprop()
            elif mod['optimizer'].lower() == 'sgd':
                self.optimizer = SGD(
                    lr=self.lr,
                    momentum=self.momentum,
                    decay=self.decay,
                )
            else:
                self.optimizer = Adam(lr=self.lr)

        self.memory = buffer

        self.log_path = self.util.get_log_dir_struct(self.sess_id, self.logs,
                                                     self.ftype)

        self.util.display_status('Keras GPU Session {} Beginning'.format(
            self.sess_id))

        nn = NeuralNet(
            S=self.window_length,
            A=self.nb_actions,
            H=self.H,
            lr=self.lr,
            name=self.name,
            batch_size=self.batch_size,
            dueling_network=self.enable_dueling_network,
            dueling_type=self.dueling_type,
            sess=self.sess,
        )
        with tf.device(gpu):
            self.model = nn.get_model()

        self.util.display_status(
            '{} Keras Agent with {} Optimizer Built'.format(
                self.name, mod['optimizer']))
        '''---Compile the model with chosen optimizer
        loss is calculated with lamba function based on model
        type selections (dueling, or double dqn)'''
        with tf.device(gpu):
            self.compile(
                optimizer=self.optimizer,
                metrics=self.metrics,
            )

        self.util.display_status(
            '{} Agent Fully Initialized with Compiled Model'.format(self.name))

        super(BetaFlapDQN, self).__init__(
            model=self.model,
            nb_actions=self.nb_actions,
            memory=self.memory,
            policy=self.policy,
            test_policy=self.test_policy,
            enable_double_dqn=self.enable_double_dqn,
            enable_dueling_network=self.enable_dueling_network,
            dueling_type=self.dueling_type,
            **kwargs)

    def load_saved_model_weights(self):
        try:
            self.model.load_weights('saved/FlappyBird_weights.h5')
            self.util.display_status('Saved Keras Model Weights Loaded')
        except:
            self.util.display_status('No Saved Keras Model Weights Found')

    def fit(self, iteration=1, max_iteration=1):
        self.load_saved_model_weights()

        with tf.device(gpu):
            self.env = Environment(
                target_score=self.target,
                difficulty=self.mode,
                fps=self.fps,
                tick=self.tick,
            )
        self.util.display_status('{} Environment Emulation Initialized'.format(
            self.env_name))

        if self.action_repetition < 1:
            raise ValueError(
                'action_repetition must be >= 1, is {}'.\
                    format(self.action_repetition)
            )
        '''---Define Custom Callbacks and Processors for BetaFlap---'''
        FlappyCall = FlappySession()
        Flappy = FlappyProcessor()
        '''---Flag Agent as Training with on_train_begin()---'''
        self._on_train_begin()
        FlappyCall.on_train_begin()

        self.training = True
        observation = None
        reward = None
        done = False
        info = None
        status = 'play'
        episode = np.int16(0)
        self.step = np.int16(0)
        action = np.int16(0)
        self.randQ = np.int16(0)
        self.reward = np.float16(0)
        idx = np.int16(0)
        flap = False
        episode_reward = None
        episode_score = None
        episode_step = None
        did_abort = False
        '''---Begin stepping through Episodes---'''
        # continue while global step is < max session steps
        while self.step < self.nb_steps:
            gc.collect()
            if observation is None:  # new episode
                '''---Initialize Environment with No Action'''
                FlappyCall.on_episode_begin(episode)
                self.reset_states()  # reset all episode tracking parameters
                reward = None
                done = False
                info = {}
                action = None
                episode_step = np.int16(0)
                episode_score = np.int16(0)
                episode_reward = np.float32(0)

                wake = np.zeros([self.nb_actions])  # [0, 0]
                wake[0] = 1  # [1, 0] --> don't flap
                o, r, done, info = self.env.step(wake)  # progress env 1 frame
                observation, r = Flappy.process_step(o, r, done, info)
                assert observation is not None
                '''---Each episode, begin with n random actions/steps'''
                if self.nb_max_start_steps == 0:
                    self.nb_random_start_steps = 0
                else:
                    self.nb_random_start_steps = \
                    np.random.randint(self.nb_max_start_steps)
                '''---Perform random nb steps w/ rand action 
                      without adding them to experience replay memory'''
                for _ in range(self.nb_random_start_steps):
                    action = np.zeros([self.nb_actions])
                    randQ = rand.randrange(self.nb_actions)
                    action[randQ] = 1  # flag selected action
                    o, r, done, info = self.env.step(
                        action)  # progress env 1 frame
                    episode_step += 1
                    '''---Process output of randomized actions
                          without updating cumulative episode totals'''
                    observation = deepcopy(o)
                    observation, r = \
                        Flappy.process_step(observation, r, done, info)
                    if info['status'] == 'exit':
                        done = True
                        did_abort = True
                    if done: break
                # warmup period complete
                assert episode_reward is not None
                assert episode_step is not None
                assert observation is not None
                gc.collect()
            '''---Begin Iteratively Training Model Each Step
                * predict Q values / action (forward step)
                * use reward to improve the model (backward step)
            '''
            FlappyCall.on_step_begin(episode_step)
            '''---Predict Q Values Using Forward Method'''
            with tf.device(gpu):
                idx = self.forward(observation)
            action, flap = Flappy.process_action(idx, self.nb_actions)
            #episode_step += 1
            reward = np.float32(0)
            done = False
            for _ in range(self.action_repetition):
                o, r, d, i = self.env.step(action)
                observation = deepcopy(o)
                observation, r = Flappy.process_step(o, r, d, i)
                reward += r
                done = d
                info = i
                status = info['status']
                episode_step += 1
                if info['status'] == 'exit':
                    done = True
                    did_abort = True
                if done: break  # game over, end episode
            '''---Train the Model using Backward Method
            This function covers the bulk of the algorithm logic
                * store experience in memory
                * create experience batch, and predict Qs
                * train model on a single batch with the selected optimizer
                * enable/disable double DQN or dueling network
                * update model target values
                * discount future reward and return model metrics
            '''
            with tf.device(gpu):
                metrics = self.backward(reward, terminal=done)
            episode_reward += reward
            self.reward = episode_reward
            episode_score = info['score']
            '''---Log Step Data---'''
            step_log = {
                'step': episode_step,  # track episode step nb
                'episode': episode,
                'metrics': metrics,
                'flap': flap,
                'action': action,
                'reward': reward,
                'done': done,
                'training': self.training,
                'q_values': self.q_values,
                'info': info,
                'x': o,
                'x_t': observation,
            }
            FlappyCall.on_step_end(episode_step, step_log)
            gc.collect()

            #episode_step += 1
            self.step += 1

            if (self.step % self.save_interval) == 0 \
            or status == 'save':
                self.save_model()
            if status == 'exit':
                done = True
                did_abort = True
            if self.nb_max_episode_steps and \
                (episode_step >= self.nb_max_episode_steps - 1):
                done = True  # max episode steps hit
            # We are in a terminal state but the agent hasn't yet seen it.
            # perform one more forward-backward call and ignore the action
            if done:
                with tf.device(gpu):
                    self.forward(observation)
                    self.backward(0., terminal=False)
                episode_log = {
                    'sess_id': self.sess_id,
                    'episode': episode,
                    'reward': episode_reward,
                    'score': episode_score,
                    'steps': episode_step,  # number of steps in this episode
                    'gif': self.keep_gif_score,
                    'log_path': self.logs,
                    'iteration': iteration,
                }
                '''Episode Complete, Proceed to Next Iteration'''
                FlappyCall.on_episode_end(episode, episode_log)

                episode += 1
                observation = None
                episode_step = None
                episode_reward = None
                episode_score = None
                gc.collect()

                if episode > self.nb_episodes or did_abort:
                    done = True  # max episode hit
                    break
        '''---Training Session Complete---'''
        self.save_model()
        session_log = {
            'id': self.sess_id,
            'nb_steps': self.step,
            'did_abort': did_abort
        }
        FlappyCall.on_train_end(session_log, self.sess_id, self.log_path)
        self._on_train_end()  # end training session
        if iteration >= max_iteration or did_abort:
            self.env.close()
            return True

    def forward(self, observation):
        # Select an action
        state = self.memory.get_recent_state(observation)
        with tf.device(gpu):
            self.q_values = self.compute_q_values(state)

        if self.training:  # LinearAnneal Greedy Epsilon
            with tf.device(gpu):
                action = self.policy.select_action(q_values=self.q_values)
        else:  #  GreedyQ
            with tf.device(gpu):
                action = self.test_policy.select_action(q_values=self.q_values)
        # Book-keeping for experience replay
        self.recent_observation = observation
        self.recent_action = action
        return action

    def backward(self, reward, terminal):
        '''Store latest step in experience replay tuple'''
        if self.step % self.memory_interval == 0 or self.reward > .011:
            if self.reward > .011:
                self.util.display_status(
                    'Step {} Replay Experience Memory Saved'.format(self.step))
            with tf.device(cpu):
                self.memory.append(np.array(self.recent_observation),
                                   np.int16(self.recent_action),
                                   np.float32(reward),
                                   terminal,
                                   training=self.training)
        metrics = []
        if not self.training:
            return metrics
        '''Begin Training on Batches of Stored Experiences'''
        if self.step > self.nb_steps_warmup \
        and self.step % self.train_interval == 0:
            with tf.device(gpu):
                batch = self.memory.sample(self.batch_size)
                assert len(batch) == self.batch_size

            state0_batch, reward_batch, action_batch, terminal1_batch, \
                state1_batch = FlappyProcessor.process_state_batch(self, batch)

            assert reward_batch.shape == (self.batch_size, )
            assert terminal1_batch.shape == reward_batch.shape
            assert len(action_batch) == len(reward_batch)
            '''Compute the Q-Values for Mini-Batch of Samples
            "Deep Reinforcement Learning with Double Q-learning"
            (van Hasselt et al., 2015):
            Double DQN: 
                - online network predicts actions
                - target network estimates Q values.
            '''
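            # In symbols (mirrors the branch below):
            #   a*      = argmax_a Q_online(s_{t+1}, a)
            #   q_batch = Q_target(s_{t+1}, a*)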
            if self.enable_double_dqn:
                with tf.device(gpu):
                    q_values = self.model.predict_on_batch(state1_batch)
                assert q_values.shape == (self.batch_size, self.nb_actions)
                actions = np.argmax(q_values, axis=1)
                assert actions.shape == (self.batch_size, )
                # estimate Q values using the target network
                # select maxQ value with the online model (computed above)
                with tf.device(gpu):
                    target_q_values = \
                    self.target_model.predict_on_batch(state1_batch)

                assert target_q_values.shape == \
                    (self.batch_size, self.nb_actions)
                q_batch = target_q_values[range(self.batch_size), actions]
            # Compute the Q-values for state1 and take the max Q of each sample.
            # The prediction is done on the target_model as outlined in Mnih et al. (2015),
            # which makes the algorithm significantly more stable.
            else:
                with tf.device(gpu):
                    target_q_values = \
                    self.target_model.predict_on_batch(state1_batch)

                assert target_q_values.shape == \
                    (self.batch_size, self.nb_actions)
                q_batch = np.max(target_q_values, axis=1).flatten()
            assert q_batch.shape == (self.batch_size, )

            targets = np.zeros((self.batch_size, self.nb_actions))
            dummy_targets = np.zeros((self.batch_size, ))
            masks = np.zeros((self.batch_size, self.nb_actions))

            # Compute r_t + gamma * max_a Q(s_t+1, a)
            # update the affected output targets accordingly
            # Set discounted reward to zero for all states that were terminal
            discounted_reward_batch = self.gamma * q_batch
            discounted_reward_batch *= terminal1_batch
            assert discounted_reward_batch.shape == reward_batch.shape

            Rs = reward_batch + discounted_reward_batch
            for idx, (target, mask, R, action) in enumerate(
                    zip(targets, masks, Rs, action_batch)):
                target[action] = R  # update with estimated accumulated reward
                dummy_targets[idx] = R
                mask[action] = 1.  # enable loss for specific action
            targets = np.array(targets).astype('float32')
            masks = np.array(masks).astype('float32')
            '''Train Using Sample Experience Batch'''
            # perform a single update on the entire batch
            # use a dummy target, as the loss is computed in a custom Lambda layer
            # still useful to know the target to compute metrics properly
            if type(self.model.input) is not list:
                ins = [state0_batch]
            else:
                ins = state0_batch
            if self.validate:
                split = self.split
            else:
                split = 0

            with tf.device(gpu):
                metrics = self.trainable_model.train_on_batch(
                    ins + [targets, masks], [dummy_targets, targets])
                # THIS CAUSES A MEMORY LEAK IN CURRENT CONFIGURATION
                #metrics = self.trainable_model.fit(
                #    ins + [targets, masks],
                #    [dummy_targets, targets],
                #    batch_size=None,
                #    epochs=self.epochs,
                #    verbose=self.verbose,
                #    validation_split=split,
                #    shuffle=self.shuffle
                #)
                gc.collect()

            # throw away individual losses
            if type(metrics) is list:
                metrics = [m for idx, m in enumerate(metrics) if idx not in (1, 2)]
            else:
                metrics.history.update({'losses': self.policy.metrics})

        if self.target_model_update >= 1 \
        and self.step % self.target_model_update == 0:
            with tf.device(gpu):
                self.update_target_model_hard()
        return metrics

    def save_model(self):
        if self.save_full:
            '''---Save full model to single .h5 file---'''
            self.model.save(self.save_path + '_full.h5', overwrite=True)
            self.util.display_status('{} Model Saved to {}'.format(
                self.name, self.save_path + '_full.h5'))
        if self.save_weights:
            '''---Save model weights to separate .h5 file---'''
            self.model.save_weights(self.save_path + '_weights.h5',
                                    overwrite=True)
            self.util.display_status('{} Model Weights Saved to {}'.format(
                self.name, self.save_path + '_weights.h5'))
        if self.save_json:
            '''---Save model structure as JSON file---'''
            with open(self.save_path + '.json', 'a+') as f:
                # model.to_json() already returns a JSON string, so write it directly
                f.write(self.model.to_json())
            self.util.display_status('{} Model Structure Saved to {}'.format(
                self.name, self.save_path + '.json'))
        if self.save_plot:
            plot_model(self.model, to_file=self.save_path + '_flow.png')
            self.util.display_status(
                '{} Neural Network Diagram Saved to {}'.format(
                    self.name, self.save_path + '_flow.png'))
Example #10
def training_game():
    env = Environment(
        map_name="HallucinIce",
        visualize=True,
        game_steps_per_episode=150,
        agent_interface_format=features.AgentInterfaceFormat(
            feature_dimensions=features.Dimensions(screen=64, minimap=32)))

    input_shape = (_SIZE, _SIZE, 1)
    nb_actions = _SIZE * _SIZE  # Number of actions (one per screen cell)

    model = neural_network_model(input_shape, nb_actions)
    # memory: replay buffer of past transitions; window_length sets how many
    # subsequent observations are stacked into one network input
    memory = SequentialMemory(limit=5000, window_length=_WINDOW_LENGTH)

    processor = SC2Proc()

    ### Policy
    # The policy is the agent's behaviour function: how it picks actions.
    # LinearAnnealedPolicy is a wrapper that linearly anneals an attribute of an inner policy
    # (here eps) over nb_steps; it can wrap policies other than eps-greedy, e.g. annealing tau
    # of a BoltzmannQPolicy.
    # EpsGreedyQPolicy selects a uniformly random action with probability eps and the greedy
    # (max-Q) action otherwise.
    # BoltzmannQPolicy samples actions from a Boltzmann (softmax) distribution over the
    # Q-values, so actions with higher estimated value are chosen more often.

    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                                  attr="eps",
                                  value_max=1,
                                  value_min=0.7,
                                  value_test=.0,
                                  nb_steps=1e6)
    # policy = BoltzmannQPolicy(tau=1., clip=(-500, 500))  # Q-values clipped to the range [-500, 500]

    ### Agent
    # DQN combines Q-learning with a deep neural network as the Q-function approximator;
    # the update target comes from the Bellman equation.
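    # For reference, the tabular Q-learning update that DQN approximates is
    #   Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))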

    dqn = DQNAgent(model=model,
                   nb_actions=nb_actions,
                   memory=memory,
                   nb_steps_warmup=500,
                   target_model_update=1e-2,
                   policy=policy,
                   batch_size=150,
                   processor=processor)

    dqn.compile(Adam(lr=.001), metrics=["mae"])

    ## Save the parameters and upload them when needed

    name = "HallucinIce"
    w_file = "dqn_{}_weights.h5f".format(name)
    check_w_file = "train_w" + name + "_weights.h5f"

    if SAVE_MODEL:
        check_w_file = "train_w" + name + "_weights_{step}.h5f"

    log_file = "training_w_{}_log.json".format(name)
    callbacks = [ModelIntervalCheckpoint(check_w_file, interval=1000)]
    callbacks += [FileLogger(log_file, interval=100)]

    if LOAD_MODEL:
        dqn.load_weights(w_file)

    dqn.fit(env,
            callbacks=callbacks,
            nb_steps=1e7,
            action_repetition=2,
            log_interval=1e4,
            verbose=2)

    dqn.save_weights(w_file, overwrite=True)
    dqn.test(env, action_repetition=2, nb_episodes=30, visualize=False)
Example #11
def test1_dialogue_system():
    """
    Method for testing the GODialogueSys class for the movie booking data set
    """

    # the path to the act set and load it
    # TODO: change it with a relative path
    act_set_file_path = '/Users/vladimirilievski/Desktop/Vladimir/Master_Thesis_Swisscom/GitHub Repo/GO-Chatbots/resources/data/dia_acts.txt'
    act_set = util.text_to_dict(act_set_file_path)

    # the path to the slot set and load it
    # TODO: change it with a relative path
    slot_set_file_path = '/Users/vladimirilievski/Desktop/Vladimir/Master_Thesis_Swisscom/GitHub Repo/GO-Chatbots/resources/data/slot_set.txt'
    slot_set = util.text_to_dict(slot_set_file_path)

    # the path to the user goals and load it
    # TODO: change it with a relative path
    goal_set_file_path = '/Users/vladimirilievski/Desktop/Vladimir/Master_Thesis_Swisscom/GitHub Repo/GO-Chatbots/resources/data/user_goals_first_turn_template.part.movie.v1.p'
    goal_set = util.load_goal_set(goal_set_file_path)

    # the list of initial inform slots
    init_inform_slots = ['moviename']

    # the ultimate slot set
    ultimate_request_slot = 'ticket'

    kb_special_slots = ['numberofpeople']
    kb_filter_slots = ['ticket', 'numberofpeople', 'taskcomplete', 'closing']

    # feasible actions
    test_feasible_actions = test1_feasible_actions()

    # the agent memory
    agt_memory = SequentialMemory(limit=1000000, window_length=4)

    # the agent policy
    agt_policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1, value_test=.05,
                                      nb_steps=1000000)

    # testing policy
    agt_test_policy = None

    # all system params
    params = {}
    params[const.MAX_NB_TURNS] = 30
    params[
        const.KB_PATH_KEY] = '/Users/vladimirilievski/Desktop/Vladimir/Master_Thesis_Swisscom/GitHub Repo/GO-Chatbots/resources/data/movie_kb.1k.p'

    # Environment params
    params[const.SIMULATION_MODE_KEY] = const.SEMANTIC_FRAME_SIMULATION_MODE
    params[const.IS_TRAINING_KEY] = True
    params[const.USER_TYPE_KEY] = const.RULE_BASED_USER
    params[const.STATE_TRACKER_TYPE_KEY] = const.RULE_BASED_STATE_TRACKER
    params[const.SUCCESS_REWARD_KEY] = 2 * params[const.MAX_NB_TURNS]
    params[const.FAILURE_REWARD_KEY] = - params[const.MAX_NB_TURNS]
    params[const.PER_TURN_REWARD_KEY] = -1

    params[
        const.NLU_PATH_KEY] = "/Users/vladimirilievski/Desktop/Vladimir/Master_Thesis_Swisscom/GitHub Repo/GO-Chatbots/resources/models/nlu/lstm_[1468447442.91]_39_80_0.921.p"

    params[
        const.DIAACT_NL_PAIRS_PATH_KEY] = '/Users/vladimirilievski/Desktop/Vladimir/Master_Thesis_Swisscom/GitHub Repo/GO-Chatbots/resources/data/dia_act_nl_pairs.v6.json'
    params[
        const.NLG_PATH_KEY] = "/Users/vladimirilievski/Desktop/Vladimir/Master_Thesis_Swisscom/GitHub Repo/GO-Chatbots/resources/models/nlg/lstm_tanh_relu_[1468202263.38]_2_0.610.p"

    # Agent params
    params[const.AGENT_TYPE_KEY] = const.AGENT_TYPE_DQN
    params[const.GAMMA_KEY] = .99
    params[const.BATCH_SIZE_KEY] = 32
    params[const.NB_STEPS_WARMUP_KEY] = 1000
    params[const.TRAIN_INTERVAL_KEY] = 1
    params[const.MEMORY_INTERVAL_KEY] = 1
    params[const.TARGET_MODEL_UPDATE_KEY] = 10000
    params[const.ENABLE_DOUBLE_DQN_KEY] = True
    params[const.ENABLE_DUELING_NETWORK_KEY] = False
    params[const.DUELING_TYPE_KEY] = 'avg'
    params[const.HIDDEN_SIZE_KEY] = 80
    params[const.ACTIVATION_FUNCTION_KEY] = const.RELU

    # create the dialogue system
    dialogue_sys = GODialogSys(act_set=act_set, slot_set=slot_set, goal_set=goal_set,
                               init_inform_slots=init_inform_slots, ultimate_request_slot=ultimate_request_slot,
                               kb_special_slots=kb_special_slots, kb_filter_slots=kb_filter_slots,
                               agt_feasible_actions=test_feasible_actions, agt_memory=agt_memory, agt_policy=agt_policy,
                               agt_test_policy=agt_test_policy, params=params)
Example #12
# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=8000, window_length=1)
# memory = EpisodeParameterMemory(limit=500000, window_length=1)

# processor = AtariProcessor()

# Select a policy. We use eps-greedy action selection, which means that a random action is selected
# with probability eps. We anneal eps from 0.65 to 0.05 over the course of 1M steps. This is done so that
# the agent initially explores the environment (high eps) and then gradually sticks to what it knows
# (low eps). We also set a dedicated eps value that is used during testing. Note that we set it to 0.05
# so that the agent still performs some random actions. This ensures that the agent cannot get stuck.
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                              attr='eps',
                              value_max=0.65,
                              value_min=0.05,
                              value_test=.05,
                              nb_steps=1000000)
# policy = GreedyQPolicy()

# The trade-off between exploration and exploitation is difficult and an on-going research topic.
# If you want, you can experiment with the parameters or use a different policy. Another popular one
# is Boltzmann-style exploration:
# policy = BoltzmannQPolicy(tau=1.)
# Feel free to give it a try!

processor = AutolabProcessor(nb_inputs=1)

dqn = DQNAgent(model=model,
               nb_actions=nb_actions,
               policy=policy,
Example #13
def setup(difficulty_level='default', env_name = "AirSimEnv-v42"):
    #parser = argparse.ArgumentParser()
    #parser.add_argument('--mode', choices=['train', 'test'], default='train')
    #parser.add_argument('--env-name', type=str, default='AirSimEnv-v42')
    #parser.add_argument('--weights', type=str, default=None)
    #parser.add_argument('--difficulty-level', type=str, default="default") 
    #args = parser.parse_args()
    #args, unknown = parser.parse_known_args()
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.6
    config.gpu_options.allow_growth = True
    set_session(tf.Session(config=config))

    # Get the environment and extract the number of actions.
    #msgs.algo = "DQN"
    env = gym.make(env_name)
    env.init_again(eval("settings."+difficulty_level+"_range_dic"))
    env.airgym.unreal_reset()  # must reset so the env accommodates the changes
    time.sleep(5)

    np.random.seed(123)
    env.seed(123)
    nb_actions = env.action_space.n
    
    WINDOW_LENGTH = 1
    depth_shape = env.depth.shape
    vel_shape = env.velocity.shape
    dst_shape = env.position.shape

    # keras-rl adds an extra window dimension at axis=0 of our observations
    # (the window length), so we need to take it into account in the input shape
    img_kshape = (WINDOW_LENGTH,) + depth_shape
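    # Illustration only: if depth_shape were (84, 84), then with WINDOW_LENGTH = 1
    # img_kshape would be (1, 84, 84) and the network receives batches of shape
    # (batch_size, 1, 84, 84).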

    # Sequential model for convolutional layers applied to image
    image_model = Sequential()
    if(settings.policy=='deep'):
        image_model.add(Conv2D(128,(3, 3), strides=(3, 3), padding='valid', activation='relu', input_shape=img_kshape,
                           data_format="channels_first"))
        image_model.add(Conv2D(64, (3, 3), strides=(2, 2), padding='valid', activation='relu'))
        image_model.add(Conv2D(32, (3, 3), strides=(1, 1), padding='valid', activation='relu'))
        image_model.add(Conv2D(32, (1, 1), strides=(1, 1), padding='valid', activation='relu'))

        image_model.add(Flatten())

        # plot_model(image_model, to_file="model_conv_depth.png", show_shapes=True)
        # Input and output of the Sequential model
        image_input = Input(img_kshape)
        encoded_image = image_model(image_input)

        # Inputs and reshaped tensors for concatenate after with the image
        velocity_input = Input((1,) + vel_shape)
        distance_input = Input((1,) + dst_shape)

        vel = Reshape(vel_shape)(velocity_input)
        dst = Reshape(dst_shape)(distance_input)

        # Concatenation of image, velocity and distance values,
        # followed by a stack of dense layers (1024 -> 1024 -> 512 -> 128 -> 64)
        denses = concatenate([encoded_image, vel, dst])
        denses = Dense(1024, activation='relu')(denses)
        denses = Dense(1024, activation='relu')(denses)
        denses = Dense(512, activation='relu')(denses)
        denses = Dense(128, activation='relu')(denses)
        denses = Dense(64, activation='relu')(denses)

    else:
        image_model.add(Conv2D(32, (4, 4), strides=(4, 4), padding='valid', activation='relu', input_shape=img_kshape,
                               data_format="channels_first"))
        image_model.add(Conv2D(64, (3, 3), strides=(2, 2), padding='valid', activation='relu'))
        image_model.add(Conv2D(128, (2, 2), strides=(1, 1), padding='valid', activation='relu'))
        image_model.add(Conv2D(64, (1, 1), strides=(1, 1), padding='valid', activation='relu'))

        image_model.add(Flatten())

        # plot_model(image_model, to_file="model_conv_depth.png", show_shapes=True)
        # Input and output of the Sequential model
        image_input = Input(img_kshape)
        encoded_image = image_model(image_input)

        # Inputs and reshaped tensors for concatenate after with the image
        velocity_input = Input((1,) + vel_shape)
        distance_input = Input((1,) + dst_shape)

        vel = Reshape(vel_shape)(velocity_input)
        dst = Reshape(dst_shape)(distance_input)

        # Concatenation of image, velocity and distance values.
        # 3 dense layers of 256 units
        denses = concatenate([encoded_image, vel, dst])
        denses = Dense(256, activation='relu')(denses)
        denses = Dense(256, activation='relu')(denses)
        denses = Dense(256, activation='relu')(denses)

    # Last dense layer with nb_actions for the output
    predictions = Dense(nb_actions, kernel_initializer='zeros', activation='linear')(denses)
    model = Model(
        inputs=[image_input, velocity_input, distance_input],
        outputs=predictions
    )
    env.set_model(model)
    print(model.summary())
    # plot_model(model,to_file="model.png", show_shapes=True)
    #train = True
    #train_checkpoint = False

    # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
    # even the metrics!
    memory = SequentialMemory(limit=100000, window_length=WINDOW_LENGTH)  # reduced memory footprint
    processor = MultiInputProcessor(nb_inputs=3)

    # Select a policy. We use eps-greedy action selection, which means that a random action is selected
    # with probability eps. We anneal eps from 1.0 to 0.1 over the course of 100k steps. This is done so that
    # the agent initially explores the environment (high eps) and then gradually sticks to what it knows
    # (low eps). We also set a dedicated eps value for testing; here it is 0.0, so the agent acts
    # fully greedily during evaluation.
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1, value_test=0.0,
                                  nb_steps=100000)

    dqn = DQNAgent(model=model, processor=processor, nb_actions=nb_actions, memory=memory, nb_steps_warmup=settings.nb_steps_warmup,
                   enable_double_dqn=settings.double_dqn,
                   enable_dueling_network=False, dueling_type='avg',
                   target_model_update=1e-2, policy=policy, gamma=.99)


    
    dqn.compile(Adam(lr=0.00025), metrics=['mae'])

    # Load the check-point weights and start training from there
    return dqn, env
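
    # A minimal sketch of the checkpoint-resume step hinted at above (hedged:
    # `build_dqn_agent` stands in for whatever name this builder actually has,
    # and the file names are illustrative; only dqn/env come from this example):
    #
    #   dqn, env = build_dqn_agent(env_name, difficulty_level)
    #   if os.path.exists('dqn_checkpoint.h5f'):       # assumed checkpoint file
    #       dqn.load_weights('dqn_checkpoint.h5f')     # resume from a previous run
    #   dqn.fit(env, nb_steps=250000, log_interval=10000, verbose=2)
    #   dqn.save_weights('dqn_final.h5f', overwrite=True)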
Beispiel #14
0
# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=1000, window_length=WINDOW_LENGTH)
processor = AtariProcessor()

# Select a policy. We use eps-greedy action selection, which means that a random action is selected
# with probability eps. Here eps is annealed from 0.5 down to 0 over nb_steps (1000 by default, or a
# single step when pre-trained weights are supplied). This is done so that the agent initially
# explores the environment (high eps) and then gradually sticks to what it knows (low eps). The
# dedicated test-time eps is 0, so the agent acts greedily during evaluation.
nb_steps = 1000
if args.weights:
    nb_steps = 1
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                              attr='eps',
                              value_max=.5,
                              value_min=0,
                              value_test=0,
                              nb_steps=nb_steps)

# The trade-off between exploration and exploitation is difficult and an on-going research topic.
# If you want, you can experiment with the parameters or use a different policy. Another popular one
# is Boltzmann-style exploration:
# policy = BoltzmannQPolicy(tau=1.)
# Feel free to give it a try!

nb_steps_warmup = 100  # same warmup whether or not pre-trained weights are supplied

dqn = DQNAgent(model=model,
               nb_actions=nb_actions,
Beispiel #15
0
model.add(Dense(10))
model.add(Activation('tanh'))
model.add(Dense(10))
model.add(Activation('tanh'))
model.add(Dense(10))
model.add(Activation('tanh'))
model.add(Dense(3))
model.add(Activation('linear'))
print(model.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=1000000, window_length=MEMORY_WINDOW_LENGTH)
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                              attr='eps',
                              value_max=1.,
                              value_min=.1,
                              value_test=.05,
                              nb_steps=INTERVAL)
dqn = DQNAgent(model=model,
               nb_actions=nb_actions,
               policy=policy,
               memory=memory,
               nb_steps_warmup=INTERVAL,
               gamma=.99,
               target_model_update=INTERVAL,
               train_interval=4,
               delta_clip=1.)
dqn.compile(Adam(lr=.00025), metrics=['mae'])

# Okay, now it's time to learn something! We capture the interrupt exception so that training
# can be prematurely aborted. Notice that now you can use the built-in Keras callbacks!
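
# A minimal continuation sketch (hedged: `env` and `ENV_NAME` are assumed to be
# defined in the elided top of this example, and the file names are illustrative).
# keras-rl's fit() traps KeyboardInterrupt internally, so an aborted run still
# returns cleanly and the weights can be saved afterwards.
from rl.callbacks import FileLogger, ModelIntervalCheckpoint

weights_filename = 'dqn_{}_weights.h5f'.format(ENV_NAME)
callbacks = [
    # writes intermediate weights every INTERVAL steps
    ModelIntervalCheckpoint('dqn_' + ENV_NAME + '_weights_{step}.h5f', interval=INTERVAL),
    # appends training metrics to a JSON log
    FileLogger('dqn_{}_log.json'.format(ENV_NAME), interval=100),
]
dqn.fit(env, callbacks=callbacks, nb_steps=10 * INTERVAL, log_interval=INTERVAL)
dqn.save_weights(weights_filename, overwrite=True)
dqn.test(env, nb_episodes=5, visualize=False)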
Beispiel #16
0
    def __init__(self, inputs, buffer, sess_id, sess, **kwargs):
        self.util = Utility()
        self.sess = sess
        self.sess_id = sess_id

        game = inputs['game']
        agnt = inputs['agent']
        sess = agnt['session']
        eps = sess['episode']
        mod = inputs['model']
        trn = mod['training']
        sv = mod['save']
        mem = inputs['memory']
        '''---Environment Parameters---'''
        self.env_name = game['name']
        self.fps = game['fps']
        self.mode = game['difficulty']
        self.target = game['target']
        self.tick = game['tick']
        '''---Episode Parameters---'''
        self.nb_episodes = sess['max_ep']
        self.nb_max_episode_steps = game['fps'] * 60 * eps['max_time']
        self.nb_steps = self.nb_max_episode_steps * self.nb_episodes
        self.nb_steps_warmup = trn['warmup']
        self.nb_max_start_steps = trn['max_ep_observe']
        self.max_start_steps = trn['warmup']
        self.keep_gif_score = eps['keep_gif_score']
        '''---Agent / Model Parameters---'''
        self.name = agnt['name']
        self.nb_actions = agnt['action_size']
        self.delta_clip = agnt['delta_clip']

        self.training = trn['training']
        self.verbose = trn['verbose']
        self.lr = trn['learn_rate']
        self.eps = trn['initial_epsilon']
        self.value_max = trn['initial_epsilon']
        self.value_min = trn['terminal_epsilon']
        self.anneal = trn['anneal']
        self.shuffle = trn['shuffle']
        self.train_interval = trn['interval']
        self.validate = trn['validate']
        self.split = trn['split']
        self.action_repetition = trn['action_repetition']
        self.epochs = trn['epochs']
        self.epoch = 1

        prec = km.binary_precision()
        re = km.binary_recall()
        f1 = km.binary_f1_score()
        self.metrics = ['accuracy', 'mse', prec, re, f1]
        self.H = mod['filter_size']
        self.alpha = mod['alpha']
        self.gamma = mod['gamma']
        self.momentum = mod['momentum']
        self.decay = mod['decay']
        self.target_model_update = mod['target_update']
        self.type = mod['type']
        self.enable_double_dqn = mod['double_dqn']
        self.enable_dueling_network = mod['dueling_network']
        self.dueling_type = mod['dueling_type']

        self.limit = mem['limit']
        self.batch_size = mem['batch_size']
        self.window_length = mem['state_size']
        self.memory_interval = mem['interval']

        self.ftype = sv['ftype']

        self.vizualize = sv['visualize']
        self.save_full = sv['save_full']
        self.save_weights = sv['save_weights']
        self.save_json = sv['save_json']
        self.save_plot = sv['save_plot']
        self.save_interval = sv['save_n']
        self.log_interval = sv['log_n']
        self.saves = sv['save_path']
        self.save_path = self.util.get_save_dir_struct(self.saves,
                                                       self.env_name)
        self.logs = sv['log_path']
        self.util.display_status('Hyperparameters Successfully Loaded')
        '''Reference/Excerpt:  keras-rl DQN Atari Example
        https://github.com/keras-rl/keras-rl/blob/master/examples/dqn_atari.py
        # Select a policy. 
        # We use eps-greedy action selection, which means that a random action
        # is selected with probability eps. We anneal eps from init to term over 
        # the course of (anneal) steps. This is done so that the agent initially 
        # explores the environment (high eps) and then gradually sticks to 
        # what it knows (low eps). We also set a dedicated eps value that is 
        # used during testing. Note that we set it to 0.05 so that the agent 
        # still performs some random actions. 
        # This ensures that the agent cannot get stuck.
        # '''
        self.custom_model_objects = {
            'S': self.window_length,
            'A': self.nb_actions,
            'H': self.H,
            'lr': self.lr,
            'name': self.name,
            'batch_size': self.batch_size,
            'sess': self.sess,
            #dueling_network=self.enable_dueling_network,
            #dueling_type=self.dueling_type,
        }

        with tf.device(gpu):
            self.policy = LinearAnnealedPolicy(
                inner_policy=EpsGreedyQPolicy(eps=self.value_max),
                attr='eps',
                value_max=self.value_max,
                value_min=self.value_min,
                value_test=self.alpha,
                nb_steps=self.anneal)
            self.test_policy = GreedyQPolicy()

            if mod['optimizer'].lower() == 'adamax':
                self.optimizer = Adamax(lr=self.lr)
            elif mod['optimizer'].lower() == 'adadelta':
                self.optimizer = Adadelta()
            elif mod['optimizer'].lower() == 'rmsprop':
                self.optimizer = RMSprop()
            elif mod['optimizer'].lower() == 'sgd':
                self.optimizer = SGD(
                    lr=self.lr,
                    momentum=self.momentum,
                    decay=self.decay,
                )
            else:
                self.optimizer = Adam(lr=self.lr)

        self.memory = buffer

        self.log_path = self.util.get_log_dir_struct(self.sess_id, self.logs,
                                                     self.ftype)

        self.util.display_status('Keras GPU Session {} Beginning'.format(
            self.sess_id))

        nn = NeuralNet(
            S=self.window_length,
            A=self.nb_actions,
            H=self.H,
            lr=self.lr,
            name=self.name,
            batch_size=self.batch_size,
            dueling_network=self.enable_dueling_network,
            dueling_type=self.dueling_type,
            sess=self.sess,
        )
        with tf.device(gpu):
            self.model = nn.get_model()

        self.util.display_status(
            '{} Keras Agent with {} Optimizer Built'.format(
                self.name, mod['optimizer']))
        '''---Compile the model with the chosen optimizer.
        The loss is calculated with a lambda function based on the model
        type selections (dueling or double DQN).'''
        with tf.device(gpu):
            self.compile(
                optimizer=self.optimizer,
                metrics=self.metrics,
            )

        self.util.display_status(
            '{} Agent Fully Initialized with Compiled Model'.format(self.name))

        super(BetaFlapDQN, self).__init__(
            model=self.model,
            nb_actions=self.nb_actions,
            memory=self.memory,
            policy=self.policy,
            test_policy=self.test_policy,
            enable_double_dqn=self.enable_double_dqn,
            enable_dueling_network=self.enable_dueling_network,
            dueling_type=self.dueling_type,
            **kwargs)
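
    # A minimal usage sketch (hedged: the nested config dict, the TF session,
    # the run id and the Flappy Bird environment object are assumptions; only
    # the constructor signature above and the inherited DQNAgent fit() API are
    # taken from this example):
    #
    #   buffer = SequentialMemory(limit=mem_limit, window_length=state_size)
    #   agent = BetaFlapDQN(inputs=config, buffer=buffer, sess_id=run_id, sess=sess)
    #   agent.fit(env, nb_steps=agent.nb_steps,
    #             nb_max_episode_steps=agent.nb_max_episode_steps,
    #             action_repetition=agent.action_repetition,
    #             verbose=agent.verbose)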
Beispiel #17
0
def training_game():
    env = Environment(
        map_name="CollectMineralShards",
        visualize=True,
        game_steps_per_episode=150,
        agent_interface_format=features.AgentInterfaceFormat(
            feature_dimensions=features.Dimensions(screen=64, minimap=32)))

    input_shape = (_SIZE, _SIZE, 1)
    nb_actions = 12  # Number of actions

    model = neural_network_model(input_shape, nb_actions)
    memory = SequentialMemory(limit=5000, window_length=_WINDOW_LENGTH)

    processor = SC2Proc()

    # Policy

    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                                  attr="eps",
                                  value_max=1,
                                  value_min=0.2,
                                  value_test=.0,
                                  nb_steps=1e2)

    # Agent

    dqn = DQNAgent(
        model=model,
        nb_actions=nb_actions,
        memory=memory,
        enable_double_dqn=True,
        enable_dueling_network=True,
        # 2019-07-12 GU Zhan (Sam) when value shape problem, reduce nb_steps_warmup:
        #                   nb_steps_warmup=300, target_model_update=1e-2, policy=policy,
        nb_steps_warmup=500,
        target_model_update=1e-2,
        policy=policy,
        batch_size=150,
        processor=processor,
        delta_clip=1)

    dqn.compile(Adam(lr=.001), metrics=["mae", "acc"])

    # Tensorboard callback

    timestamp = f"{datetime.datetime.now():%Y-%m-%d %I:%M%p}"
    # 2019-07-12 GU Zhan (Sam) folder name for Lunux:
    #    callbacks = keras.callbacks.TensorBoard(log_dir='./Graph/'+ timestamp, histogram_freq=0,
    #                                write_graph=True, write_images=False)

    # 2019-07-12 GU Zhan (Sam) folder name for Windows:
    callbacks = keras.callbacks.TensorBoard(log_dir=r'.\Graph\issgz',
                                            histogram_freq=0,
                                            write_graph=True,
                                            write_images=False)

    # Save the parameters and upload them when needed

    name = "agent"
    w_file = "dqn_{}_weights.h5f".format(name)
    check_w_file = "train_w" + name + "_weights.h5f"

    if SAVE_MODEL:
        check_w_file = "train_w" + name + "_weights_{step}.h5f"

    log_file = "training_w_{}_log.json".format(name)

    if LOAD_MODEL:
        dqn.load_weights(w_file)

    class Saver(Callback):
        def on_episode_end(self, episode, logs={}):
            if episode % 200 == 0:
                self.model.save_weights(w_file, overwrite=True)

    s = Saver()
    logs = FileLogger('DQN_Agent_log.csv', interval=1)

    dqn.fit(env,
            callbacks=[callbacks, s, logs],
            nb_steps=600,
            action_repetition=2,
            log_interval=1e4,
            verbose=2)

    dqn.save_weights(w_file, overwrite=True)
    dqn.test(env, action_repetition=2, nb_episodes=30, visualize=False)
Beispiel #18
0
async def main():
    env_player = SimpleRLPlayer(server_configuration=ServerConfiguration(
        "localhost:8000", "https://play.pokemonshowdown.com/action.php?"), )

    #opponent = RandomPlayer(player_configuration=PlayerConfiguration("USCPokebot", "uscpokebot"),
    #server_configuration= ServerConfiguration("localhost:8000",
    #"https://play.pokemonshowdown.com/action.php?"),)
    #second_opponent = MaxDamagePlayer(battle_format="gen8randombattle")

    # Output dimension
    n_action = len(env_player.action_space)

    model = Sequential()
    model.add(Dense(128, activation="elu", input_shape=(1, 12)))

    # Our embedding has shape (1, 12), which affects the hidden layer
    # and output dimensions; flattening resolves potential issues
    # that would otherwise arise from the extra axis
    model.add(Flatten())
    model.add(Dense(128, activation="elu"))
    model.add(Dense(128, activation="elu"))
    model.add(Dense(64, activation="elu"))
    model.add(Dense(n_action, activation="linear"))

    memory = SequentialMemory(limit=10000, window_length=1)

    # Simple epsilon-greedy exploration policy
    policy = LinearAnnealedPolicy(
        EpsGreedyQPolicy(),
        attr="eps",
        value_max=1.0,
        value_min=0.05,
        value_test=0,
        nb_steps=10000,
    )
    loaded_model = tf.keras.models.load_model('model_30000')
    loaded_model.load_weights('weights_DQN_30000.h5')
    # Defining our DQN
    dqn = DQNAgent(
        model=loaded_model,
        nb_actions=len(env_player.action_space),
        policy=policy,
        memory=memory,
        nb_steps_warmup=1000,
        gamma=0.5,
        target_model_update=1,
        delta_clip=0.01,
        enable_double_dqn=True,
    )

    dqn.compile(Adam(lr=0.00025), metrics=["mae"])

    #model.load_weights('weights_DQN.h5')
    # Evaluation
    class EmbeddedRLPlayer(Player):
        def choose_move(self, battle):
            if np.random.rand() < 0.01:  # avoids infinite loops
                return self.choose_random_move(battle)
            embedding = SimpleRLPlayer.embed_battle(self, battle)
            action = dqn.forward(embedding)
            return SimpleRLPlayer._action_to_move(self, action, battle)

    #player_configuration=PlayerConfiguration("USCPokebot", "uscpokebot"),
    emb_player = EmbeddedRLPlayer(
        player_configuration=PlayerConfiguration("CSCI527Bot", "CSCI527Bot"),
        server_configuration=ServerConfiguration(
            "sim.smogon.com:8000",
            "https://play.pokemonshowdown.com/action.php?"),
    )
    await emb_player.ladder(50)
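

# A minimal entry-point sketch for the coroutine above (standard asyncio usage;
# not part of the original snippet, and harmless if asyncio is already imported):
import asyncio

if __name__ == "__main__":
    asyncio.get_event_loop().run_until_complete(main())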
Beispiel #19
0
else:
    plot_class = None

vqae = None
if args.use_vqae:
    # Initialize VQAE
    vqae = Autoencoder(plot_class=plot_class)

# Initialize processor
processor = AtariProcessor(autoencoder=vqae, plot_class=plot_class)

if args.agent_type == 'DDQN':
    # Setup exploration policy
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                                  attr='eps',
                                  value_max=opt.eps_value_max,
                                  value_min=opt.eps_value_min,
                                  value_test=opt.eps_value_test,
                                  nb_steps=opt.eps_decay_steps)
    if opt.use_quantized_observations:
        agent = TabularQAgent(num_states=opt.state_vector_length,
                              num_actions=env.action_space.n,
                              policy=policy,
                              test_policy=policy,
                              processor=processor)
    else:
        # Setup DQN agent
        if opt.recurrent:
            model = DRQN_Model(window_length=opt.dqn_window_length,
                               num_actions=env.action_space.n)
        else:
            model = DQN_Model(window_length=opt.dqn_window_length,
Beispiel #20
0
def main(model_name, options):

    # Initialize maze environments.
    env = gym.make('Pong-v0')
    # env = gym.make('CartPole-v0')
    #env = gym.make('Taxi-v2')

    envs = [env]

    # Setting hyperparameters.
    nb_actions = env.action_space.n
    maze_dim = (6400, 1)
    h_size = 64  # For DQN
    e_t_size = 64  #For MQN / RMQN
    context_size = 64
    nb_steps_warmup = int(1e5)
    nb_steps = int(4e15)
    buffer_size = 8e4
    learning_rate = 0.003
    target_model_update = 0.999
    clipnorm = 10.
    switch_rate = 50
    window_length = 12
    memory_size = None

    # Callbacks
    log = TrainEpisodeLogger()
    #tensorboard = TensorBoard(log_dir="./logs/{}".format(model_name))
    rl_tensorboard = RLTensorBoard(log_dir="./logs/{}".format(model_name),
                                   histogram_freq=100)

    callbacks = [log, rl_tensorboard]

    ### Models ###
    model = None
    target_model = None

    # MQN model.
    if "MQN" in options:
        memory_size = 12
        model = MQNmodel(e_t_size, context_size, memory_size, window_length,
                         nb_actions, maze_dim)
        target_model = MQNmodel(e_t_size, context_size, memory_size,
                                window_length, nb_actions, maze_dim)

    # RMQN model.
    if "RMQN" in options:
        memory_size = 12
        model = RMQNmodel(e_t_size, context_size, memory_size, window_length,
                          nb_actions, maze_dim)
        target_model = RMQNmodel(e_t_size, context_size, memory_size,
                                 window_length, nb_actions, maze_dim)

    # Distributional MQN model.
    nb_atoms = 51
    v_min = -2.
    v_max = 2.
    #model = DistributionalMQNModel(e_t_size, context_size, window_length, nb_actions, nb_atoms, obs_dimensions)
    #target_model = DistributionalMQNModel(e_t_size, context_size, window_length, nb_actions, nb_atoms, obs_dimensions)

    # DQN model
    if "DQN" in options:
        model = DQNmodel(nb_actions, window_length, h_size, maze_dim)
        target_model = DQNmodel(nb_actions, window_length, h_size, maze_dim)

    # Initialize our target model with the same weights as our model.
    target_model.set_weights(model.get_weights())

    # Initialize memory buffer for DQN algorithm.
    experience = [
        SequentialMemory(limit=int(buffer_size / len(envs)),
                         window_length=window_length) for i in range(len(envs))
    ]

    # Learning policy: the agent starts by acting fully at random (eps = 1.0) and the
    # exploration probability is linearly annealed down to 0.1 over nb_steps
    # (here 1e5) annealing steps.
    policy = LinearAnnealedPolicy(inner_policy=EpsGreedyQPolicy(),
                                  attr="eps",
                                  value_max=1.0,
                                  value_min=0.1,
                                  value_test=0.,
                                  nb_steps=1e5)

    # Optional processor.
    processor = PongProcessor()
    # processor = MazeProcessor()

    # Initialize and compile the DQN agent.

    dqn = DQNAgent(model=model,
                   target_model=target_model,
                   nb_actions=nb_actions,
                   memory=experience,
                   nb_steps_warmup=nb_steps_warmup,
                   target_model_update=target_model_update,
                   policy=policy,
                   processor=processor,
                   batch_size=8)

    #Initialize experimental Distributional DQN Agent
    '''
    dqn = DistributionalDQNAgent(
        model=model,
        target_model=target_model,
        num_atoms=nb_atoms,
        v_min=v_min,
        v_max=v_max,
        nb_actions=nb_actions,
        memory=experience,
        nb_steps_warmup=nb_steps_warmup,
        target_model_update=target_model_update,
        policy=policy,
        #processor=processor,
        batch_size=32
    )
    '''

    # Compile the agent to check for validity, build tensorflow graph, etc.
    dqn.compile(RMSprop(lr=learning_rate, clipnorm=clipnorm), metrics=["mae"])

    # Weights will be loaded if weight file exists.
    if os.path.exists("data/{}/{}".format(model_name, model_name + ".h5")):
        dqn.load_weights("data/{}/{}".format(model_name, model_name + ".h5"))

    # Train DQN in environment.
    if "train" in options:
        dqn.fit(env, nb_steps=nb_steps, verbose=0, callbacks=callbacks)

        # Visualization / Logging Tools
        logmetrics(log, model_name)
        logHyperparameters(model_name,
                           e_t_size=e_t_size,
                           context_size=context_size,
                           h_size=h_size,
                           memory_size=memory_size,
                           learning_rate=learning_rate,
                           target_model_update=target_model_update,
                           clipnorm=clipnorm,
                           window_length=window_length,
                           nb_atoms=nb_atoms,
                           v_min=v_min,
                           v_max=v_max)

        # Save weights.
        dqn.save_weights("data/{}/{}".format(model_name, model_name + ".h5"))

    # Test DQN in environment.
    if "test" in options:
        dqn.test(env, nb_episodes=100, visualize=True)

    #Debugging
    if "debug" in options:
        observation = env.reset()
        outputLayer(dqn.model, np.array(experience[0].sample(32)[0].state0))
        #visualizeLayer(dqn.model, dqn.layers[1], observation)

    return
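
# A minimal invocation sketch (hedged: argument parsing is elided from this
# snippet, so the values below are purely illustrative):
#
#   main("mqn_pong_run1", options=["MQN", "train"])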
Beispiel #21
0
    def __init__(self, *args, **kwargs):
        super(Tester, self).__init__(*args, **kwargs)
        self.agent_name = 'iqn'
        self.verbose = False

        if self.agent_name == 'iqn':
            self.nb_quantiles = 32
            self.model = NetworkMLPDistributional(
                nb_inputs=10,
                nb_outputs=4,
                nb_hidden_layers=2,
                nb_hidden_neurons=100,
                nb_quantiles=self.nb_quantiles,
                nb_cos_embeddings=64,
                duel=True,
                prior=False,
                activation='relu',
                duel_type='avg',
                window_length=1).model
            self.policy = LinearAnnealedPolicy(
                DistributionalEpsGreedyPolicy(eps=None),
                attr='eps',
                value_max=1.,
                value_min=0.1,
                value_test=.0,
                nb_steps=10000)
            self.test_policy = DistributionalEpsGreedyPolicy(eps=0)
            self.memory = SequentialMemory(limit=10000, window_length=1)
            self.agent = IQNAgent(model=self.model,
                                  policy=self.policy,
                                  test_policy=self.test_policy,
                                  enable_double_dqn=True,
                                  nb_samples_policy=self.nb_quantiles,
                                  nb_sampled_quantiles=self.nb_quantiles,
                                  cvar_eta=1,
                                  nb_actions=4,
                                  memory=self.memory,
                                  gamma=0.99,
                                  batch_size=48,
                                  nb_steps_warmup=1000,
                                  train_interval=1,
                                  memory_interval=1,
                                  target_model_update=1000,
                                  delta_clip=1)
        elif self.agent_name == 'dqn':
            self.model = NetworkMLP(nb_inputs=10,
                                    nb_outputs=4,
                                    nb_hidden_layers=2,
                                    nb_hidden_neurons=100,
                                    duel=True,
                                    prior=False,
                                    activation='relu',
                                    duel_type='avg',
                                    window_length=1).model
            self.policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                                               attr='eps',
                                               value_max=1.,
                                               value_min=0.1,
                                               value_test=.0,
                                               nb_steps=10000)
            self.test_policy = EpsGreedyQPolicy(eps=0)
            self.memory = SequentialMemory(limit=10000, window_length=1)
            self.agent = DQNAgent(model=self.model,
                                  policy=self.policy,
                                  test_policy=self.test_policy,
                                  enable_double_dqn=True,
                                  nb_actions=4,
                                  memory=self.memory,
                                  gamma=0.99,
                                  batch_size=48,
                                  nb_steps_warmup=1000,
                                  train_interval=1,
                                  memory_interval=1,
                                  target_model_update=1000,
                                  delta_clip=1)
Beispiel #22
0
    model.add(Activation('relu'))
    model.add(Flatten())
    for _ in range(args.num_layers):
        model.add(Dense(args.num_units))
        model.add(Activation('relu'))
    model.add(Dense(nb_actions * 2,
                    bias_initializer=custom_initializer))  # mean and SD
    model.add(Activation('linear'))
    print(model.summary())

    memory = SequentialMemory(limit=args.memory_size,
                              window_length=WINDOW_LENGTH)
    processor = AtariProcessor()
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                                  attr='eps',
                                  value_max=args.eps_max,
                                  value_min=args.eps_min,
                                  value_test=.05,
                                  nb_steps=1000000)
    test_policy = EpsGreedyQPolicy(eps=0.05)

    if bool(args.double_dqn):
        print("DOUBLE DQN")
    if bool(args.dueling):
        print("DUELING NETWORK")

    adfq = ADFQAgent(model=model,
                     nb_actions=nb_actions,
                     policy=policy,
                     test_policy=test_policy,
                     memory=memory,
                     processor=processor,
Beispiel #23
0
    n_action = len(env_player.action_space)

    model = Sequential()
    model.add(Dense(128, activation="elu", input_shape=(1, 10)))

    model.add(Flatten())
    model.add(Dense(64, activation="elu"))
    model.add(Dense(n_action, activation="linear"))

    memory = SequentialMemory(limit=10000, window_length=1)

    policy = LinearAnnealedPolicy(
        EpsGreedyQPolicy(),
        attr="eps",
        value_max=1.0,
        value_min=0.05,
        value_test=0,
        nb_steps=10000,
    )

    dqn = DQNAgent(
        model=model,
        nb_actions=len(env_player.action_space),
        policy=policy,
        memory=memory,
        nb_steps_warmup=1000,
        gamma=0.5,
        target_model_update=1,
        delta_clip=0.01,
        enable_double_dqn=True,
    )
Beispiel #24
0
env.seed(123)

# Next, we build a very simple model.
model = NETWORK(obs_shape, nb_actions)
print(model.summary())


memory = SequentialMemory(
  limit=MEMORY_SIZE, 
  window_length=1
)

policy = LinearAnnealedPolicy(
  EpsGreedyQPolicy(),
  attr='eps', 
  value_max=EPS_MAX,
  value_min=EPS_MIN,
  value_test=EPS_TEST,
  nb_steps=EPS_DECAY_STEPS
)

dqn = DQNAgent(
  model=model, 
  gamma=GAMMA,
  nb_actions=nb_actions, 
  memory=memory, 
  nb_steps_warmup=1000,
  target_model_update=TARGET_MODEL_UPDATE,
  policy=policy,
  test_policy=policy,
  enable_double_dqn=DOUBLE_DQN
)
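
# A minimal completion sketch (hedged: LEARNING_RATE and TRAIN_STEPS are assumed
# constants in the spirit of the uppercase settings above, and Adam is assumed to
# be imported in the elided top of this example):
dqn.compile(Adam(lr=LEARNING_RATE), metrics=['mae'])
dqn.fit(env, nb_steps=TRAIN_STEPS, log_interval=10000, verbose=2)
dqn.save_weights('dqn_weights.h5f', overwrite=True)
dqn.test(env, nb_episodes=10, visualize=False)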