Code example #1
def main(params=None):
    """
    performs training and evaluation of params
    :return: model
    """
    if params is None:
        params = {
            'name': 'default_run',        # assumed default; only used to name the saved model
            'model_type': 'dqn_agent',
            'l1_out': 128,
            'l2_out': 64,
            'gamma': 0.5,
            'target_model_update': 1,
            'delta_clip': 0.01,
            'nb_steps_warmup': 1000,
            # assumed defaults for the keys read further below
            'enable_double_dqn__': True,
            'dueling_type__': 'avg',
        }

    model_type = 'dqn_agent'  # unused; the code below reads params['model_type'] instead
    env_player = SimpleRLPlayer(battle_format="gen8randombattle")
    # print('env_player',env_player)
    # print('help', help(env_player))
    env_player2 = SimpleRLPlayer(battle_format="gen8randombattle")

    opponent = RandomPlayer(battle_format="gen8randombattle")
    second_opponent = MaxDamagePlayer(battle_format="gen8randombattle")

    # Output dimension
    n_action = len(env_player.action_space)

    # model_params = {
    #     'n_actions': n_action,
    #     'l1_out': 128,
    #     'l2_out': 64,
    #     'model_type': params['model_type']
    # }
    model_params = params
    model_params['n_actions'] = n_action

    model = get_model(model_params)

    # print('first model summary')
    # print(model.summary())
    # model = Sequential()
    # model.add(Dense(128, activation="elu", input_shape=(1, 10)))
    #
    # # Our embedding have shape (1, 10), which affects our hidden layer
    # # dimension and output dimension
    # # Flattening resolve potential issues that would arise otherwise
    # model.add(Flatten())
    # model.add(Dense(64, activation="elu"))
    # model.add(Dense(n_action, activation="linear"))

    # elu activation is similar to relu
    # https://ml-cheatsheet.readthedocs.io/en/latest/activation_functions.html#elu
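    # ELU: f(x) = x for x > 0 and alpha * (exp(x) - 1) otherwise, so unlike relu it
    # keeps a small, smooth gradient for negative inputs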

    # determine memory type
    if params['model_type'] in {'dqn_agent', 'sarsa_agent'}:
        # memory = SequentialMemory(limit=10000, window_length=1)
        memory = SequentialMemory(limit=NB_TRAINING_STEPS, window_length=1)
    else:
        memory = EpisodeParameterMemory(limit=10000, window_length=1)

    # Simple epsilon greedy
    # What is linear annealed policy?
    # - this policy gives gradually decreasing thresholds for the epsilon greedy policy
    # - it acts as a wrapper around epsilon greedy to feed in a custom threshold
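    # roughly: eps(step) = max(value_min, value_max - (value_max - value_min) * step / nb_steps)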
    pol_steps = NB_TRAINING_STEPS
    policy = LinearAnnealedPolicy(
        EpsGreedyQPolicy(),
        attr="eps",
        value_max=1.0,
        value_min=0.05,
        value_test=0,
        nb_steps=pol_steps,
    )
    # pol_steps = NB_TRAINING_STEPS
    policy_boltz = BoltzmannQPolicy(tau=1)
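    # Boltzmann exploration samples actions with probability proportional to exp(Q(s, a) / tau);
    # tau -> 0 approaches greedy selection, large tau approaches uniform random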
    # policy = LinearAnnealedPolicy(
    #     BoltzmannQPolicy(),
    #     attr="tau",
    #     value_max=1.0,
    #     value_min=0.05,
    #     value_test=0,
    #     nb_steps=pol_steps,
    # )
    policy = policy_boltz

    # Defining our DQN
    # model = tf.keras.models.load_model('dqn_v_dqn')

    if params['model_type'] == 'dqn_agent':
        dqn = DQNAgent(
            model=model,
            nb_actions=len(env_player.action_space),
            policy=policy,
            memory=memory,
            nb_steps_warmup=params['nb_steps_warmup'],
            gamma=params['gamma'],
            target_model_update=params['target_model_update'],
            # delta_clip=0.01,
            delta_clip=params['delta_clip'],
            enable_double_dqn=params['enable_double_dqn__'],
            enable_dueling_network=params['enable_double_dqn__'],  # note: reuses the double-DQN flag
            dueling_type=params['dueling_type__'])
        dqn.compile(Adam(lr=0.00025), metrics=["mae"])

    elif params['model_type'] == 'sarsa_agent':
        dqn = SARSAAgent(model=model,
                         nb_actions=len(env_player.action_space),
                         policy=policy,
                         nb_steps_warmup=params['nb_steps_warmup'],
                         gamma=params['gamma'],
                         delta_clip=params['delta_clip'])
        dqn.compile(Adam(lr=0.00025), metrics=["mae"])
    else:
        # CEMAgent
        # https://towardsdatascience.com/cross-entropy-method-for-reinforcement-learning-2b6de2a4f3a0
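        # CEM keeps a distribution over the network weights, samples candidate
        # parameter vectors, evaluates them, and refits the distribution to the
        # best-performing ("elite") fraction -- no gradient updates are involved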
        dqn = CEMAgent(model=model,
                       nb_actions=len(env_player.action_space),
                       memory=memory,
                       nb_steps_warmup=params['nb_steps_warmup'])
        # different compile function
        dqn.compile()

    # dqn.compile(Adam(lr=0.00025), metrics=["mae"])
    # opponent dqn (built and compiled here but never used in the training loop below)
    dqn_opponent = DQNAgent(
        model=model,
        nb_actions=len(env_player.action_space),
        policy=policy,
        memory=memory,
        nb_steps_warmup=params['nb_steps_warmup'],
        gamma=params['gamma'],
        target_model_update=params['target_model_update'],
        # delta_clip=0.01,
        delta_clip=params['delta_clip'],
        enable_double_dqn=params['enable_double_dqn__'],
        enable_dueling_network=params['enable_double_dqn__'],
        dueling_type=params['dueling_type__'])
    dqn_opponent.compile(Adam(lr=0.00025), metrics=["mae"])
    # NB_TRAINING_STEPS = NB_TRAINING_STEPS

    # rl_opponent = TrainedRLPlayer(model)
    # Training
    rounds = 4
    n_steps = NB_TRAINING_STEPS // rounds  # each round trains n_steps against both opponents, so ~2x NB_TRAINING_STEPS in total

    for k in range(rounds):
        env_player.play_against(
            env_algorithm=dqn_training,
            opponent=opponent,
            env_algorithm_kwargs={
                "dqn": dqn,
                "nb_steps": n_steps
            },
        )
        env_player.play_against(
            env_algorithm=dqn_training,
            opponent=second_opponent,
            env_algorithm_kwargs={
                "dqn": dqn,
                "nb_steps": n_steps
            },
        )

    name = params["name"] + "_model"
    model.save(name)

    # loaded_model = tf.keras.models.load_model(name)

    # Evaluation
    print("Results against random player:")
    env_player.play_against(
        env_algorithm=dqn_evaluation,
        opponent=opponent,
        env_algorithm_kwargs={
            "dqn": dqn,
            "nb_episodes": NB_EVALUATION_EPISODES
        },
    )

    print("\nResults against max player:")
    env_player.play_against(
        env_algorithm=dqn_evaluation,
        opponent=second_opponent,
        env_algorithm_kwargs={
            "dqn": dqn,
            "nb_episodes": NB_EVALUATION_EPISODES
        },
    )

    return model
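A hypothetical call with the hyperparameters overridden (the key names follow the defaults above; the values shown are illustrative placeholders, not tuned settings):

trained_model = main({
    'name': 'dqn_boltzmann_run',
    'model_type': 'dqn_agent',
    'l1_out': 128,
    'l2_out': 64,
    'gamma': 0.99,
    'target_model_update': 1000,
    'delta_clip': 1.0,
    'nb_steps_warmup': 1000,
    'enable_double_dqn__': True,
    'dueling_type__': 'avg',
})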
Code example #2
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=5000, window_length=1)
policy = BoltzmannQPolicy()
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10,
               target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Okay, now it's time to learn something! Visualization is left off here because it
# slows down training quite a lot. You can always safely abort the training prematurely
# using Ctrl + C.
dqn.fit(env, nb_steps=3000, verbose=2)

# After training is done, we save the final weights.
dqn.save_weights('dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
dqn.test(env, nb_episodes=5)
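If the weights saved above need to be restored in a later run, they can be loaded back into an identically constructed agent before evaluation (a minimal sketch using the keras-rl API; model, env and ENV_NAME are assumed to be built exactly as above):

dqn.load_weights('dqn_{}_weights.h5f'.format(ENV_NAME))
dqn.test(env, nb_episodes=5)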
Code example #3
def build_agent(model, actions):
    policy = BoltzmannQPolicy()
    memory = SequentialMemory(limit=50000, window_length=1)
    dqn = DQNAgent(model=model, memory=memory, policy=policy, 
                  nb_actions=actions, nb_steps_warmup=10, target_model_update=1e-2)
    return dqn
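A minimal usage sketch for build_agent, assuming a discrete-action Gym environment and a build_model helper defined elsewhere (the environment choice, build_model, and the hyperparameters are illustrative assumptions, not part of the original project):

import gym
from keras.optimizers import Adam

env = gym.make('CartPole-v1')
actions = env.action_space.n
model = build_model(env.observation_space.shape, actions)  # hypothetical helper

dqn = build_agent(model, actions)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])  # the agent must be compiled before fit/test
dqn.fit(env, nb_steps=50000, visualize=False, verbose=1)
dqn.test(env, nb_episodes=10, visualize=False)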
Code example #4
File: rl_agent.py  Project: PeterQuinn396/SC2-AI
def main(unused_argv):
    try:
        while True:
            with sc2_env.SC2Env(
                    map_name="MoveToBeacon",
                    players=[sc2_env.Agent(sc2_env.Race.terran)],
                    agent_interface_format=features.AgentInterfaceFormat(
                        feature_dimensions=features.Dimensions(screen=84,
                                                               minimap=64),
                        # default size of feature screen and feature minimap
                        use_feature_units=True),
                    # 16 gives roughly 150 apm (8 would give 300 apm);
                    # a larger number here makes the game run faster
                    step_mul=64,
                    game_steps_per_episode=0,
                    visualize=True) as env:
                # create a keras-rl env
                keras_env = PySC2ToKerasRL_env(env)
                obs = keras_env.reset()

                # create an agent that can interact with the keras-rl environment

                # Test Agent (makes marine run in circle)
                # keras_agent = MoveToBeacon_KerasRL()
                # keras_agent.reset()
                # while True: #play the game
                #
                #     step_actions = keras_agent.step(obs)
                #     obs, reward, done, info = keras_env.step(step_actions)

                # Replace simple agent with a learning one
                # A simple model (taken from Keras-RL cartpole dqn)
                nb_actions = keras_env.action_space.n
                model = Sequential()
                model.add(
                    Flatten(input_shape=(1, ) +
                            keras_env.observation_space.shape))
                model.add(Dense(16))
                model.add(Activation('relu'))
                model.add(Dense(16))
                model.add(Activation('relu'))
                model.add(Dense(16))
                model.add(Activation('relu'))
                model.add(Dense(nb_actions))
                model.add(Activation('linear'))
                print(model.summary())
                output_filename = "DQN_Rewards_smallerObs_smallerActions.csv"

                #some other model
                # model = Sequential()
                # model.add(Flatten(input_shape=(1,) + keras_env.observation_space.shape))
                # model.add(Dense(16))
                # model.add(Activation('relu'))
                # model.add(Dense(16))
                # model.add(Activation('relu'))
                # model.add(Dense(16))
                # model.add(Activation('relu'))
                # model.add(Dense(nb_actions))
                # model.add(Activation('linear'))
                # print(model.summary())
                #output_filename = "DQN Rewards.csv"

                # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
                # even the metrics!
                memory = SequentialMemory(limit=50000, window_length=1)
                policy = BoltzmannQPolicy()
                dqn = DQNAgent(model=model,
                               nb_actions=nb_actions,
                               memory=memory,
                               nb_steps_warmup=15,
                               target_model_update=1e-2,
                               policy=policy)
                dqn.compile(Adam(lr=1e-3), metrics=['mae'])

                # Okay, now it's time to learn something! (hopefully)

                hist = dqn.fit(keras_env,
                               nb_steps=50000,
                               visualize=False,
                               verbose=2)

                with open(output_filename, 'w+',
                          newline='') as csvfile:  #save the rewards over time
                    writer = csv.writer(csvfile)
                    writer.writerow(hist.history.get('episode_reward'))
                break  #kill the env

    except KeyboardInterrupt:
        pass
Code example #5
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))


# In[ ]:


model.summary()


# In[ ]:


memory = SequentialMemory(limit=2000, window_length=1)
policy = BoltzmannQPolicy(tau=1.)
#dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=100,
#               target_model_update=1e-2, policy=policy)
dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory,
               nb_steps_warmup=1000, gamma=.99, target_model_update=10000,
               train_interval=4, delta_clip=1.)
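# In keras-rl, target_model_update >= 1 means a hard copy of the online weights into the
# target network every that many training steps; a value < 1 (as in the commented-out
# agent above) is instead used as a soft/Polyak averaging rate.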
dqn.compile(Adam(lr=1e-3), metrics=['mae'])


# In[ ]:



ENV_NAME = "aftersubmissionv19"
weights_filename = 'dqn_{}_weights.h5f'.format(ENV_NAME)
checkpoint_weights_filename = 'dqn_' + ENV_NAME + '_weights_{step}.h5f'
Code example #6
def main(shape=10,
         winsize=4,
         test=False,
         num_max_test=200,
         visualize_training=False,
         start_steps=0,
         randseed=None,
         human_mode_sleep=0.02):
    INPUT_SHAPE = (shape, shape)
    WINDOW_LENGTH = winsize

    class SnakeProcessor(Processor):
        def process_observation(self, observation):
            # assert observation.ndim == 1, str(observation.shape)  # (height, width, channel)
            assert observation.shape == INPUT_SHAPE
            return observation.astype(
                'uint8')  # saves storage in experience memory

        def process_state_batch(self, batch):
            # We could perform this processing step in `process_observation`. In this case, however,
            # we would need to store a `float32` array instead, which is 4x more memory intensive than
            # an `uint8` array. This matters if we store 1M observations.
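            # e.g. with the default shape=10, a single 10 x 10 observation is 100 bytes
            # as uint8 but 400 bytes as float32, and that factor of 4 applies to every
            # entry in the 100k-entry replay memory below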
            processed_batch = batch.astype('float32') / 255.
            return processed_batch

        def process_reward(self, reward):
            return reward

    try:
        randseed = int(randseed)
        print(f"set seed to {randseed}")
    except Exception:
        print(f"failed to intify seed of {randseed}, making it None")
        randseed = None

    env = gym.make('snakenv-v0',
                   gs=shape,
                   seed=randseed,
                   human_mode_sleep=human_mode_sleep)
    np.random.seed(123)
    env.seed(123)

    input_shape = (WINDOW_LENGTH, ) + INPUT_SHAPE
    model = make_model(input_shape, 5)

    memory = SequentialMemory(limit=100000, window_length=WINDOW_LENGTH)
    processor = SnakeProcessor()

    start_policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                                        attr='eps',
                                        value_max=0,
                                        value_min=0,
                                        value_test=0,
                                        nb_steps=500000)
    policy = BoltzmannQPolicy(tau=0.25)  # used below; start_policy above is defined but never passed to the agent

    interval = 20000

    dqn = DQNAgent(model=model,
                   nb_actions=5,
                   policy=policy,
                   memory=memory,
                   processor=processor,
                   nb_steps_warmup=2000,
                   gamma=.99,
                   target_model_update=interval,
                   train_interval=4,
                   delta_clip=1.)

    dqn.compile(Adam(), metrics=['mae'])
    weights_filename = 'dqn_snake_weights.h5f'

    if not test:
        if os.path.exists('starting_weights.h5'):
            print('loadin!')
            model.load_weights('starting_weights.h5')
        # Okay, now it's time to learn something! We capture the interrupt exception so that training
        # can be prematurely aborted. Notice that now you can use the built-in Keras callbacks!
        weights_filename = 'dqn_{}_weights.h5f'.format('snake')
        checkpoint_weights_filename = 'dqn_' + 'snake' + '_weights_{step}.h5f'
        log_filename = 'dqn_{}_log.json'.format('snake')
        callbacks = [
            ModelIntervalCheckpoint(checkpoint_weights_filename,
                                    interval=interval)
        ]
        callbacks += [
            ModelIntervalCheckpoint(weights_filename, interval=interval)
        ]
        callbacks += [FileLogger(log_filename, interval=500)]
        callbacks += [WandbLogger(project="snake-rl")]
        dqn.fit(env,
                callbacks=callbacks,
                nb_steps=10000000,
                log_interval=10000,
                visualize=visualize_training,
                nb_max_start_steps=start_steps)

        # After training is done, we save the final weights one more time.
        # dqn.save_weights(weights_filename, overwrite=True)

        # Finally, evaluate our algorithm for 10 episodes.
        # dqn.test(env, nb_episodes=10, visualize=True, nb_max_episode_steps=100)
    else:
        while True:
            try:
                dqn.load_weights(weights_filename)
            except Exception:
                print("weights not found, waiting")
                time.sleep(3)
                continue  # retry loading rather than testing with untrained weights
            dqn.test(env,
                     nb_episodes=10,
                     visualize=visualize_training,
                     nb_max_episode_steps=num_max_test)
            time.sleep(3)
Code example #7
def main():
    """Create environment, build models, train."""
    #env = MarketEnv(("ES", "FUT", "GLOBEX", "USD"), obs_xform=xform.Basic(30, 4), episode_steps=STEPS_PER_EPISODE, client_id=3)
    #env = MarketEnv(("EUR", "CASH", "IDEALPRO", "USD"), max_quantity=20000, quantity_increment=20000, obs_xform=xform.Basic(30, 4), episode_steps=STEPS_PER_EPISODE, client_id=5, afterhours=False)
    env = gym.make('trading-v0').env
    env.initialise(symbol='000001',
                   start='2012-01-01',
                   end='2017-01-01',
                   days=252)
    nb_actions = env.action_space.n
    obs_size = np.product(env.observation_space.shape)

    # # Actor model
    # dropout = 0.1
    # actor = Sequential([
    #     Flatten(input_shape=(1,) + env.observation_space.shape),
    #     BatchNormalization(),
    #     Dense(obs_size, activation='relu'),
    #     GaussianDropout(dropout),
    #     BatchNormalization(),
    #     Dense(obs_size, activation='relu'),
    #     GaussianDropout(dropout),
    #     BatchNormalization(),
    #     Dense(obs_size, activation='relu'),
    #     GaussianDropout(dropout),
    #     BatchNormalization(),
    #     Dense(1, activation='tanh'),
    # ])
    # print('Actor model')
    # actor.summary()

    # action_input = Input(shape=(1,), name='action_input')
    # observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input')
    # flattened_observation = Flatten()(observation_input)
    # x = concatenate([action_input, flattened_observation])
    # x = BatchNormalization()(x)
    # x = Dense(obs_size + 1, activation='relu')(x)
    # x = GaussianDropout(dropout)(x)
    # x = Dense(obs_size + 1, activation='relu')(x)
    # x = GaussianDropout(dropout)(x)
    # x = Dense(obs_size + 1, activation='relu')(x)
    # x = GaussianDropout(dropout)(x)
    # x = Dense(obs_size + 1, activation='relu')(x)
    # x = GaussianDropout(dropout)(x)
    # x = Dense(1, activation='linear')(x)
    # critic = Model(inputs=[action_input, observation_input], outputs=x)
    # print('\nCritic Model')
    # critic.summary()

    from keras.models import Sequential
    from keras.layers import Dense, Activation, Flatten
    from keras.optimizers import Adam

    model = Sequential()
    model.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
    model.add(Dense(160))
    model.add(Activation('relu'))
    model.add(Dense(160))
    model.add(Activation('relu'))
    model.add(Dense(160))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))
    print(model.summary())

    memory = SequentialMemory(limit=EPISODES * STEPS_PER_EPISODE,
                              window_length=1)
    random_process = OrnsteinUhlenbeckProcess(theta=.5, mu=0., sigma=.5)
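    # Note: random_process is left over from the commented-out actor/critic setup above;
    # DQNAgent does not take a random_process argument, so it is unused in this script.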
    #agent = DQNAgent(nb_actions=1, actor=actor, critic=critic, critic_action_input=action_input, memory=memory, nb_steps_warmup_critic=STEPS_PER_EPISODE * WARMUP_EPISODES, nb_steps_warmup_actor=STEPS_PER_EPISODE * WARMUP_EPISODES, random_process=random_process, gamma=0.95, target_model_update=0.01)
    from rl.policy import BoltzmannQPolicy
    policy = BoltzmannQPolicy()

    agent = DQNAgent(model=model,
                     nb_actions=nb_actions,
                     memory=memory,
                     nb_steps_warmup=10,
                     target_model_update=1e-2,
                     policy=policy)

    agent.compile(Adam(lr=1e-3), metrics=['mae'])
    #weights_filename = 'ddpg_{}_weights.h5f'.format(env.instrument.symbol)
    try:
        #agent.load_weights(weights_filename)
        #print('Using weights from {}'.format(weights_filename))     # DDPGAgent actually uses two separate files for actor and critic derived from this filename
        pass
    except IOError:
        pass
    agent.fit(env,
              nb_steps=EPISODES * STEPS_PER_EPISODE,
              visualize=False,
              verbose=2)
    #agent.save_weights(weights_filename, overwrite=True)
    agent.test(env,
               nb_episodes=5,
               visualize=True,
               nb_max_episode_steps=STEPS_PER_EPISODE)