def main(params=None):
    """Performs training and evaluation with the given hyperparameters.

    :return: the trained model
    """
    if params is None:
        params = {
            'model_type': 'dqn_agent',
            'l1_out': 128,
            'l2_out': 64,
            'gamma': 0.5,
            'target_model_update': 1,
            'delta_clip': 0.01,
            'nb_steps_warmup': 1000,
            # Keys read further down in this function; these default values are assumptions.
            'enable_double_dqn__': True,
            'dueling_type__': 'avg',
            'name': 'dqn_agent',
        }

    env_player = SimpleRLPlayer(battle_format="gen8randombattle")
    env_player2 = SimpleRLPlayer(battle_format="gen8randombattle")

    opponent = RandomPlayer(battle_format="gen8randombattle")
    second_opponent = MaxDamagePlayer(battle_format="gen8randombattle")

    # Output dimension
    n_action = len(env_player.action_space)

    model_params = params
    model_params['n_actions'] = n_action
    model = get_model(model_params)

    # A hand-built alternative to get_model(), kept for reference:
    # model = Sequential()
    # model.add(Dense(128, activation="elu", input_shape=(1, 10)))
    # # Our embedding has shape (1, 10), which affects the hidden layer and
    # # output dimensions; Flatten resolves issues that would arise otherwise.
    # model.add(Flatten())
    # model.add(Dense(64, activation="elu"))
    # model.add(Dense(n_action, activation="linear"))
    #
    # elu activation is similar to relu:
    # https://ml-cheatsheet.readthedocs.io/en/latest/activation_functions.html#elu

    # Determine the memory type: DQN and SARSA use step-based replay,
    # while CEM uses an episodic memory.
    if params['model_type'] in {'dqn_agent', 'sarsa_agent'}:
        memory = SequentialMemory(limit=NB_TRAINING_STEPS, window_length=1)
    else:
        memory = EpisodeParameterMemory(limit=10000, window_length=1)

    # Simple epsilon-greedy exploration.
    # LinearAnnealedPolicy wraps EpsGreedyQPolicy and gradually decreases the
    # epsilon threshold from value_max to value_min over nb_steps.
    pol_steps = NB_TRAINING_STEPS
    policy = LinearAnnealedPolicy(
        EpsGreedyQPolicy(),
        attr="eps",
        value_max=1.0,
        value_min=0.05,
        value_test=0,
        nb_steps=pol_steps,
    )

    # Boltzmann exploration replaces the annealed epsilon-greedy policy above.
    # (The tau attribute of BoltzmannQPolicy could also be annealed with LinearAnnealedPolicy.)
    policy_boltz = BoltzmannQPolicy(tau=1)
    policy = policy_boltz

    # Defining our agent
    # model = tf.keras.models.load_model('dqn_v_dqn')
    if params['model_type'] == 'dqn_agent':
        dqn = DQNAgent(
            model=model,
            nb_actions=len(env_player.action_space),
            policy=policy,
            memory=memory,
            nb_steps_warmup=params['nb_steps_warmup'],
            gamma=params['gamma'],
            target_model_update=params['target_model_update'],
            delta_clip=params['delta_clip'],
            enable_double_dqn=params['enable_double_dqn__'],
            # Note: the dueling network is toggled by the same flag as double DQN.
            enable_dueling_network=params['enable_double_dqn__'],
            dueling_type=params['dueling_type__'])
        dqn.compile(Adam(lr=0.00025), metrics=["mae"])
    elif params['model_type'] == 'sarsa_agent':
        dqn = SARSAAgent(model=model,
                         nb_actions=len(env_player.action_space),
                         policy=policy,
                         nb_steps_warmup=params['nb_steps_warmup'],
                         gamma=params['gamma'],
                         delta_clip=params['delta_clip'])
        dqn.compile(Adam(lr=0.00025), metrics=["mae"])
    else:
        # CEMAgent:
        # https://towardsdatascience.com/cross-entropy-method-for-reinforcement-learning-2b6de2a4f3a0
        dqn = CEMAgent(model=model,
                       nb_actions=len(env_player.action_space),
                       memory=memory,
                       nb_steps_warmup=params['nb_steps_warmup'])
        # CEMAgent has a different compile signature (no optimizer or metrics).
        dqn.compile()

    # Opponent DQN (shares the model, policy and memory; not used below).
    dqn_opponent = DQNAgent(
        model=model,
        nb_actions=len(env_player.action_space),
        policy=policy,
        memory=memory,
        nb_steps_warmup=params['nb_steps_warmup'],
        gamma=params['gamma'],
        target_model_update=params['target_model_update'],
        delta_clip=params['delta_clip'],
        enable_double_dqn=params['enable_double_dqn__'],
        enable_dueling_network=params['enable_double_dqn__'],
        dueling_type=params['dueling_type__'])
    dqn_opponent.compile(Adam(lr=0.00025), metrics=["mae"])
    # rl_opponent = TrainedRLPlayer(model)

    # Training: alternate between the random and max-damage opponents.
    rounds = 4
    n_steps = NB_TRAINING_STEPS // rounds
    for k in range(rounds):
        env_player.play_against(
            env_algorithm=dqn_training,
            opponent=opponent,
            env_algorithm_kwargs={"dqn": dqn, "nb_steps": n_steps},
        )
        env_player.play_against(
            env_algorithm=dqn_training,
            opponent=second_opponent,
            env_algorithm_kwargs={"dqn": dqn, "nb_steps": n_steps},
        )

    name = params["name"] + "_model"
    model.save(name)
    # loaded_model = tf.keras.models.load_model(name)

    # Evaluation
    print("Results against random player:")
    env_player.play_against(
        env_algorithm=dqn_evaluation,
        opponent=opponent,
        env_algorithm_kwargs={"dqn": dqn, "nb_episodes": NB_EVALUATION_EPISODES},
    )

    print("\nResults against max player:")
    env_player.play_against(
        env_algorithm=dqn_evaluation,
        opponent=second_opponent,
        env_algorithm_kwargs={"dqn": dqn, "nb_episodes": NB_EVALUATION_EPISODES},
    )

    return model
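# Usage sketch: main() can be called with a full parameter dict. The key names below
# match those read inside main(); the values are illustrative assumptions only.
if __name__ == "__main__":
    trained_model = main({
        'name': 'double_dueling_dqn',
        'model_type': 'dqn_agent',
        'l1_out': 128,
        'l2_out': 64,
        'gamma': 0.5,
        'target_model_update': 1,
        'delta_clip': 0.01,
        'nb_steps_warmup': 1000,
        'enable_double_dqn__': True,
        'dueling_type__': 'avg',
    })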
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras
# optimizer and even the metrics!
memory = SequentialMemory(limit=5000, window_length=1)
policy = BoltzmannQPolicy()
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10,
               target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Okay, now it's time to learn something! Passing visualize=True would show the
# environment during training, but it slows training down quite a lot. You can
# always safely abort the training prematurely using Ctrl + C.
dqn.fit(env, nb_steps=3000, verbose=2)

# After training is done, we save the final weights.
dqn.save_weights('dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
dqn.test(env, nb_episodes=5)
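# Setup assumed by the snippet above: `env`, `nb_actions`, and `ENV_NAME` must be
# defined beforehand. A minimal sketch, assuming a standard Gym control task
# (the environment name is an assumption, not taken from the source):
import gym
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam
from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

ENV_NAME = 'CartPole-v0'
env = gym.make(ENV_NAME)
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n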
def build_agent(model, actions):
    """Wrap a Keras model in a DQN agent with Boltzmann exploration."""
    policy = BoltzmannQPolicy()
    memory = SequentialMemory(limit=50000, window_length=1)
    dqn = DQNAgent(model=model, memory=memory, policy=policy,
                   nb_actions=actions, nb_steps_warmup=10, target_model_update=1e-2)
    return dqn
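# Example usage of build_agent, assuming a Gym environment; `build_model` is a
# hypothetical helper for constructing the Keras network and is not defined here.
env = gym.make('CartPole-v0')                               # illustrative environment
actions = env.action_space.n
model = build_model(env.observation_space.shape, actions)  # hypothetical helper
dqn = build_agent(model, actions)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])
dqn.fit(env, nb_steps=50000, visualize=False, verbose=1)
scores = dqn.test(env, nb_episodes=10, visualize=False)
print(np.mean(scores.history['episode_reward']))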
def main(unused_argv):
    try:
        while True:
            with sc2_env.SC2Env(
                    map_name="MoveToBeacon",
                    players=[sc2_env.Agent(sc2_env.Race.terran)],
                    agent_interface_format=features.AgentInterfaceFormat(
                        # Default sizes of the feature screen and feature minimap.
                        feature_dimensions=features.Dimensions(screen=84, minimap=64),
                        use_feature_units=True),
                    # step_mul=16 gives roughly 150 APM (8 would give 300 APM);
                    # a larger value here makes the game run faster.
                    step_mul=64,
                    game_steps_per_episode=0,
                    visualize=True) as env:

                # Wrap the PySC2 environment so keras-rl can interact with it.
                keras_env = PySC2ToKerasRL_env(env)
                obs = keras_env.reset()

                # A scripted test agent (MoveToBeacon_KerasRL, which makes the marine
                # run in a circle) could be used instead of the learning agent:
                # keras_agent = MoveToBeacon_KerasRL()
                # keras_agent.reset()
                # while True:  # play the game
                #     step_actions = keras_agent.step(obs)
                #     obs, reward, done, info = keras_env.step(step_actions)

                # Replace the simple agent with a learning one.
                # A simple model (taken from the keras-rl CartPole DQN example).
                nb_actions = keras_env.action_space.n
                model = Sequential()
                model.add(Flatten(input_shape=(1,) + keras_env.observation_space.shape))
                model.add(Dense(16))
                model.add(Activation('relu'))
                model.add(Dense(16))
                model.add(Activation('relu'))
                model.add(Dense(16))
                model.add(Activation('relu'))
                model.add(Dense(nb_actions))
                model.add(Activation('linear'))
                print(model.summary())

                output_filename = "DQN_Rewards_smallerObs_smallerActions.csv"

                # Configure and compile our agent. You can use every built-in Keras
                # optimizer and even the metrics!
                memory = SequentialMemory(limit=50000, window_length=1)
                policy = BoltzmannQPolicy()
                dqn = DQNAgent(model=model,
                               nb_actions=nb_actions,
                               memory=memory,
                               nb_steps_warmup=15,
                               target_model_update=1e-2,
                               policy=policy)
                dqn.compile(Adam(lr=1e-3), metrics=['mae'])

                # Okay, now it's time to learn something! (hopefully)
                hist = dqn.fit(keras_env, nb_steps=50000, visualize=False, verbose=2)

                # Save the episode rewards over time.
                with open(output_filename, 'w+', newline='') as csvfile:
                    writer = csv.writer(csvfile)
                    writer.writerow(hist.history.get('episode_reward'))

                break  # kill the env
    except KeyboardInterrupt:
        pass
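# Typical entry point for a PySC2 script: absl's app.run parses the SC2 flags
# before handing control to main (assumed here, not shown in the source).
from absl import app

if __name__ == "__main__":
    app.run(main)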
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))


# In[ ]:

model.summary()


# In[ ]:

memory = SequentialMemory(limit=2000, window_length=1)
policy = BoltzmannQPolicy(tau=1.)

# dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=100,
#                target_model_update=1e-2, policy=policy)
dqn = DQNAgent(model=model,
               nb_actions=nb_actions,
               policy=policy,
               memory=memory,
               nb_steps_warmup=1000,
               gamma=.99,
               target_model_update=10000,
               train_interval=4,
               delta_clip=1.)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])


# In[ ]:

ENV_NAME = "aftersubmissionv19"
weights_filename = 'dqn_{}_weights.h5f'.format(ENV_NAME)
checkpoint_weights_filename = 'dqn_' + ENV_NAME + '_weights_{step}.h5f'
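# The checkpoint and weight filenames above are usually wired into keras-rl
# callbacks when fitting; a sketch of the common pattern (the environment object
# and the step counts are assumptions):
from rl.callbacks import ModelIntervalCheckpoint, FileLogger

log_filename = 'dqn_{}_log.json'.format(ENV_NAME)
callbacks = [ModelIntervalCheckpoint(checkpoint_weights_filename, interval=250000)]
callbacks += [FileLogger(log_filename, interval=100)]

dqn.fit(env, callbacks=callbacks, nb_steps=1750000, log_interval=10000)
dqn.save_weights(weights_filename, overwrite=True)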
def main(shape=10,
         winsize=4,
         test=False,
         num_max_test=200,
         visualize_training=False,
         start_steps=0,
         randseed=None,
         human_mode_sleep=0.02):
    INPUT_SHAPE = (shape, shape)
    WINDOW_LENGTH = winsize

    class SnakeProcessor(Processor):
        def process_observation(self, observation):
            # A (height, width) observation from the snake environment.
            assert observation.shape == INPUT_SHAPE
            # Storing uint8 saves space in the experience memory.
            return observation.astype('uint8')

        def process_state_batch(self, batch):
            # We could perform this processing step in `process_observation`. In this case,
            # however, we would need to store a `float32` array instead, which is 4x more
            # memory intensive than a `uint8` array. This matters if we store 1M observations.
            processed_batch = batch.astype('float32') / 255.
            return processed_batch

        def process_reward(self, reward):
            return reward

    try:
        randseed = int(randseed)
        print(f"set seed to {randseed}")
    except Exception:
        print(f"failed to convert seed {randseed} to an int, making it None")
        randseed = None

    env = gym.make('snakenv-v0',
                   gs=shape,
                   seed=randseed,
                   human_mode_sleep=human_mode_sleep)
    np.random.seed(123)
    env.seed(123)

    input_shape = (WINDOW_LENGTH,) + INPUT_SHAPE
    model = make_model(input_shape, 5)

    memory = SequentialMemory(limit=100000, window_length=WINDOW_LENGTH)
    processor = SnakeProcessor()

    # An annealed epsilon-greedy policy is kept here for reference, but the
    # agent below uses Boltzmann exploration.
    start_policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                                        attr='eps',
                                        value_max=0,
                                        value_min=0,
                                        value_test=0,
                                        nb_steps=500000)
    policy = BoltzmannQPolicy(tau=0.25)

    interval = 20000
    dqn = DQNAgent(model=model,
                   nb_actions=5,
                   policy=policy,
                   memory=memory,
                   processor=processor,
                   nb_steps_warmup=2000,
                   gamma=.99,
                   target_model_update=interval,
                   train_interval=4,
                   delta_clip=1.)
    dqn.compile(Adam(), metrics=['mae'])

    weights_filename = 'dqn_snake_weights.h5f'

    if not test:
        if os.path.exists('starting_weights.h5'):
            print('loading starting weights')
            model.load_weights('starting_weights.h5')

        # Okay, now it's time to learn something! We capture the interrupt exception so that
        # training can be prematurely aborted. Notice that now you can use the built-in Keras
        # callbacks!
        weights_filename = 'dqn_{}_weights.h5f'.format('snake')
        checkpoint_weights_filename = 'dqn_' + 'snake' + '_weights_{step}.h5f'
        log_filename = 'dqn_{}_log.json'.format('snake')
        callbacks = [ModelIntervalCheckpoint(checkpoint_weights_filename, interval=interval)]
        callbacks += [ModelIntervalCheckpoint(weights_filename, interval=interval)]
        callbacks += [FileLogger(log_filename, interval=500)]
        callbacks += [WandbLogger(project="snake-rl")]
        dqn.fit(env,
                callbacks=callbacks,
                nb_steps=10000000,
                log_interval=10000,
                visualize=visualize_training,
                nb_max_start_steps=start_steps)

        # After training is done, we could save the final weights one more time
        # and evaluate the algorithm for 10 episodes:
        # dqn.save_weights(weights_filename, overwrite=True)
        # dqn.test(env, nb_episodes=10, visualize=True, nb_max_episode_steps=100)
    else:
        while True:
            try:
                dqn.load_weights(weights_filename)
            except Exception:
                print("weights not found, waiting")
            dqn.test(env,
                     nb_episodes=10,
                     visualize=visualize_training,
                     nb_max_episode_steps=num_max_test)
            time.sleep(3)
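# `make_model` is defined elsewhere in the project; a minimal compatible sketch
# (illustrative only, not the original architecture):
from keras.models import Sequential
from keras.layers import Permute, Conv2D, Flatten, Dense


def make_model(input_shape, nb_actions):
    # input_shape is (WINDOW_LENGTH, height, width); move the window axis last so
    # it serves as the channel dimension for the convolutions.
    model = Sequential()
    model.add(Permute((2, 3, 1), input_shape=input_shape))
    model.add(Conv2D(32, (3, 3), activation='relu'))
    model.add(Conv2D(64, (3, 3), activation='relu'))
    model.add(Flatten())
    model.add(Dense(256, activation='relu'))
    model.add(Dense(nb_actions, activation='linear'))
    return model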
def main(): """Create environment, build models, train.""" #env = MarketEnv(("ES", "FUT", "GLOBEX", "USD"), obs_xform=xform.Basic(30, 4), episode_steps=STEPS_PER_EPISODE, client_id=3) #env = MarketEnv(("EUR", "CASH", "IDEALPRO", "USD"), max_quantity=20000, quantity_increment=20000, obs_xform=xform.Basic(30, 4), episode_steps=STEPS_PER_EPISODE, client_id=5, afterhours=False) env = gym.make('trading-v0').env env.initialise(symbol='000001', start='2012-01-01', end='2017-01-01', days=252) nb_actions = env.action_space.n obs_size = np.product(env.observation_space.shape) # # Actor model # dropout = 0.1 # actor = Sequential([ # Flatten(input_shape=(1,) + env.observation_space.shape), # BatchNormalization(), # Dense(obs_size, activation='relu'), # GaussianDropout(dropout), # BatchNormalization(), # Dense(obs_size, activation='relu'), # GaussianDropout(dropout), # BatchNormalization(), # Dense(obs_size, activation='relu'), # GaussianDropout(dropout), # BatchNormalization(), # Dense(1, activation='tanh'), # ]) # print('Actor model') # actor.summary() # action_input = Input(shape=(1,), name='action_input') # observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input') # flattened_observation = Flatten()(observation_input) # x = concatenate([action_input, flattened_observation]) # x = BatchNormalization()(x) # x = Dense(obs_size + 1, activation='relu')(x) # x = GaussianDropout(dropout)(x) # x = Dense(obs_size + 1, activation='relu')(x) # x = GaussianDropout(dropout)(x) # x = Dense(obs_size + 1, activation='relu')(x) # x = GaussianDropout(dropout)(x) # x = Dense(obs_size + 1, activation='relu')(x) # x = GaussianDropout(dropout)(x) # x = Dense(1, activation='linear')(x) # critic = Model(inputs=[action_input, observation_input], outputs=x) # print('\nCritic Model') # critic.summary() from keras.models import Sequential from keras.layers import Dense, Activation, Flatten from keras.optimizers import Adam model = Sequential() model.add(Flatten(input_shape=(1, ) + env.observation_space.shape)) model.add(Dense(160)) model.add(Activation('relu')) model.add(Dense(160)) model.add(Activation('relu')) model.add(Dense(160)) model.add(Activation('relu')) model.add(Dense(nb_actions)) model.add(Activation('linear')) print(model.summary()) memory = SequentialMemory(limit=EPISODES * STEPS_PER_EPISODE, window_length=1) random_process = OrnsteinUhlenbeckProcess(theta=.5, mu=0., sigma=.5) #agent = DQNAgent(nb_actions=1, actor=actor, critic=critic, critic_action_input=action_input, memory=memory, nb_steps_warmup_critic=STEPS_PER_EPISODE * WARMUP_EPISODES, nb_steps_warmup_actor=STEPS_PER_EPISODE * WARMUP_EPISODES, random_process=random_process, gamma=0.95, target_model_update=0.01) from rl.policy import BoltzmannQPolicy policy = BoltzmannQPolicy() agent = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10, target_model_update=1e-2, policy=policy) agent.compile(Adam(lr=1e-3), metrics=['mae']) #weights_filename = 'ddpg_{}_weights.h5f'.format(env.instrument.symbol) try: #agent.load_weights(weights_filename) #print('Using weights from {}'.format(weights_filename)) # DDPGAgent actually uses two separate files for actor and critic derived from this filename pass except IOError: pass agent.fit(env, nb_steps=EPISODES * STEPS_PER_EPISODE, visualize=False, verbose=2) #agent.save_weights(weights_filename, overwrite=True) agent.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=STEPS_PER_EPISODE)