def dqn_with_fixed_targets(env, n_episodes=None):
    # DQN with e-greedy exploration, experience replay, and fixed-Q targets
    model = build_network(env)
    target_model = build_network(env)

    experience = ExperienceReplay(maxlen=2000, sample_batch_size=32,
                                  min_size_to_sample=100)

    decay_sched = ExponentialSchedule(start=1.0, end=0.01, step=0.99)
    exploration = EpsilonGreedyExploration(decay_sched=decay_sched)

    fixed_target = FixedQTarget(target_model, target_update_step=500,
                                use_soft_targets=True)

    agent = DQNAgent(env, model, gamma=0.99,
                     exploration=exploration,
                     experience=experience,
                     fixed_q_target=fixed_target)

    # Pre-load samples in experience replay.
    # This can also be done implicitly during regular training episodes,
    # but the early training may overfit to early samples.
    experience.bootstrap(env)

    # Perform the training
    return train_dqn(agent, n_episodes, debug=n_episodes is None)
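# The FixedQTarget class used above is not shown in these snippets. The sketch below is a
# hypothetical illustration (not the repo's actual API) of the two update modes its arguments
# suggest: a periodic hard copy every `target_update_step` steps, or a soft (Polyak) update
# when `use_soft_targets=True`. The `tau` parameter and method names are assumptions; it relies
# only on Keras-style get_weights()/set_weights().
class FixedQTargetSketch:
    def __init__(self, target_model, target_update_step=500, use_soft_targets=False, tau=0.01):
        self.target_model = target_model
        self.target_update_step = target_update_step
        self.use_soft_targets = use_soft_targets
        self.tau = tau
        self.step = 0

    def update(self, online_model):
        self.step += 1
        if self.use_soft_targets:
            # Soft update every step: theta_target <- tau * theta_online + (1 - tau) * theta_target
            mixed = [self.tau * w + (1.0 - self.tau) * tw
                     for w, tw in zip(online_model.get_weights(),
                                      self.target_model.get_weights())]
            self.target_model.set_weights(mixed)
        elif self.step % self.target_update_step == 0:
            # Hard update: copy the online weights into the target network periodically
            self.target_model.set_weights(online_model.get_weights())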
def dqn_with_prioritized_experience(env, n_episodes=None):
    # DQN with e-greedy exploration, prioritized experience replay, and fixed-Q targets
    sched_step = 1.0 / n_episodes if n_episodes is not None else 0.001

    model = build_network(env)
    target_model = build_network(env)

    alpha_sched = LinearSchedule(start=0.0, end=1.0, step=sched_step)
    beta_sched = LinearSchedule(start=0.0, end=1.0, step=sched_step)
    experience = PrioritizedExperienceReplay(maxlen=10000, sample_batch_size=64,
                                             min_size_to_sample=1000,
                                             alpha_sched=alpha_sched,
                                             beta_sched=beta_sched)

    decay_sched = ExponentialSchedule(start=1.0, end=0.01, step=0.995)
    exploration = EpsilonGreedyExploration(decay_sched=decay_sched)

    fixed_target = FixedQTarget(target_model, target_update_step=500,
                                use_soft_targets=True, use_double_q=True)

    agent = DQNAgent(env, model, gamma=0.99,
                     exploration=exploration,
                     experience=experience,
                     fixed_q_target=fixed_target)

    # Pre-load samples in experience replay.
    # This can also be done implicitly during regular training episodes,
    # but the early training may overfit to early samples.
    experience.bootstrap(env)

    # Perform the training
    return train_dqn(agent, n_episodes)
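# The alpha/beta schedules above control proportional prioritization and importance-sampling
# correction. The helper below is an illustrative NumPy-only sketch of that sampling rule
# (not the repo's PrioritizedExperienceReplay internals): priorities are |TD error|^alpha and
# the returned weights are (N * P(i))^-beta, normalized by their maximum.
import numpy as np

def sample_prioritized(td_errors, batch_size, alpha, beta, eps=1e-6):
    priorities = (np.abs(td_errors) + eps) ** alpha
    probs = priorities / priorities.sum()
    idx = np.random.choice(len(td_errors), size=batch_size, p=probs)
    weights = (len(td_errors) * probs[idx]) ** (-beta)
    weights /= weights.max()
    return idx, weights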
def run(algo):
    env = gym.make('Frostbite-ram-v0')
    if algo == 'sarsa':
        agent = SarsaAgent(env.observation_space, env.action_space,
                           epsilon=0.1, alpha=0.01, gamma=0.1)
        SarsaExperiment().run(agent, env, 100000)
    elif algo == 'reinforce':
        agent = REINFORCEAgent(observation_space=env.observation_space,
                               actions_space=env.action_space,
                               learning_rate=0.001, gamma=0.99,
                               hidden1=128, hidden2=18, hidden3=18)
        ReinforceExperiment(env, agent, stop_criterion=10000, EPISODES=100000).run()
    elif algo == 'dqn':
        agent = DQNAgent(gym.spaces.Discrete(10), obs_size=env.observation_space,
                         epsilon=1, epoch_length=50, nhidden=202,
                         learning_rate=0.0002, gamma=0.945, tau=0.75)
        DQNExperiment().run_qlearning(env, agent, 100000, True)
    elif algo == 'watch dqn':
        agent = DQNWatchAgent(env.action_space, obs_size=env.observation_space,
                              epsilon=0.01, epoch_length=100, nhidden=256,
                              learning_rate=0.0001, gamma=0.9, tau=0.1)
        DQNWatchExperiment().run_qlearning(env, agent, 100000, True)
    elif algo == 'actorCritic':
        agent = ActorCriticAgent(
            observation_space=env.observation_space,
            actions_space=env.action_space,
            alpha=0.0001,
            beta=0.0001,
            gamma=0.9,
            hidden1=18,
            hidden2=150,
        )
        # agent = ActorCriticAgent(
        #     observation_space=env.observation_space,
        #     actions_space=env.action_space,
        #     alpha=0.0001,
        #     beta=0.0001,
        #     gamma=0.9995,
        #     hidden1=128,
        #     hidden2=72,
        # )
        ActorCriticExperiment(env, agent, EPISODES=100000).run_actorcritic()
def basic_dqn(env, n_episodes):
    # Basic DQN with e-greedy exploration
    model = build_network(env)

    decay_sched = ExponentialSchedule(start=1.0, end=0.01, step=0.99)
    exploration = EpsilonGreedyExploration(decay_sched=decay_sched)

    agent = DQNAgent(env, model, gamma=0.99, exploration=exploration)

    # Perform the training
    return train_dqn(agent, n_episodes, debug=True)
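# EpsilonGreedyExploration paired with an ExponentialSchedule (as configured above) amounts
# to the behaviour sketched below; these two helpers are a hypothetical illustration, not the
# repo's classes.
import numpy as np

def epsilon_greedy_action(q_values, epsilon):
    # With probability epsilon pick a random action, otherwise the greedy one
    if np.random.rand() < epsilon:
        return np.random.randint(len(q_values))
    return int(np.argmax(q_values))

def decay_epsilon(epsilon, end=0.01, step=0.99):
    # Exponential schedule: multiply by `step` each episode, floored at `end`
    return max(end, epsilon * step)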
def __init__(self):
    game, model, render, episode_limit, batch_size, target_score, test_model = self._args()
    self.env = gym.make(game)
    self.render = render
    self.episode_limit = episode_limit
    self.batch_size = batch_size
    self.target_score = target_score
    self.observation_space = self.env.observation_space.shape[0]
    self.action_space = self.env.action_space.n
    self.agent = DQNAgent(game, self.observation_space, self.action_space)
    self.save_name = str(game) + '_' + str(model.lower()) + '/' + str(game) + '_' + str(model.lower())

    if model.lower() == 'dqn':
        self.agent = DQNAgent(game, self.observation_space, self.action_space)
    elif model.lower() == 'ddqn':
        self.agent = DDQNAgent(game, self.observation_space, self.action_space)
    elif model.lower() == 'duelingddqn':
        self.agent = DuelingDDQNAgent(game, self.observation_space, self.action_space)
    elif model.lower() == 'perddqn':
        self.agent = PERDDQNAgent(game, self.observation_space, self.action_space)
    elif model.lower() == 'test':
        self.agent = TestAgent(game, self.observation_space, self.action_space)

    self.history = [('episode', 'score', 'average_score', 'steps', 'total_steps')]

    if test_model:
        self.agent.load_model(test_model)
        self.test()
    else:
        # make a directory to hold the saved files from this run if it doesn't exist
        try:
            os.mkdir(str(game) + '_' + str(model.lower()))
        except FileExistsError:
            pass
        self.train()
def dqn_with_experience(env, n_episodes):
    # DQN with e-greedy exploration and experience replay
    model = build_network(env)

    experience = ExperienceReplay(maxlen=10000, sample_batch_size=64,
                                  min_size_to_sample=1000)

    decay_sched = ExponentialSchedule(start=1.0, end=0.01, step=0.995)
    exploration = EpsilonGreedyExploration(decay_sched=decay_sched)

    agent = DQNAgent(env, model, gamma=0.99,
                     exploration=exploration, experience=experience)

    # Pre-load samples in experience replay.
    # This can also be done implicitly during regular training episodes,
    # but the early training may overfit to early samples.
    experience.bootstrap(env)

    # Perform the training
    return train_dqn(agent, n_episodes, debug=True)
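# experience.bootstrap(env) pre-fills the buffer before learning starts. A minimal sketch of
# that pre-loading, assuming a Gym-style env and a buffer exposing add() and len()
# (illustrative names, not the repo's API):
def bootstrap_replay(env, buffer, min_size_to_sample):
    state = env.reset()
    while len(buffer) < min_size_to_sample:
        # Act randomly just to gather transitions; no learning happens here
        action = env.action_space.sample()
        next_state, reward, done, _ = env.step(action)
        buffer.add(state, action, reward, next_state, done)
        state = env.reset() if done else next_state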
""" dqn.py Test script for Square Stacker Deep Q Network Agent """ from agents.dqn import DQNAgent from tests.agent import test_agent # Test Settings num_fits = 20 games_per_fit = 5000 discount = 0.0 epsilon = 0.1 test_num_games = 1000 # Test Agent agent = DQNAgent() agent.train(num_fits, games_per_fit, discount, epsilon) test_agent(agent, num_games=test_num_games)
gamma = 0.9        # Discount factor
epsilon = 0.1      # Probability of choosing a random action
imb_ratio = 0.1    # Imbalance ratio
min_class = [2]    # Minority classes
maj_class = [0, 1, 3, 4, 5, 6, 7, 8, 9]  # Majority classes

X_train, y_train, X_test, y_test = load_image("mnist")
X_train, y_train, X_val, y_val, X_test, y_test = create_data(
    X_train, y_train, X_test, y_test, min_class, maj_class, imb_ratio=imb_ratio)

print('Distribution after imbalancing (training): {}'.format(Counter(y_train)))
print('Distribution after imbalancing (validation): {}'.format(Counter(y_val)))

collect_steps_per_episode = 50

conv_layers = ((32, (5, 5), 2), (32, (5, 5), 2), )  # Convolutional layers
dense_layers = (256, 256, )                         # Dense layers
dropout_layers = (0.2, 0.2, )                       # Dropout layers
# Build a dict containing the underlying Q-network layers
layers = {"conv": conv_layers, "dense": dense_layers, "dropout": dropout_layers}

model = DQNAgent()
model.compile(X_train, y_train, learning_rate, epsilon, gamma, imb_ratio,
              replay_buffer_max_length, layers)
model.fit(X_train, y_train, epochs=episodes, batch_size=batch_size,
          eval_step=eval_step, log_step=log_step,
          collect_steps_per_episode=collect_steps_per_episode)
model.evaluate(X_test, y_test, X_train, y_train)
import gym
import gym_ple
import torch.optim as optim
import torch.nn as nn

from agents.dqn import DQNAgent
from models.cnn import CNNModel, DuelingCNNModel
from environment import Environment

lr = 0.00001
momentum = 0.95
num_episodes = 1000000000
batch_size = 32

env = Environment('FlappyBird-v0')
model = DuelingCNNModel(env.action_space())
optimizer = optim.RMSprop(params=model.parameters(), lr=lr, momentum=momentum)
loss = nn.SmoothL1Loss()
agent = DQNAgent(environment=env, model=model, optimizer=optimizer, loss=loss)

agent.train(num_episodes=num_episodes, batch_size=batch_size, verbose=True)
def run(self, num_runs, training, num_human, actual_num_human, num_cav, model,
        debug, num_merge_0=None, num_merge_1=None):
    model_name = model + '_hv_' + str(num_human) + '_cav_' + str(num_cav)

    if debug:
        nb_steps_warmup = 30
        batch_size = 10
        total_steps = 200
        log_interval = 40
        nb_max_episode_steps = 20
        gamma = 0.99
    else:
        nb_steps_warmup = 200000
        batch_size = 32
        total_steps = 800000
        log_interval = 4000
        nb_max_episode_steps = 2500
        gamma = 0.99

    # Input feature size
    F = 2 + self.env.net_params.additional_params['highway_lanes'] + self.env.n_unique_intentions
    N = num_human + num_cav
    A = 3

    from gym.spaces.box import Box
    from gym.spaces import Discrete
    from gym.spaces.dict import Dict

    states = Box(low=-np.inf, high=np.inf, shape=(N, F), dtype=np.float32)
    adjacency = Box(low=0, high=1, shape=(N, N), dtype=np.int32)
    mask = Box(low=0, high=1, shape=(N, ), dtype=np.int32)
    obs_space = Dict({'states': states, 'adjacency': adjacency, 'mask': mask})
    act_space = Box(low=0, high=1, shape=(N, ), dtype=np.int32)

    from graph_model import GraphicQNetworkKeras, LstmQNetworkKeras, GraphicQNetworkKeras2
    from agents.memory import CustomerSequentialMemory
    from agents.processor import Jiqian_MultiInputProcessor
    from agents.dqn import DQNAgent
    from agents.policy import eps_greedy_q_policy, greedy_q_policy, random_obs_policy
    from spektral.layers import GraphConv
    from tensorflow.keras.optimizers import Adam
    import tensorflow as tf

    memory_buffer = CustomerSequentialMemory(limit=100000, window_length=1)
    multi_input_processor = Jiqian_MultiInputProcessor(A)

    if model == 'gcn':
        rl_model = GraphicQNetworkKeras2(N, F, obs_space, act_space)
    elif model == 'lstm':
        rl_model = LstmQNetworkKeras(N, F, obs_space, act_space)
    else:
        raise NotImplementedError

    my_dqn = DQNAgent(processor=multi_input_processor,
                      model=rl_model.base_model,
                      policy=eps_greedy_q_policy(0.3),
                      test_policy=greedy_q_policy(),
                      start_policy=random_obs_policy(),
                      nb_total_agents=N,
                      nb_actions=A,
                      memory=memory_buffer,
                      nb_steps_warmup=nb_steps_warmup,
                      batch_size=batch_size,
                      gamma=gamma,
                      custom_model_objects={'GraphConv': GraphConv})
    my_dqn.compile(Adam(0.001))

    if training:
        logdir = "./logs/"
        history_file = "./logs/" + model_name + '_training_hist.txt'
        try:
            # os.rmdir(logdir)
            os.remove(history_file)
        except OSError:
            pass

        from agents.rl_lib.callbacks import FileLogger
        # from tensorflow.python.keras.callbacks import TensorBoard
        # tensorboard_callback = TensorBoard(log_dir=logdir, histogram_freq=1,
        #                                    write_graph=True, update_freq='batch')
        file_log = FileLogger(history_file)

        history = my_dqn.fit(self.env,
                             nb_steps=total_steps,
                             nb_max_episode_steps=nb_max_episode_steps,
                             visualize=False,
                             verbose=1,
                             log_interval=log_interval,
                             callbacks=[file_log])
        my_dqn.save_weights('./models/dqn_{}.h5f'.format(model_name), overwrite=True)

        from generate_training_plots import plot_training
        plot_training(logdir)
    else:
        if num_merge_0 is not None:
            history_file = "./logs/test/vary_ramp_popularity/{}_cav0_{}_cav1_{}_hv_{}_testing_hist.txt".format(
                model, num_merge_0, num_merge_1, actual_num_human)
        else:
            history_file = "./logs/test/{}_cav_{}_hv_{}_testing_hist.txt".format(
                model, num_cav, actual_num_human)

        my_dqn.load_weights('./models/dqn_{}.h5f'.format(model_name))
        print("successfully loaded")

        hist = my_dqn.test(self.env, nb_episodes=num_runs)
        # print(hist.history)
        with open(history_file, 'w') as f:
            json.dump(hist.history, f)
from agents.dqn.DQNAgent import *

env = gym.make("CartPole-v0")
print("Observation space: {}".format(env.observation_space))
print("Action space: {}".format(env.action_space))
nb_actions = env.action_space.n
observation_shape = env.observation_space.shape

train_policy = DecayEpsGreedyQPolicy(eps_min=0, eps_decay=0.99)
agent = DQNAgent(
    action_space=env.action_space,
    observation_space=env.observation_space,
    train_policy=train_policy,
    dueling_type='max'
)

print("Start training~")
for episode in range(200):
    episode_rewards = 0
    observation = env.reset()
    for step in range(200):
        action = agent.forward(observation)
        next_observation, reward, terminal, _ = env.step(action)
        agent.backward(observation, action, reward, terminal, next_observation)
        episode_rewards += reward
        # Advance to the next state and stop the episode on termination
        observation = next_observation
        if terminal:
            break
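# dueling_type='max' above refers to how a dueling architecture combines the state-value and
# advantage streams into Q-values. Illustrative NumPy computation only, not the agent's code:
import numpy as np

def dueling_q(value, advantages, dueling_type='max'):
    advantages = np.asarray(advantages, dtype=float)
    if dueling_type == 'max':
        # Q(s, a) = V(s) + (A(s, a) - max_a' A(s, a'))
        return value + (advantages - advantages.max())
    # 'avg' variant: subtract the mean advantage instead
    return value + (advantages - advantages.mean())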
import os
import errno
import logging
from datetime import datetime

import gym
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

from agents.dqn import DQNAgent, DQNBuffer

logging.basicConfig(format='%(asctime)s %(message)s', level=logging.DEBUG)

# instantiate env
env = gym.make('LunarLander-v2')
dim_obs = env.observation_space.shape
num_act = env.action_space.n

# instantiate agent and replay buffer
agent = DQNAgent()
replay_buffer = DQNBuffer(dim_obs=dim_obs[0], size=int(1e6))

save_dir = './saved_models/' + env.spec.id + '/dqn/' + datetime.now().strftime("%Y-%m-%d-%H-%M") + '/'
if not os.path.exists(save_dir):
    try:
        os.makedirs(save_dir)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
value_net_path = os.path.join(save_dir, 'value_net')

RANDOM_SEED = 0
tf.random.set_seed(RANDOM_SEED)
env.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
env.action_space.seed(RANDOM_SEED)
env_params = {
    'charge': 20,
    'charge_reward': -0.1,
    'crash_reward': -1,
    'delivery_reward': 1,
    'discharge': 10,
    'drone_density': 0.05,
    'dropzones_factor': 2,
    'packets_factor': 3,
    'pickup_reward': 0,
    'skyscrapers_factor': 3,
    'stations_factor': 2,
    'n_drones': NB_AGENTS,
    'rgb_render_rescale': 2.0,
}

env = WindowedGridView(DeliveryDrones(env_params), radius=3)

agent = DQNAgent(env,
                 DenseQNetworkFactory(env, hidden_layers=[256, 256]),
                 gamma=0.95,
                 epsilon_start=1.0,
                 epsilon_decay=0.999,
                 epsilon_end=0.01,
                 memory_size=10000,
                 batch_size=64,
                 target_update_interval=5)
agent.reset()
agent.save("baseline_models/random-agent-{}.pt".format(_idx))
num_episodes = 5000  # 5000
stats = EpisodeStats(episode_lengths=np.zeros(num_episodes),
                     episode_rewards=np.zeros(num_episodes),
                     episode_safety=np.zeros(num_episodes),
                     episode_confidence=np.zeros(num_episodes))

tf.compat.v1.reset_default_graph()

with tf.Session() as sess:
    agent = DQNAgent(sess,
                     world_shape,
                     int(actions_num),
                     env,
                     frames_state=frames_state,
                     experiment_dir=modelLocation,
                     replay_memory_size=20000,      # 10000
                     replay_memory_init_size=3000,  # 3000
                     update_target_estimator_every=1000,  # 500
                     discount_factor=0.99,
                     epsilon_start=epsilon_start,
                     epsilon_end=epsilon_end,
                     epsilon_decay_steps=250000,
                     batch_size=batch_size,
                     worldSize=worldSize)

    for i_episode in range(num_episodes):
        # Save the current checkpoint
        if doTraining:
            agent.save()
        else:
            break

        ret = 0
        time_step = env.reset(np.random.choice([1, 2, 3, 4, 5]))
        # for the description of timestep see ai_safety_gridworlds.environments.shared.rl.environment
import gym
import gym_ple
import torch

from agents.dqn import DQNAgent
from models.cnn import DuelingCNNModel
from environment import Environment

env = Environment('FlappyBird-v0')
model = DuelingCNNModel(env.action_space())
agent = DQNAgent(environment=env, model=model)
agent.play()
env = gym.make('CartPole-v0')  # Load the specified game
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n
actions = np.arange(nb_actions)
policy = EpsGreedyQPolicy(eps=1.0, eps_decay_rate=0.99, min_eps=0.01)
memory = Memory(limit=50000, maxlen=1)

# Initial observation
obs = env.reset()

# Initialize the agent
agent = DQNAgent(actions=actions, memory=memory, update_interval=200,
                 train_interval=1, batch_size=32, memory_interval=1,
                 observation=obs, input_shape=[len(obs)],
                 id=1, name=None, training=True, policy=policy)
agent.compile()

result = []
nb_episodes = 500  # Number of episodes

for episode in range(nb_episodes):
    agent.reset()
    observation = env.reset()  # Reset the environment
    observation = deepcopy(observation)
    agent.observe(observation)
    done = False
import gym
import torch

from agents.dqn import DQNAgent
from common.wrappers import make_atari, wrap_deepmind, wrap_pytorch
from common.utils import mini_batch_train_frames

env_id = "PongNoFrameskip-v4"
env = make_atari(env_id)
env = wrap_deepmind(env)
env = wrap_pytorch(env)

MAX_FRAMES = 1000000
BATCH_SIZE = 32

agent = DQNAgent(env, use_conv=True)
if torch.cuda.is_available():
    agent.model.cuda()

episode_rewards = mini_batch_train_frames(env, agent, MAX_FRAMES, BATCH_SIZE)
# -*- coding: utf-8 -*-
import argparse

import numpy as np

from turtlebot_env.basic_env import environ
from agents.dqn import DQNAgent


def arguments():
    # No command-line options are defined yet; parse_args() still returns a namespace
    parser = argparse.ArgumentParser()
    return parser.parse_args()


if __name__ == '__main__':
    env = environ()
    args = arguments()
    agent = DQNAgent(env, args)
    agent.train()
environmentVectorSize = 10 + (teamSize - 1) * 11 + 3 * opponentsSize
print(environmentVectorSize)

# Next, we build a very simple model.

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=50000, window_length=1)
policy = BoltzmannQPolicy()

import Queue as Q

agents = []
q = Q.Queue()
u2 = team[:]
for i in team:
    d = DQNAgent(i, u2, opponents, actions, actionsEnum,
                 inputV=environmentVectorSize,
                 nb_actions=nb_actions, memory=memory,
                 nb_steps_warmup=10, target_model_update=1e-2,
                 policy=policy)
    d.compile(Adam(lr=1e-3), metrics=['mae'])
    agents.append(d)

agentContainers = []
for i in agents:
    agentContainers.append(AgentContainer(i, i.id, teamSize, opponentsSize, q, [], []))

startPool(agentContainers)

# Join Environment thread
env.end()