Example #1
def dqn_with_fixed_targets(env, n_episodes=None):
    # DQN with e-greedy exploration, experience replay, and fixed-Q targets
    model = build_network(env)
    target_model = build_network(env)
    experience = ExperienceReplay(maxlen=2000,
                                  sample_batch_size=32,
                                  min_size_to_sample=100)
    decay_sched = ExponentialSchedule(start=1.0, end=0.01, step=0.99)
    exploration = EpsilonGreedyExploration(decay_sched=decay_sched)
    fixed_target = FixedQTarget(target_model,
                                target_update_step=500,
                                use_soft_targets=True)
    agent = DQNAgent(env,
                     model,
                     gamma=0.99,
                     exploration=exploration,
                     experience=experience,
                     fixed_q_target=fixed_target)

    # Pre-load samples in experience replay.
    # This can also be done implicitly during regular training episodes,
    # but the early training may overfit to early samples.
    experience.bootstrap(env)

    # Perform the training
    return train_dqn(agent, n_episodes, debug=n_episodes is None)
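The FixedQTarget and build_network helpers are not shown in these examples. With use_soft_targets=True, a fixed-Q-target implementation typically blends the online weights into the target network (Polyak averaging) instead of copying them wholesale every target_update_step steps. A minimal sketch of that update, assuming Keras-style models with get_weights/set_weights and an illustrative tau:

def soft_target_update(model, target_model, tau=0.01):
    # Polyak averaging: target <- tau * online + (1 - tau) * target.
    # A hard update would instead call target_model.set_weights(model.get_weights())
    # once every target_update_step training steps.
    blended = [tau * w + (1.0 - tau) * tw
               for w, tw in zip(model.get_weights(), target_model.get_weights())]
    target_model.set_weights(blended)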
Example #2
def dqn_with_prioritized_experience(env, n_episodes=None):
    # DQN with e-greedy exploration, prioritized experience replay, and fixed-Q targets
    sched_step = 1.0 / n_episodes if n_episodes is not None else 0.001

    model = build_network(env)
    target_model = build_network(env)
    alpha_sched = LinearSchedule(start=0.0, end=1.0, step=sched_step)
    beta_sched = LinearSchedule(start=0.0, end=1.0, step=sched_step)
    experience = PrioritizedExperienceReplay(maxlen=10000,
                                             sample_batch_size=64,
                                             min_size_to_sample=1000,
                                             alpha_sched=alpha_sched,
                                             beta_sched=beta_sched)
    decay_sched = ExponentialSchedule(start=1.0, end=0.01, step=0.995)
    exploration = EpsilonGreedyExploration(decay_sched=decay_sched)
    fixed_target = FixedQTarget(target_model,
                                target_update_step=500,
                                use_soft_targets=True,
                                use_double_q=True)
    agent = DQNAgent(env,
                     model,
                     gamma=0.99,
                     exploration=exploration,
                     experience=experience,
                     fixed_q_target=fixed_target)

    # Pre-load samples in experience replay.
    # This can also be done implicitly during regular training episodes,
    # but the early training may overfit to early samples.
    experience.bootstrap(env)

    # Perform the training
    return train_dqn(agent, n_episodes)
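PrioritizedExperienceReplay is likewise a project helper. In proportional prioritized replay, alpha controls how strongly TD errors bias sampling and beta scales the importance-sampling correction, which is why both are annealed toward 1.0 over training. A rough NumPy sketch of one sampling step under those assumptions (not the project's actual implementation):

import numpy as np

def sample_prioritized(priorities, batch_size, alpha, beta):
    # priorities: |TD error| + small constant for each stored transition.
    probs = priorities ** alpha
    probs = probs / probs.sum()
    indices = np.random.choice(len(priorities), size=batch_size, p=probs)
    # Importance-sampling weights correct the bias from non-uniform sampling;
    # as beta -> 1.0 the correction becomes exact.
    weights = (len(priorities) * probs[indices]) ** (-beta)
    weights = weights / weights.max()
    return indices, weights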
Example #3
def run(algo):
    env = gym.make('Frostbite-ram-v0')

    if algo == 'sarsa':
        agent = SarsaAgent(env.observation_space,
                           env.action_space,
                           epsilon=0.1,
                           alpha=0.01,
                           gamma=0.1)
        SarsaExperiment().run(agent, env, 100000)
    elif algo == 'reinforce':
        agent = REINFORCEAgent(observation_space=env.observation_space,
                               actions_space=env.action_space,
                               learning_rate=0.001,
                               gamma=0.99,
                               hidden1=128,
                               hidden2=18,
                               hidden3=18)
        ReinforceExperiment(env, agent, stop_criterion=10000,
                            EPISODES=100000).run()
    elif algo == 'dqn':
        agent = DQNAgent(gym.spaces.Discrete(10),
                         obs_size=env.observation_space,
                         epsilon=1,
                         epoch_length=50,
                         nhidden=202,
                         learning_rate=0.0002,
                         gamma=0.945,
                         tau=0.75)
        DQNExperiment().run_qlearning(env, agent, 100000, True)
    elif algo == 'watch dqn':
        agent = DQNWatchAgent(env.action_space,
                              obs_size=env.observation_space,
                              epsilon=0.01,
                              epoch_length=100,
                              nhidden=256,
                              learning_rate=0.0001,
                              gamma=0.9,
                              tau=0.1)
        DQNWatchExperiment().run_qlearning(env, agent, 100000, True)
    elif algo == 'actorCritic':
        agent = ActorCriticAgent(
            observation_space=env.observation_space,
            actions_space=env.action_space,
            alpha=0.0001,
            beta=0.0001,
            gamma=0.9,
            hidden1=18,
            hidden2=150,
        )
        # agent = ActorCriticAgent(
        #     observation_space=env.observation_space,
        #     actions_space=env.action_space,
        #     alpha = 0.0001,
        #     beta = 0.0001,
        #     gamma = 0.9995,
        #     hidden1 = 128,
        #     hidden2 = 72,
        # )
        ActorCriticExperiment(env, agent, EPISODES=100000).run_actorcritic()
Example #4
def basic_dqn(env, n_episodes):
    # Basic DQN with e-greedy exploration
    model = build_network(env)
    decay_sched = ExponentialSchedule(start=1.0, end=0.01, step=0.99)
    exploration = EpsilonGreedyExploration(decay_sched=decay_sched)
    agent = DQNAgent(env, model, gamma=0.99, exploration=exploration)

    # Perform the training
    return train_dqn(agent, n_episodes, debug=True)
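ExponentialSchedule and EpsilonGreedyExploration come from the surrounding project; the schedule above multiplies epsilon by 0.99 each step and clamps it at 0.01. A minimal sketch of that behaviour with illustrative names (not the project's real API):

import random

class DecaySchedule:
    def __init__(self, start=1.0, end=0.01, step=0.99):
        self.value, self.end, self.step = start, end, step

    def next(self):
        # Multiply by the decay factor but never drop below the floor.
        self.value = max(self.end, self.value * self.step)
        return self.value

def epsilon_greedy(q_values, epsilon):
    # Explore with probability epsilon, otherwise act greedily on the Q-values.
    if random.random() < epsilon:
        return random.randrange(len(q_values))
    return int(max(range(len(q_values)), key=lambda a: q_values[a]))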
Example #5
def __init__(self):
    game, model, render, episode_limit, batch_size, target_score, test_model = self._args()
    self.env = gym.make(game)
    self.render = render
    self.episode_limit = episode_limit
    self.batch_size = batch_size
    self.target_score = target_score
    self.observation_space = self.env.observation_space.shape[0]
    self.action_space = self.env.action_space.n
    self.agent = DQNAgent(game, self.observation_space, self.action_space)
    self.save_name = str(game) + '_' + str(model.lower()) + '/' + str(game) + '_' + str(model.lower())
    if model.lower() == 'dqn':
        self.agent = DQNAgent(game, self.observation_space, self.action_space)
    elif model.lower() == 'ddqn':
        self.agent = DDQNAgent(game, self.observation_space, self.action_space)
    elif model.lower() == 'duelingddqn':
        self.agent = DuelingDDQNAgent(game, self.observation_space, self.action_space)
    elif model.lower() == 'perddqn':
        self.agent = PERDDQNAgent(game, self.observation_space, self.action_space)
    elif model.lower() == 'test':
        self.agent = TestAgent(game, self.observation_space, self.action_space)
    self.history = [('episode', 'score', 'average_score', 'steps', 'total_steps')]
    if test_model:
        self.agent.load_model(test_model)
        self.test()
    else:
        # Make a directory to hold the saved files from this run if it doesn't exist.
        try:
            os.mkdir(str(game) + '_' + str(model.lower()))
        except FileExistsError:
            pass
        self.train()
Example #6
def dqn_with_experience(env, n_episodes):
    # DQN with e-greedy exploration and experience replay
    model = build_network(env)
    experience = ExperienceReplay(maxlen=10000,
                                  sample_batch_size=64,
                                  min_size_to_sample=1000)
    decay_sched = ExponentialSchedule(start=1.0, end=0.01, step=0.995)
    exploration = EpsilonGreedyExploration(decay_sched=decay_sched)
    agent = DQNAgent(env,
                     model,
                     gamma=0.99,
                     exploration=exploration,
                     experience=experience)

    # Pre-load samples in experience replay.
    # This can also be done implicitly during regular training episodes,
    # but the early training may overfit to early samples.
    experience.bootstrap(env)

    # Perform the training
    return train_dqn(agent, n_episodes, debug=True)
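experience.bootstrap(env) is not defined in the snippet; judging from the comment, it pre-fills the buffer with random-policy transitions so the first gradient updates are not fit to a handful of early samples. A plausible sketch, assuming the buffer exposes add(), __len__() and min_size_to_sample:

def bootstrap_replay(experience, env):
    # Collect random-policy transitions until the buffer is large enough to sample.
    state = env.reset()
    while len(experience) < experience.min_size_to_sample:
        action = env.action_space.sample()
        next_state, reward, done, _ = env.step(action)
        experience.add((state, action, reward, next_state, done))
        state = env.reset() if done else next_state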
Example #7
"""
dqn.py
Test script for Square Stacker Deep Q Network Agent
"""

from agents.dqn import DQNAgent
from tests.agent import test_agent

# Test Settings
num_fits = 20
games_per_fit = 5000
discount = 0.0
epsilon = 0.1
test_num_games = 1000

# Test Agent
agent = DQNAgent()
agent.train(num_fits, games_per_fit, discount, epsilon)
test_agent(agent, num_games=test_num_games)
Example #8
gamma = 0.9  # Discount factor
epsilon = 0.1  # Probability of choosing a random action
imb_ratio = 0.1  # Imbalance ratio

min_class = [2]  # Minority classes
maj_class = [0, 1, 3, 4, 5, 6, 7, 8, 9]  # Majority classes

X_train, y_train, X_test, y_test = load_image("mnist")

X_train, y_train, X_val, y_val, X_test, y_test = create_data(X_train, y_train, X_test, y_test, min_class, maj_class, imb_ratio=imb_ratio)

print('Distribution after imbalancing (training): {}'.format(Counter(y_train)))
print('Distribution after imbalancing (validation): {}'.format(Counter(y_val)))

collect_steps_per_episode = 50

conv_layers  = ((32, (5, 5), 2), (32, (5, 5), 2), )  # Convolutional layers
dense_layers = (256, 256,)  # Dense layers
dropout_layers = (0.2, 0.2,)  # Dropout layers
layers = {"conv": conv_layers, "dense": dense_layers, "dropout": dropout_layers}  # build a dict containing the underlying Q-Network Layers

# Remaining hyperparameters; the original script defines these elsewhere, so the values below are only illustrative.
learning_rate = 0.00025
replay_buffer_max_length = 50000
episodes = 120
batch_size = 32
eval_step = 10
log_step = 10

model = DQNAgent()

model.compile(X_train, y_train, learning_rate, epsilon, gamma, imb_ratio, replay_buffer_max_length, layers)

model.fit(X_train, y_train, epochs=episodes, batch_size=batch_size, eval_step=eval_step, log_step=log_step,
          collect_steps_per_episode=collect_steps_per_episode)

model.evaluate(X_test, y_test, X_train, y_train)
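load_image and create_data belong to the surrounding project; imb_ratio controls what fraction of the minority class (digit 2 above) is kept. A rough NumPy sketch of that subsampling step, under the assumption that create_data works roughly like this:

import numpy as np

def imbalance(X, y, min_class, maj_class, imb_ratio=0.1):
    # Keep every majority-class sample and only a fraction of the minority class.
    maj_idx = np.flatnonzero(np.isin(y, maj_class))
    min_idx = np.flatnonzero(np.isin(y, min_class))
    kept_min = np.random.choice(min_idx, size=int(len(min_idx) * imb_ratio), replace=False)
    idx = np.concatenate([maj_idx, kept_min])
    np.random.shuffle(idx)
    return X[idx], y[idx]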

Example #9
import gym
import gym_ple

import torch.optim as optim
import torch.nn as nn

from agents.dqn import DQNAgent
from models.cnn import CNNModel, DuelingCNNModel
from environment import Environment

lr = 0.00001
momentum = 0.95
num_episodes = 1000000000
batch_size = 32

env = Environment('FlappyBird-v0')
model = DuelingCNNModel(env.action_space())
optimizer = optim.RMSprop(params=model.parameters(), lr=lr, momentum=momentum)
loss = nn.SmoothL1Loss()
agent = DQNAgent(environment=env, model=model, optimizer=optimizer, loss=loss)

agent.train(num_episodes=num_episodes, batch_size=batch_size, verbose=True)
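DuelingCNNModel is not shown; a dueling architecture splits the network into a state-value stream and an advantage stream and recombines them as Q(s, a) = V(s) + A(s, a) - mean_a A(s, a). A minimal PyTorch sketch of that output head (layer sizes illustrative):

import torch.nn as nn

class DuelingHead(nn.Module):
    def __init__(self, in_features, n_actions):
        super().__init__()
        self.value = nn.Linear(in_features, 1)              # V(s)
        self.advantage = nn.Linear(in_features, n_actions)  # A(s, a)

    def forward(self, features):
        v = self.value(features)
        a = self.advantage(features)
        # Subtracting the mean advantage keeps the value/advantage split identifiable.
        return v + a - a.mean(dim=1, keepdim=True)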
Example #10
    def run(self,
            num_runs,
            training,
            num_human,
            actual_num_human,
            num_cav,
            model,
            debug,
            num_merge_0=None,
            num_merge_1=None):
        model_name = model + '_hv_' + str(num_human) + '_cav_' + str(num_cav)

        if debug:
            nb_steps_warmup = 30
            batch_size = 10
            total_steps = 200
            log_interval = 40
            nb_max_episode_steps = 20
            gamma = 0.99
        else:
            nb_steps_warmup = 200000
            batch_size = 32
            total_steps = 800000
            log_interval = 4000
            nb_max_episode_steps = 2500
            gamma = 0.99

        F = 2 + self.env.net_params.additional_params[
            'highway_lanes'] + self.env.n_unique_intentions  # input feature size
        N = num_human + num_cav
        A = 3

        from gym.spaces.box import Box
        from gym.spaces import Discrete
        from gym.spaces.dict import Dict
        states = Box(low=-np.inf, high=np.inf, shape=(N, F), dtype=np.float32)
        adjacency = Box(low=0, high=1, shape=(N, N), dtype=np.int32)
        mask = Box(low=0, high=1, shape=(N, ), dtype=np.int32)

        obs_space = Dict({
            'states': states,
            'adjacency': adjacency,
            'mask': mask
        })
        act_space = Box(low=0, high=1, shape=(N, ), dtype=np.int32)

        from graph_model import GraphicQNetworkKeras, LstmQNetworkKeras, GraphicQNetworkKeras2
        from agents.memory import CustomerSequentialMemory
        from agents.processor import Jiqian_MultiInputProcessor
        from agents.dqn import DQNAgent
        from agents.policy import eps_greedy_q_policy, greedy_q_policy, random_obs_policy
        from spektral.layers import GraphConv
        from tensorflow.keras.optimizers import Adam
        import tensorflow as tf

        memory_buffer = CustomerSequentialMemory(limit=100000, window_length=1)
        multi_input_processor = Jiqian_MultiInputProcessor(A)

        if model == 'gcn':
            rl_model = GraphicQNetworkKeras2(N, F, obs_space, act_space)
        elif model == 'lstm':
            rl_model = LstmQNetworkKeras(N, F, obs_space, act_space)
        else:
            raise NotImplementedError

        my_dqn = DQNAgent(processor=multi_input_processor,
                          model=rl_model.base_model,
                          policy=eps_greedy_q_policy(0.3),
                          test_policy=greedy_q_policy(),
                          start_policy=random_obs_policy(),
                          nb_total_agents=N,
                          nb_actions=A,
                          memory=memory_buffer,
                          nb_steps_warmup=nb_steps_warmup,
                          batch_size=batch_size,
                          gamma=gamma,
                          custom_model_objects={'GraphConv': GraphConv})

        my_dqn.compile(Adam(0.001))

        if training:

            logdir = "./logs/"
            history_file = "./logs/" + model_name + '_training_hist.txt'
            try:
                # os.rmdir(logdir)
                os.remove(history_file)

            except OSError:
                pass

            from agents.rl_lib.callbacks import FileLogger

            # from tensorflow.python.keras.callbacks import TensorBoard
            # tensorboard_callback = TensorBoard(log_dir=logdir,histogram_freq=1,write_graph=True,update_freq='batch')

            file_log = FileLogger(history_file)
            history = my_dqn.fit(self.env,
                                 nb_steps=total_steps,
                                 nb_max_episode_steps=nb_max_episode_steps,
                                 visualize=False,
                                 verbose=1,
                                 log_interval=log_interval,
                                 callbacks=[file_log])
            my_dqn.save_weights('./models/dqn_{}.h5f'.format(model_name),
                                overwrite=True)

            from generate_training_plots import plot_training

            plot_training(logdir)

        else:
            if num_merge_0 is not None:
                history_file = "./logs/test/vary_ramp_popularity/{}_cav0_{}_cav1_{}_hv_{}_testing_hist.txt".format(
                    model, num_merge_0, num_merge_1, actual_num_human)
            else:
                history_file = "./logs/test/{}_cav_{}_hv_{}_testing_hist.txt".format(
                    model, num_cav, actual_num_human)

            my_dqn.load_weights('./models/dqn_{}.h5f'.format(model_name))
            print("succssfully loaded")
            hist = my_dqn.test(self.env, nb_episodes=num_runs)
            # print(hist.history)

            with open(history_file, 'w') as f:
                json.dump(hist.history, f)
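The observation here is a gym Dict space combining per-vehicle feature rows, an adjacency matrix, and a padding mask for the graph network. A small self-contained check of what such a composed space produces when sampled (sizes are illustrative):

import numpy as np
from gym.spaces import Box, Dict

N, F = 4, 6  # illustrative vehicle count and feature size
obs_space = Dict({
    'states': Box(low=-np.inf, high=np.inf, shape=(N, F), dtype=np.float32),
    'adjacency': Box(low=0, high=1, shape=(N, N), dtype=np.int32),
    'mask': Box(low=0, high=1, shape=(N,), dtype=np.int32),
})

sample = obs_space.sample()
print(sample['states'].shape, sample['adjacency'].shape, sample['mask'].shape)
# -> (4, 6) (4, 4) (4,)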
Example #11
import gym

from agents.dqn.DQNAgent import *

env = gym.make("CartPole-v0")

print("Observation space: {}".format(env.observation_space))
print("Action space: {}".format(env.action_space))

nb_actions = env.action_space.n
observation_shape = env.observation_space.shape

train_policy = DecayEpsGreedyQPolicy(eps_min=0, eps_decay=0.99)

agent = DQNAgent(
    action_space=env.action_space,
    observation_space=env.observation_space,
    train_policy=train_policy,
    dueling_type='max'
)

print("Start training~")
for episode in range(200):
    episode_rewards = 0
    observation = env.reset()
    for step in range(200):
        action = agent.forward(observation)
        
        next_observation, reward, terminal, _ = env.step(action)
        
        agent.backward(observation, action, reward, terminal, next_observation)
        episode_rewards += reward
        observation = next_observation
        if terminal:
            break
Example #12
import os
import errno
import gym
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import logging
logging.basicConfig(format='%(asctime)s %(message)s', level=logging.DEBUG)

import tensorflow as tf
from agents.dqn import DQNAgent, DQNBuffer

# instantiate env
env = gym.make('LunarLander-v2')
dim_obs = env.observation_space.shape
num_act = env.action_space.n
# instantiate agent and replay buffer
agent = DQNAgent()
replay_buffer = DQNBuffer(dim_obs=dim_obs[0], size=int(1e6))
save_dir = './saved_models/' + env.spec.id + '/dqn/' + datetime.now().strftime(
    "%Y-%m-%d-%H-%M") + '/'
if not os.path.exists(save_dir):
    try:
        os.makedirs(save_dir)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
value_net_path = os.path.join(save_dir, 'value_net')
RANDOM_SEED = 0
tf.random.set_seed(RANDOM_SEED)
env.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
env.action_space.seed(RANDOM_SEED)
Example #13
    env_params = {
        'charge': 20,
        'charge_reward': -0.1,
        'crash_reward': -1,
        'delivery_reward': 1,
        'discharge': 10,
        'drone_density': 0.05,
        'dropzones_factor': 2,
        'packets_factor': 3,
        'pickup_reward': 0,
        'skyscrapers_factor': 3,
        'stations_factor': 2,
        "n_drones": NB_AGENTS,
        "rgb_render_rescale": 2.0
    }

    env = WindowedGridView(DeliveryDrones(env_params), radius=3)

    agent = DQNAgent(env,
                     DenseQNetworkFactory(env, hidden_layers=[256, 256]),
                     gamma=0.95,
                     epsilon_start=1.0,
                     epsilon_decay=0.999,
                     epsilon_end=0.01,
                     memory_size=10000,
                     batch_size=64,
                     target_update_interval=5)

    agent.reset()
    agent.save("baseline_models/random-agent-{}.pt".format(_idx))
Example #14
num_episodes = 5000  # 5000
stats = EpisodeStats(episode_lengths=np.zeros(num_episodes),
                     episode_rewards=np.zeros(num_episodes),
                     episode_safety=np.zeros(num_episodes),
                     episode_confidence=np.zeros(num_episodes))

tf.compat.v1.reset_default_graph()
with tf.Session() as sess:
    agent = DQNAgent(sess,
                     world_shape,
                     int(actions_num),
                     env,
                     frames_state=frames_state,
                     experiment_dir=modelLocation,
                     replay_memory_size=20000,  # 10000
                     replay_memory_init_size=3000,  # 3000
                     update_target_estimator_every=1000,  # 500
                     discount_factor=0.99,
                     epsilon_start=epsilon_start,
                     epsilon_end=epsilon_end,
                     epsilon_decay_steps=250000,
                     batch_size=batch_size,
                     worldSize=worldSize)
    
    for i_episode in range(num_episodes):
        # Save the current checkpoint
        if doTraining:
            agent.save()
        else:
            break
        ret = 0
        time_step = env.reset(np.random.choice([1,2,3,4,5]))  # for the description of timestep see ai_safety_gridworlds.environments.shared.rl.environment
Example #15
import gym
import gym_ple

from agents.dqn import DQNAgent
from models.cnn import DuelingCNNModel
from environment import Environment

import torch

env = Environment('FlappyBird-v0')
model = DuelingCNNModel(env.action_space())
agent = DQNAgent(environment=env, model=model)

agent.play()
Example #16
    env = gym.make('CartPole-v0')  # load the specified game environment
    np.random.seed(123)
    env.seed(123)
    nb_actions = env.action_space.n
    actions = np.arange(nb_actions)
    policy = EpsGreedyQPolicy(eps=1.0, eps_decay_rate=0.99, min_eps=0.01)
    memory = Memory(limit=50000, maxlen=1)
    # Initial observation
    obs = env.reset()
    # Initialize the agent
    agent = DQNAgent(actions=actions,
                     memory=memory,
                     update_interval=200,
                     train_interval=1,
                     batch_size=32,
                     memory_interval=1,
                     observation=obs,
                     input_shape=[len(obs)],
                     id=1,
                     name=None,
                     training=True,
                     policy=policy)
    agent.compile()

    result = []
    nb_episodes = 500  # number of episodes
    for episode in range(nb_episodes):
        agent.reset()
        observation = env.reset()  # reset the environment
        observation = deepcopy(observation)
        agent.observe(observation)
        done = False
Example #17
import gym
import torch

from agents.dqn import DQNAgent
from common.wrappers import make_atari, wrap_deepmind, wrap_pytorch
from common.utils import mini_batch_train_frames

env_id = "PongNoFrameskip-v4"
env = make_atari(env_id)
env = wrap_deepmind(env)
env = wrap_pytorch(env)

MAX_FRAMES = 1000000
BATCH_SIZE = 32

agent = DQNAgent(env, use_conv=True)
if torch.cuda.is_available():
    agent.model.cuda()

episode_rewards = mini_batch_train_frames(env, agent, MAX_FRAMES, BATCH_SIZE)
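mini_batch_train_frames is a project helper; a frame-based loop budgets training by total environment frames rather than episodes. A rough sketch of such a loop, assuming the agent exposes get_action/update and a replay buffer with push/__len__ (these names are assumptions, not the helper's real API):

def train_frames(env, agent, max_frames, batch_size):
    episode_rewards, episode_reward = [], 0.0
    state = env.reset()
    for frame in range(max_frames):
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.replay_buffer.push(state, action, reward, next_state, done)
        episode_reward += reward
        state = next_state

        # Update the network once the buffer holds at least one batch.
        if len(agent.replay_buffer) > batch_size:
            agent.update(batch_size)

        if done:
            episode_rewards.append(episode_reward)
            episode_reward = 0.0
            state = env.reset()
    return episode_rewards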
Example #18
# -*- coding: utf-8 -*-
import numpy as np
from turtlebot_env.basic_env import environ
from agents.dqn import DQNAgent
import argparse


def arguments():
    # Placeholder argument parser; the original script presumably adds options here.
    parser = argparse.ArgumentParser()
    return parser.parse_args()


if __name__ == '__main__':
    args = arguments()
    env = environ()
    agent = DQNAgent(env, args)
    agent.train()
Example #19
environmentVectorSize = 10 + (teamSize - 1) * 11 + 3 * opponentsSize
print(environmentVectorSize)
# Next, we build a very simple model.


# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=50000, window_length=1)
policy = BoltzmannQPolicy()

import queue as Q

agents = []
q = Q.Queue()
u2 = team[:]
for i in team:
    d = DQNAgent(i, u2, opponents, actions, actionsEnum,
                 inputV=environmentVectorSize, nb_actions=nb_actions,
                 memory=memory, nb_steps_warmup=10,
                 target_model_update=1e-2, policy=policy)
    d.compile(Adam(lr=1e-3), metrics=['mae'])
    agents.append(d)

agentContainers = []
for i in agents:
    agentContainers.append(AgentContainer(i, i.id, teamSize, opponentsSize, q, [], []))


startPool(agentContainers)



# Join Environment thread
env.end()