Example No. 1
    def __init__(self, alpha, discount, environment, epsilon=0.2):
        super().__init__(alpha, discount, environment)

        self.optimal_policy = policy.GreedyPolicy(environment.state_space,
                                                  environment.action_space,
                                                  self.qvalues)
        self.explore_policy = policy.EpsilonGreedyPolicy(
            environment.state_space, environment.action_space, self.qvalues,
            epsilon)
Example No. 2
    def __init__(self, network, max_len_memory, to_observe, pol, gamma,
                 log_dir, load_prev, game):

        self.env = wrap_dqn(gym.make(game))
        self.env.seed(19)
        # self.action_meaning = self.env.env.get_action_meanings()
        self.env._max_episode_steps = None
        self.model = network
        network.model.summary()
        self.batch_size = 32 * 3
        self.to_observe = to_observe
        self.state_size = network.state_size
        self.action_size = network.action_size
        self.log_dir = log_dir
        self.depth = network.depth
        # self.lives = self.env.env.ale.lives()

        if not os.path.exists(self.log_dir):
            os.makedirs(self.log_dir)

        attr = {
            'batch size': self.batch_size,
            'to observe': self.to_observe,
            'depth': self.depth
        }

        self.results = {'info': attr}

        self.memory = SimpleMemory(max_len=max_len_memory)

        if load_prev:
            path = sorted([
                int(x) for x in os.listdir(self.log_dir)
                if os.path.isdir(os.path.join(self.log_dir, x))
            ])
            if len(path) != 0:
                load_prev = self.load(os.path.join(self.log_dir,
                                                   str(path[-1])))
            else:
                load_prev = False

        if not load_prev:
            if pol is None:
                self.pol = policy.GreedyPolicy()
            else:
                self.pol = pol

            if gamma is None:
                gamma = policy.EpsPolicy(0.95)
            elif isinstance(gamma, float):
                gamma = policy.EpsPolicy(gamma)

            if isinstance(gamma, policy.AnnealedPolicy):
                self.gamma = gamma.linear_step
            elif isinstance(gamma, policy.Policy):
                self.gamma = gamma.get_value
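
The constructor above stores the discount factor as a callable (either `EpsPolicy.get_value` for a fixed value or `AnnealedPolicy.linear_step` for a schedule), so the rest of the agent can query it the same way in both cases. A minimal sketch of that pattern with hypothetical stand-in classes, not the repository's policy module:

# Hypothetical stand-ins for illustration only; the real EpsPolicy / AnnealedPolicy
# classes live in the repository's policy module and are not shown in this snippet.
class FixedGamma:
    def __init__(self, value):
        self.value = value

    def get_value(self):
        return self.value


class AnnealedGamma:
    def __init__(self, start, end, steps):
        self.start, self.end, self.steps = start, end, steps
        self.t = 0

    def linear_step(self):
        # linearly move from start to end over `steps` calls, then stay at end
        frac = min(self.t / self.steps, 1.0)
        self.t += 1
        return self.start + frac * (self.end - self.start)


gamma = FixedGamma(0.95).get_value       # or AnnealedGamma(1.0, 0.95, 10000).linear_step
discounted_return = gamma() * 10.0       # the agent calls gamma() whenever it needs the value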
Example No. 3
def exp_ddqn():
    import matplotlib.pyplot as plt
    eps = 1000

    env = gym.make('CartPole-v0')
    env.seed(19)

    pol = policy.AnnealedPolicy(inner_policy=policy.EpsPolicy(1.0, other_pol=policy.GreedyPolicy()), attr='eps',
                                value_max=1.0, value_min=0.1, value_test=0.5, nb_steps=500)

    log_dir = './logs/prova_pole'+pol.name

    n = models.DenseDQN(log_dir=log_dir, action_size=env.action_space.n, state_size=env.observation_space.shape[0],
                        layer_size=(24, 24), lr=0.001)

    a = Agent(game=env, net=n, log_dir=log_dir, pol=pol)

    r = a.learn(eps, False, 10, verbose=False)
    plt.plot(range(eps), r, label='DQN')

    for i in [50, 100, 200, 300, 500, 750, 1000, 2000, 3000]:
        pol = policy.AnnealedPolicy(inner_policy=policy.EpsPolicy(1.0, other_pol=policy.GreedyPolicy()), attr='eps',
                                    value_max=1.0, value_min=0.1, value_test=0.5, nb_steps=500)

        log_dir = './logs/prova_pole'+pol.name

        n = models.DoubleDQNWrapper(network=models.DenseDQN(log_dir=log_dir, action_size=env.action_space.n,
                                    state_size=env.observation_space.shape[0], layer_size=(24, 24), lr=0.001),
                                    update_time=i)

        a = Agent(game=env, net=n, log_dir=log_dir, pol=pol)

        r = a.learn(eps, False, 10, verbose=False)

        plt.plot(range(eps), r, label='Update time: {}'.format(i))

    plt.legend()
    plt.savefig('exp_ddqn.png')
Example No. 4
    def __init__(self, alpha, discount, env, epsilon=0.2):
        super().__init__(alpha, discount, env)

        ssp = env.state_space
        asp = env.action_space

        self.optimal_policy = policy.GreedyPolicy(ssp, asp, self.qvalues,
                                                  epsilon)
        self.explore_policy = policy.EpsilonGreedyPolicy(
            ssp, asp, self.qvalues, epsilon)
        self.draw_policy = self.optimal_policy

        self.returns = defaultdict(list)
        self.episode = []
Example No. 5
    def __init__(self, alpha, discount, environment):

        self.alpha = alpha
        self.discount = discount

        ssp = environment.state_space
        asp = environment.action_space

        self.action_space = asp
        self.qvalues = np.zeros((ssp, asp), np.float32)

        self.optimal_policy = policy.RandomPolicy(ssp, asp)
        self.explore_policy = self.optimal_policy
        self.draw_policy = policy.GreedyPolicy(ssp, asp, self.qvalues)
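
This base class only allocates the Q-table and the default policies. For reference, the standard tabular update that `alpha`, `discount` and `self.qvalues` are set up for looks like the following (a self-contained sketch, not the repository's actual learning method):

import numpy as np

def q_learning_update(qvalues, state, action, reward, next_state, alpha, discount):
    # Q(s, a) <- Q(s, a) + alpha * (r + discount * max_a' Q(s', a') - Q(s, a))
    target = reward + discount * np.max(qvalues[next_state])
    qvalues[state, action] += alpha * (target - qvalues[state, action])

qvalues = np.zeros((16, 4), np.float32)   # same (state_space, action_space) layout as above
q_learning_update(qvalues, state=0, action=2, reward=1.0, next_state=1,
                  alpha=0.1, discount=0.9)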
Example No. 6
    def evaluate(self, env, num_episodes, max_episode_length=1000000):
        """Test the agent with a provided environment.
        
        If you have any layers that vary in behavior between train/test time
        (such as dropout or batch norm), you should set them to test.

        - Input
          - env: gym.Env
            This is the Atari environment. Need to wrap the
            environment using the wrap_atari_env function in the
            utils.py
          - num_iterations: int
            How many samples/updates to perform.
          - max_episode_length: int
            How long a single episode should last before the agent

        - Output
          - total: float
            the cumulative rewards from all episodes
        """
        total = 0.0
        greedy = policy.GreedyPolicy()

        for episode in range(num_episodes):
            #print total
            observation = env.reset()
            state = self.preprocessor.process_state_for_network(observation)
            state = state[:, :, None]
            history = [
                np.zeros(state.shape),
                np.zeros(state.shape),
                np.zeros(state.shape), state
            ]
            for t in range(max_episode_length):
                env.render()
                state = np.vstack(
                    (history[0], history[1], history[2], history[3]))
                action = self.select_action(state[None, :], greedy)
                observation, reward, done, info = env.step(action)
                state = self.preprocessor.process_state_for_network(
                    observation)
                state = state[:, :, None]
                history = history[1:]
                history.append(state)
                total += reward
                if done:
                    break

        return total
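
The evaluation loop keeps a rolling history of the last four processed frames and stacks them into a single network input. A self-contained sketch of just that frame-stacking pattern (the 84x84 shape is an illustrative assumption, not taken from this snippet):

import numpy as np

frame_shape = (84, 84, 1)                          # illustrative frame shape
history = [np.zeros(frame_shape) for _ in range(4)]

new_frame = np.random.rand(*frame_shape)           # stand-in for a processed observation
history = history[1:] + [new_frame]                # drop the oldest frame, keep the newest four
state = np.vstack(history)                         # stacked input, shape (336, 84, 1)
state = state[None, :]                             # add the batch dimension, as in select_action above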
Example No. 7
    def __init__(self, game, net, max_memory=5000, log_dir='./logs/prova_cartpole', weight_name=None,
                 pol=None, agent_name='agent'):

        self.env = game
        self.env._max_episode_steps = 500
        self.name = agent_name

        self.model = net
        self.memory = deque(maxlen=max_memory)
        self.state_size = net.state_size
        self.action_size = net.action_size
        self.log_dir = os.path.join(log_dir, agent_name)

        #self.model.model.summary()

        if pol is None:
            self.pol = policy.GreedyPolicy()
        else:
            self.pol = pol

        self.episodes_to_watch = 32
        self.batch_size = 32
        self.gamma = 0.95
Example No. 8
    def update_network(self):
        """Update the Q-network.

        """
        p = policy.GreedyPolicy()
        if self.replay:
            # obtain batch from replay memory
            batch = self.memory.sample(self.batch_size)
            batch = self.preprocessor.process_batch(batch)
            size = len(batch)
        else:
            # obtain batch from memory and process batch
            n = len(self.memory)
            batch = list()
            for i in range(3, n - 1):
                state, a, r, nexts, done = self.memory[i]
                state, nexts = self.memory[i - 3][0], self.memory[i - 3][3]
                for x in range(2, -1, -1):
                    state = np.vstack((state, self.memory[i - x][0]))
                    if not done:
                        nexts = np.vstack((nexts, self.memory[i - x][3]))
                    else:
                        nexts = None
                batch.append((state, a, r, nexts, done))
            size = len(batch) - 1

        # obtain data from batch
        inputs, actions, outputs, nextinputs, terminal = batch[0]
        inputs = inputs[None, :]
        if not terminal:
            nextinputs = nextinputs[None, :]
            terminal = self.gamma
        else:
            nextinputs = inputs
            terminal = 0
        for i, sample in enumerate(batch):
            if i == 0:
                continue
            state, action, reward, nexts, is_terminal = sample
            state = state[None, :]
            inputs = np.vstack((inputs, state))
            actions = np.hstack((actions, action))
            outputs = np.hstack((outputs, reward))
            if not is_terminal:
                nexts = nexts[None, :]
                nextinputs = np.vstack((nextinputs, nexts))
                terminal = np.vstack((terminal, self.gamma))
            else:
                nextinputs = np.vstack((nextinputs, state))
                terminal = np.vstack((terminal, 0))

        # calculate target values with the target network
        next_y = self.target_network.predict(nextinputs, batch_size=size)
        # `terminal` already holds gamma (or 0 for terminal transitions),
        # so the discount factor must not be applied a second time here
        next_y = next_y * terminal
        target_y = self.q_network.predict(inputs, batch_size=size)

        # apply double DQN or not
        if self.double:
            target_actions = np.argmax(target_y, axis=1)
            outputs = outputs + next_y[range(next_y.shape[0]), target_actions]
        else:
            outputs = outputs + np.max(next_y, axis=1)

        target_y[range(target_y.shape[0]), actions] = outputs

        # do gradient descent
        self.q_network.fit(inputs, target_y, batch_size=size, verbose=0)
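
For comparison, here is a minimal NumPy sketch of the vanilla and double-DQN target computations that `update_network` assembles its batch for. The arrays are made up for illustration; this is not a drop-in replacement for the method above, which indexes the networks slightly differently:

import numpy as np

gamma = 0.95
rewards = np.array([1.0, 0.0])                           # illustrative batch of rewards
not_done = np.array([[1.0], [0.0]])                      # 1 for non-terminal, 0 for terminal
q_online_next = np.array([[0.2, 0.5], [0.1, 0.3]])       # online network on the next states
q_target_next = np.array([[0.25, 0.45], [0.15, 0.35]])   # target network on the next states

# vanilla DQN: bootstrap with the target network's own maximum
vanilla_targets = rewards + gamma * np.max(q_target_next * not_done, axis=1)

# double DQN: the online network picks the action, the target network scores it
best_actions = np.argmax(q_online_next, axis=1)
double_targets = rewards + gamma * (q_target_next * not_done)[np.arange(2), best_actions]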
Example No. 9
    def __init__(self, config=None):

        if config is None:
            config = {}
        self.env = wrap_dqn(gym.make(config.get('game', 'PongNoFrameskip-v4')))
        self.action_size = self.env.action_space.n

        self.to_vis = config.get('visualize', False)
        self.verbose = config.get('verbose', True)
        self.backup = config.get('backup', 25)
        self.episodes = config.get('episodes', 300)

        self.depth = config.get('depth', 4)
        self.state_size = config.get('space', (84, 84))
        self.model = None
        self._target_model = None

        self.prioritized = config.get('prioritized', False)

        if self.prioritized:
            self.memory = PrioritizedMemory(
                max_len=config.get('mem_size', 100000))
        else:
            self.memory = SimpleMemory(max_len=config.get('mem_size', 100000))

        if config.get('duel', False):
            self.model = self._duel_conv()
        else:
            self.model = self._conv()

        self.model.compile(Adam(lr=config.get('lr', 1e-4)), loss=huber_loss)

        if config.get('target', True):
            self._target_model = clone_model(self.model)
            self._target_model.set_weights(self.model.get_weights())
            self._time = 0
            self.update_time = config.get('target_update', 1000)

        self.env._max_episode_steps = None
        self.batch_size = config.get('batch', 32 * 3)
        self.to_observe = config.get('to_observe', 10000)

        self.log_dir = config['log_dir']
        if not os.path.exists(self.log_dir):
            os.makedirs(self.log_dir)
        plot_model(self.model,
                   to_file=os.path.join(self.log_dir, 'model.png'),
                   show_shapes=True)

        attr = {
            'batch size': self.batch_size,
            'to observe': self.to_observe,
            'depth': self.depth
        }

        self.results = {'info': attr}

        load_prev = config.get('load', False)

        self.gamma = None
        pol = None

        if 'pol' in config:
            if config['pol'] == 'random':
                pol = policy.RandomPolicy()
            elif config['pol'] == 'eps':
                pol = policy.EpsPolicy(config.get('pol_eps', 0.1))

        self.pol = pol

        if load_prev:
            path = sorted([
                int(x) for x in os.listdir(self.log_dir)
                if os.path.isdir(os.path.join(self.log_dir, x))
            ])
            if len(path) != 0:
                load_prev = self.load(os.path.join(self.log_dir,
                                                   str(path[-1])))

        if self.pol is None:
            self.pol = policy.AnnealedPolicy(
                inner_policy=policy.EpsPolicy(1.0,
                                              other_pol=policy.GreedyPolicy()),
                attr='eps',
                value_max=1.0,
                value_min=config.get('ex_min', 0.02),
                value_test=0.5,
                nb_steps=config.get('ex_steps', 100000))
        if self.gamma is None:
            self.gamma = policy.EpsPolicy(float(config.get('gamma',
                                                           0.99))).get_value
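
Everything in this constructor is driven by a plain dictionary. The keys below mirror the `config.get(...)` calls above, with illustrative values; only `log_dir` is read without a default and is therefore required:

config = {
    'game': 'PongNoFrameskip-v4',
    'visualize': False,
    'verbose': True,
    'backup': 25,
    'episodes': 300,
    'depth': 4,
    'space': (84, 84),
    'prioritized': False,
    'mem_size': 100000,
    'duel': False,
    'lr': 1e-4,
    'target': True,
    'target_update': 1000,
    'batch': 96,
    'to_observe': 10000,
    'log_dir': './logs/example_run',   # required: read with config['log_dir']
    'load': False,
    'pol': 'eps',
    'pol_eps': 0.1,
    'ex_min': 0.02,
    'ex_steps': 100000,
    'gamma': 0.99,
}
# agent = ExampleAgent(config)         # hypothetical class name for the constructor above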
Example No. 10
# os.environ['THEANO_FLAGS'] = "device=cuda,floatX=float32"
# os.environ['CPLUS_INCLUDE_PATH'] = '/usr/local/cuda-9.0/include'

import sys
sys.path.append('..')
import policy
from dqn.agent_with_depth_less_memory import ImageAgent as ia_less
from dqn.models_with_depth import DenseDQN, DoubleDQNWrapper, ConvDQM, ConvDDQN

n = ConvDQM(action_size=6, state_size=(84, 84), depth=4, lr=1e-4)

n = DoubleDQNWrapper(n, 10000)

# n = DenseDQN(action_size=3, state_size=6, depth=4, lr=0.001, layer_size=(64, 64))
pol = policy.AnnealedPolicy(inner_policy=policy.EpsPolicy(
    1.0, other_pol=policy.GreedyPolicy()),
                            attr='eps',
                            value_max=1.0,
                            value_min=0.02,
                            value_test=0.5,
                            nb_steps=100000)

agent = ia_less(pol=pol,
                network=n,
                to_observe=10000,
                max_len_memory=100000,
                log_dir='../pong/good_wrappers_DDQN_32x3-8/',
                load_prev=True,
                gamma=0.99)

# agent = ram_less(pol=pol, network=n, to_observe=50000, max_len_memory=1000000,
Example No. 11
def exp_double_duel():
    import matplotlib.pyplot as plt
    eps = 1000

    env = gym.make('CartPole-v0')
    env.seed(19)

    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n


    log_dir = './logs/prova_pole'

    net = models.DenseDQN(log_dir=log_dir, action_size=action_size, state_size=state_size, layer_size=(24, 24),
                          lr=0.001)

    a = Agent(game=env, net=net, log_dir=log_dir, pol=policy.AnnealedPolicy(inner_policy=policy.EpsPolicy(1.0, other_pol=policy.GreedyPolicy()), attr='eps',
                                value_max=1.0, value_min=0.1, value_test=0.5, nb_steps=500))

    r = a.learn(eps, False, 10, verbose=False)
    plt.plot(range(eps), r, label='DQN')

    net = models.DuelDenseDQN(log_dir=log_dir, action_size=action_size, state_size=state_size, layer_size=(24, 24),
                              lr=0.001, layer_size_val=(4, 4))
    env.seed(19)
    a = Agent(game=env, net=net, log_dir=log_dir, pol=policy.AnnealedPolicy(inner_policy=policy.EpsPolicy(1.0, other_pol=policy.GreedyPolicy()), attr='eps',
                                value_max=1.0, value_min=0.1, value_test=0.5, nb_steps=500))

    r = a.learn(eps, False, 10, verbose=False)
    plt.plot(range(eps), r, label='Duel DQN 4 4')

    for i in [50, 100, 200, 300, 500, 750, 1000, 2000, 3000]:

        net = models.DuelDenseDQN(log_dir=log_dir, action_size=action_size, state_size=state_size, layer_size=(24, 24),
                                  lr=0.001, layer_size_val=(4, 4))
        n = models.DoubleDQNWrapper(network=net, update_time=i)

        a = Agent(game=env, net=n, log_dir=log_dir, pol=policy.AnnealedPolicy(inner_policy=policy.EpsPolicy(1.0, other_pol=policy.GreedyPolicy()), attr='eps',
                                value_max=1.0, value_min=0.1, value_test=0.5, nb_steps=500))

        r = a.learn(eps, False, 10, verbose=False)

        plt.plot(range(eps), r, label='Double Duel DQN 4 4 '+str(i))

    plt.legend()
    plt.savefig('exp_double_duel.png')
Example No. 12
def exp_duel():
    import matplotlib.pyplot as plt
    eps = 1000

    env = gym.make('CartPole-v0')
    env.seed(19)

    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    pol = policy.AnnealedPolicy(inner_policy=policy.EpsPolicy(1.0, other_pol=policy.GreedyPolicy()), attr='eps',
                                value_max=1.0, value_min=0.1, value_test=0.5, nb_steps=500)

    log_dir = './logs/prova_pole' + pol.name

    net = models.DenseDQN(log_dir=log_dir, action_size=action_size, state_size=state_size, layer_size=(24, 24),
                          lr=0.001)

    a = Agent(game=env, net=net, log_dir=log_dir, pol=pol)

    r = a.learn(eps, False, 10, verbose=False)
    print(r[-1])
    plt.plot(range(eps), r, label='DQN')

    pol = policy.AnnealedPolicy(inner_policy=policy.EpsPolicy(1.0, other_pol=policy.GreedyPolicy()), attr='eps',
                                value_max=1.0, value_min=0.1, value_test=0.5, nb_steps=500)

    net = models.DuelDenseDQN(log_dir=log_dir, action_size=action_size, state_size=state_size, layer_size=(24, 24),
                              lr=0.001, layer_size_val=(12, 12))
    env.seed(19)
    a = Agent(game=env, net=net, log_dir=log_dir, pol=pol)

    r = a.learn(eps, False, 10, verbose=False)
    print(r[-1])
    plt.plot(range(eps), r, label='Duel DQN 12 12')

    pol = policy.AnnealedPolicy(inner_policy=policy.EpsPolicy(1.0, other_pol=policy.GreedyPolicy()), attr='eps',
                                value_max=1.0, value_min=0.1, value_test=0.5, nb_steps=500)

    net = models.DuelDenseDQN(log_dir=log_dir, action_size=action_size, state_size=state_size, layer_size=(24, 24),
                              lr=0.001, layer_size_val=(8, 8))
    env.seed(19)
    a = Agent(game=env, net=net, log_dir=log_dir, pol=pol)

    r = a.learn(eps, False, 10, verbose=False)
    print(r[-1])
    plt.plot(range(eps), r, label='Duel DQN 8 8')

    pol = policy.AnnealedPolicy(inner_policy=policy.EpsPolicy(1.0, other_pol=policy.GreedyPolicy()), attr='eps',
                                value_max=1.0, value_min=0.1, value_test=0.5, nb_steps=500)

    net = models.DuelDenseDQN(log_dir=log_dir, action_size=action_size, state_size=state_size, layer_size=(24, 24),
                              lr=0.001, layer_size_val=(4, 4))

    a = Agent(game=env, net=net, log_dir=log_dir, pol=pol)

    r = a.learn(eps, False, 10, verbose=False)
    plt.plot(range(eps), r, label='Duel DQN 4 4')
    print(r[-1])

    pol = policy.AnnealedPolicy(inner_policy=policy.EpsPolicy(1.0, other_pol=policy.GreedyPolicy()), attr='eps',
                                value_max=1.0, value_min=0.1, value_test=0.5, nb_steps=500)

    net = models.DuelDenseDQN(log_dir=log_dir, action_size=action_size, state_size=state_size, layer_size=(24, 24),
                              lr=0.001, layer_size_val=(24, 24))

    a = Agent(game=env, net=net, log_dir=log_dir, pol=pol)

    r = a.learn(eps, False, 10, verbose=False)
    plt.plot(range(eps), r, label='Duel DQN 24 24')
    print(r[-1])
    plt.legend()
    plt.savefig('exp_duel.png')
Example No. 13
    def __init__(self,
                 network,
                 max_len_memory=20000,
                 to_observe=5000,
                 pol=None,
                 gamma=None,
                 log_dir='',
                 load_prev=False,
                 game='Breakout-ramDeterministic-v4'):

        self.env = gym.make(game)
        self.env.seed(19)
        print(self.env.observation_space.shape[0], self.env.action_space.n,
              self.env.env.get_action_meanings())
        self.action_meaning = self.env.env.get_action_meanings()
        print(network.model.summary())
        self.no_op_ep = 30
        self.env._max_episode_steps = None
        self.model = network
        self.batch_size = 32
        self.to_observe = to_observe
        self.state_size = network.state_size
        self.action_size = network.action_size
        self.log_dir = log_dir
        self.depth = network.depth

        if not os.path.exists(self.log_dir):
            os.makedirs(self.log_dir)

        attr = {
            'batch size': self.batch_size,
            'to observe': self.to_observe,
            'depth': self.depth,
            'no_op_ep': 30
        }

        self.results = {'info': attr}

        self.memory = PrioritizedMemory(max_len=max_len_memory)

        if load_prev:
            path = sorted([
                int(x) for x in os.listdir(self.log_dir)
                if os.path.isdir(os.path.join(self.log_dir, x))
            ])
            if len(path) != 0:
                load_prev = self.load(os.path.join(self.log_dir,
                                                   str(path[-1])))
            else:
                load_prev = False

        if not load_prev:
            if pol is None:
                self.pol = policy.GreedyPolicy()
            else:
                self.pol = pol

            if gamma is None:
                gamma = policy.EpsPolicy(0.99)

            if isinstance(gamma, policy.AnnealedPolicy):
                self.gamma = gamma.linear_step
            elif isinstance(gamma, policy.Policy):
                self.gamma = gamma.get_value