Example #1
    def __init__(self,
                 model,
                 policy=None,
                 test_policy=None,
                 varTH=1e-5,
                 *args,
                 **kwargs):
        super(ADFQAgent, self).__init__(*args, **kwargs)

        # Validate (important) input.
        if hasattr(model.output, '__len__') and len(model.output) > 1:
            raise ValueError(
                'Model "{}" has more than one output. ADFQ expects a model that has a single output.'
                .format(model))
        if model.output._keras_shape != (None, 2 * self.nb_actions):
            raise ValueError(
                'Model output "{}" has invalid shape. ADFQ expects a model with two outputs (mean and variance) per action, in this case {} in total.'
                .format(model.output, 2 * self.nb_actions))

        # Related objects.
        self.model = model
        if policy is None:
            policy = EpsGreedyQPolicy()
        if test_policy is None:
            test_policy = GreedyQPolicy()
        self.policy = policy
        self.test_policy = test_policy
        self.varTH = np.float32(varTH)

        # State.
        self.reset_states()
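
For context, the model here must output 2 * nb_actions values because ADFQ maintains a Gaussian belief (a mean and a variance) over each action's Q-value. Below is a standalone numpy sketch of splitting one such output vector, assuming means come first and variances second; the exact layout is an assumption, not taken from this snippet.

import numpy as np

nb_actions = 4
varTH = 1e-5
raw = np.random.randn(2 * nb_actions).astype(np.float32)  # stand-in for one network output row
q_means = raw[:nb_actions]                                 # assumed mean estimate per action
q_vars = np.maximum(raw[nb_actions:], varTH)               # assumed variances, floored in the spirit of varTH above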
Example #2
def make_dqn_rl_agent(processor: Processor_56x5,
                      nbr_layers=2,
                      enable_dueling_network: bool = False,
                      enable_double_dqn: bool = True):
    """
    
    :param processor: 
    :param nbr_layers: 
    :param enable_dueling_network:
    :param enable_double_dqn:
    :return: 
    """

    model = processor.create_model(nbr_layers=nbr_layers)
    test_policy = GreedyQPolicy()
    memory = SequentialMemory(limit=50000, window_length=1)

    dqn_agent = DQNAgent(model=model,
                         nb_actions=NBR_TICHU_ACTIONS,
                         memory=memory,
                         nb_steps_warmup=100,
                         target_model_update=1e-2,
                         test_policy=test_policy,
                         processor=processor,
                         enable_dueling_network=enable_dueling_network,
                         enable_double_dqn=enable_double_dqn)
    dqn_agent.compile(Adam(lr=1e-3), metrics=['mae'])
    return dqn_agent
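
A minimal usage sketch for an agent built this way, assuming a Gym-compatible Tichu environment object env and a Processor_56x5 instance processor are created elsewhere in the project (neither is shown in this snippet):

agent = make_dqn_rl_agent(processor, nbr_layers=3, enable_double_dqn=True)
agent.fit(env, nb_steps=50000, visualize=False, verbose=1)   # train
agent.save_weights('dqn_tichu_weights.h5f', overwrite=True)  # persist weights
agent.test(env, nb_episodes=5, visualize=False)              # greedy evaluation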
Example #3
    def __init__(self,
                 model,
                 nb_actions,
                 policy=None,
                 test_policy=None,
                 gamma=.99,
                 nb_steps_warmup=10,
                 train_interval=1,
                 delta_clip=np.inf,
                 *args,
                 **kwargs):
        super(SarsaAgent, self).__init__(*args, **kwargs)

        # Do not use defaults in constructor because that would mean that each instance shares the same
        # policy.
        if policy is None:
            policy = EpsGreedyQPolicy()
        if test_policy is None:
            test_policy = GreedyQPolicy()

        self.model = model
        self.nb_actions = nb_actions
        self.policy = policy
        self.test_policy = test_policy
        self.gamma = gamma
        self.nb_steps_warmup = nb_steps_warmup
        self.train_interval = train_interval

        self.delta_clip = delta_clip
        self.compiled = False
        self.actions = None
        self.observations = None
        self.rewards = None
Example #4
def build_agent(observation_space_shape, num_actions):
    # Experience replay
    WARMUP_STEPS = 1000  # Collect this many steps before experience replay training starts
    MEM_LIMIT = 1000  # Max number of steps to store
    MEM_WINDOW_LEN = 1  # Experience window of length 1 (single step)
    # Target network
    TARGET_MODEL_UPD_RATE = 1e-2  # Update target network with this rate
    # Build network, exp. replay and policy
    model = build_model(observation_space_shape, num_actions)
    replay_memory = SequentialMemory(limit=MEM_LIMIT,
                                     window_length=MEM_WINDOW_LEN)
    policy = GreedyQPolicy()

    # Finally build the agent
    GAMMA = 1
    #dqn = DQNAgent(model=model, gamma=GAMMA, nb_actions=num_actions, memory=replay, nb_steps_warmup=WARMUP_STEPS,
    #              target_model_update=TARGET_MODEL_UPD_RATE, policy=policy)
    # dqn.compile(Adam(lr=1e-3), metrics=['mae'])
    reinforce = REINFORCE(model,
                          replay_memory,
                          GAMMA,
                          batch_size=1,
                          nb_steps_warmup=WARMUP_STEPS)
    reinforce.compile(optimizer='sgd', metrics=['mae'])
    return reinforce
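
A hedged usage sketch, assuming the custom REINFORCE class follows the usual keras-rl Agent interface (which its compile call above suggests) and that env is a Gym-style environment:

agent = build_agent(env.observation_space.shape, env.action_space.n)
agent.fit(env, nb_steps=20000, visualize=False, verbose=1)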
Example #5
    def __init__(self, model, policy=None, test_policy=None, enable_double_dqn=False, enable_dueling_network=False,
                 dueling_type='avg', *args, **kwargs):
        super(ADFQAgent, self).__init__(*args, **kwargs)

        # Validate (important) input.
        if hasattr(model.output, '__len__') and len(model.output) > 1:
            raise ValueError('Model "{}" has more than one output. ADFQ expects a model that has a single output.'.format(model))
        if model.output._keras_shape != (None, self.nb_actions*2):
            raise ValueError('Model output "{}" has invalid shape. ADFQ expects a model with two outputs (mean and variance) per action, in this case {} in total.'.format(model.output, self.nb_actions*2))
        print("ADFQ")
        # Parameters.
        self.enable_double_dqn = enable_double_dqn
        self.enable_dueling_network = enable_dueling_network
        self.dueling_type = dueling_type
        if self.enable_dueling_network:
            # Not a true dueling network, just a separate head: it would take the
            # second-to-last layer of the model and drop the last layer.
            raise NotImplementedError

        # Related objects.
        self.model = model
        if policy is None:
            policy = EpsGreedyQPolicy()
        if test_policy is None:
            test_policy = GreedyQPolicy()
        self.policy = policy
        self.test_policy = test_policy

        # State.
        self.reset_states()
Example #6
def parse_policy(args) -> Policy:
    pol: Policy = EpsGreedyQPolicy()
    if args.policy == 'LinearAnnealedPolicy':
        pol = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                                   attr='eps',
                                   value_max=1.,
                                   value_min=.1,
                                   value_test=0.05,
                                   nb_steps=args.zeta_nb_steps)
    if args.policy == 'SoftmaxPolicy':
        pol = SoftmaxPolicy()
    if args.policy == 'EpsGreedyQPolicy':
        pol = EpsGreedyQPolicy()
    if args.policy == 'GreedyQPolicy':
        pol = GreedyQPolicy()
    if args.policy == 'BoltzmannQPolicy':
        pol = BoltzmannQPolicy()
    if args.policy == 'MaxBoltzmannQPolicy':
        pol = MaxBoltzmannQPolicy()
    if args.policy == 'BoltzmannGumbelQPolicy':
        pol = BoltzmannGumbelQPolicy()
    if args.policy == 'ZetaPolicy':
        pol = ZetaPolicy(zeta_nb_steps=args.zeta_nb_steps, eps=args.eps)

    return pol
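
A small illustration of how the args object consumed above might be produced; the argument names mirror the attributes read in parse_policy, and the default values are illustrative only:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--policy', default='EpsGreedyQPolicy')
parser.add_argument('--zeta_nb_steps', type=int, default=100000)
parser.add_argument('--eps', type=float, default=0.1)
args = parser.parse_args(['--policy', 'LinearAnnealedPolicy'])
policy = parse_policy(args)  # -> LinearAnnealedPolicy wrapping EpsGreedyQPolicy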
Example #7
    def __init__(self, model, policy=None, test_policy=None, enable_double_dqn=False, enable_dueling_network=False,
                 dueling_type='avg', *args, **kwargs):
        super(DQNAgent, self).__init__(*args, **kwargs)

        # Validate (important) input.
        if hasattr(model.output, '__shape__') and len(model.output.shape) > 2:
            raise ValueError(
                'Model "{}" has more than one output. DQN expects a model that has a single output.'.format(model))
        if model.output._keras_shape != (None, self.nb_actions):
            raise ValueError(
                'Model output "{}" has invalid shape. DQN expects a model that has one dimension for each action, in this case {}.'.format(
                    model.output, self.nb_actions))

        # Parameters.
        self.enable_double_dqn = enable_double_dqn
        self.enable_dueling_network = enable_dueling_network
        self.dueling_type = dueling_type
        if self.enable_dueling_network:
            # get the second last layer of the model, abandon the last layer
            layer = model.layers[-2]
            nb_action = model.output._keras_shape[-1]
            # layer y has a shape (nb_action+1,)
            # y[:,0] represents V(s;theta)
            # y[:,1:] represents A(s,a;theta)
            y = Dense(nb_action + 1, activation='linear')(layer.output)
            # calculate the Q(s,a;theta)
            # dueling_type == 'avg'
            # Q(s,a;theta) = V(s;theta) + (A(s,a;theta)-Avg_a(A(s,a;theta)))
            # dueling_type == 'max'
            # Q(s,a;theta) = V(s;theta) + (A(s,a;theta)-max_a(A(s,a;theta)))
            # dueling_type == 'naive'
            # Q(s,a;theta) = V(s;theta) + A(s,a;theta)
            if self.dueling_type == 'avg':
                outputlayer = Lambda(
                    lambda a: K.expand_dims(a[:, 0], -1) + a[:, 1:] - K.mean(a[:, 1:], axis=1, keepdims=True),
                    output_shape=(nb_action,))(y)
            elif self.dueling_type == 'max':
                outputlayer = Lambda(
                    lambda a: K.expand_dims(a[:, 0], -1) + a[:, 1:] - K.max(a[:, 1:], axis=1, keepdims=True),
                    output_shape=(nb_action,))(y)
            elif self.dueling_type == 'naive':
                outputlayer = Lambda(lambda a: K.expand_dims(a[:, 0], -1) + a[:, 1:], output_shape=(nb_action,))(y)
            else:
                assert False, "dueling_type must be one of {'avg','max','naive'}"

            model = Model(inputs=model.input, outputs=outputlayer)

        # Related objects.
        self.model = model
        if policy is None:
            policy = EpsGreedyQPolicy()
        if test_policy is None:
            test_policy = GreedyQPolicy()
        self.policy = policy
        self.test_policy = test_policy

        # State.
        self.reset_states()
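
As a standalone numpy illustration of the 'avg' aggregation above, Q(s,a) = V(s) + (A(s,a) - mean_a A(s,a)):

import numpy as np

y = np.array([[1.0, 2.0, 4.0, 6.0]])         # y[:, 0] = V(s), y[:, 1:] = A(s, a)
v, a = y[:, :1], y[:, 1:]
q = v + (a - a.mean(axis=1, keepdims=True))  # Q(s, a) for dueling_type == 'avg'
print(q)                                     # [[-1.  1.  3.]]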
Example #8
def make_sarsa_rl_agent(processor: Processor_56x5, nbr_layers=2):
    model = processor.create_model(nbr_layers=nbr_layers)
    test_policy = GreedyQPolicy()

    sarsa_agent = SarsaAgent(model=model,
                             nb_actions=NBR_TICHU_ACTIONS,
                             nb_steps_warmup=10,
                             gamma=0.99,
                             test_policy=test_policy,
                             processor=processor)
    sarsa_agent.compile(Adam(lr=1e-3), metrics=['mae'])
    return sarsa_agent
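
A minimal usage sketch for the SARSA agent, assuming a Gym-compatible Tichu environment env and a Processor_56x5 instance processor (SARSA is on-policy, so no replay memory is configured here):

sarsa = make_sarsa_rl_agent(processor, nbr_layers=2)
sarsa.fit(env, nb_steps=50000, visualize=False, verbose=1)
sarsa.test(env, nb_episodes=5, visualize=False)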
Example #9
        def __init__(self,
                     model,
                     policy=None,
                     test_policy=None,
                     enable_double_dqn=False,
                     enable_dueling_network=False,
                     dueling_type='avg',
                     *args,
                     **kwargs):
            super(DQNAgent, self).__init__(*args, **kwargs)

            if model.output._keras_shape != (None, self.nb_actions):
                raise ValueError(
                    f'Model output "{model.output}" has invalid shape. DQN expects '
                    f'a model that has one dimension for each action, in this case {self.nb_actions}.'
                )

            self.enable_double_dqn = enable_double_dqn
            self.enable_dueling_network = enable_dueling_network
            self.dueling_type = dueling_type
            if self.enable_dueling_network:
                layer = model.layers[-2]
                nb_action = model.output._keras_shape[-1]
                y = Dense(nb_action + 1, activation='linear')(layer.output)
                if self.dueling_type == 'avg':
                    outputlayer = Lambda(
                        lambda a: K.expand_dims(a[:, 0], -1) + a[:, 1:] -
                        K.mean(a[:, 1:], axis=1, keepdims=True),
                        output_shape=(nb_action, ))(y)
                elif self.dueling_type == 'max':
                    outputlayer = Lambda(
                        lambda a: K.expand_dims(a[:, 0], -1) + a[:, 1:] -
                        K.max(a[:, 1:], axis=1, keepdims=True),
                        output_shape=(nb_action, ))(y)
                elif self.dueling_type == 'naive':
                    outputlayer = Lambda(
                        lambda a: K.expand_dims(a[:, 0], -1) + a[:, 1:],
                        output_shape=(nb_action, ))(y)
                else:
                    assert False, "dueling_type must be one of {'avg','max','naive'}"
                model = Model(inputs=model.input, outputs=outputlayer)

            self.model = model
            if policy is None:
                policy = EpsGreedyQPolicy()
            if test_policy is None:
                test_policy = GreedyQPolicy()
            self.policy = policy
            self.test_policy = test_policy
            self.reset_states()
Example #10
    def compile_agent(self):
        # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
        # even the metrics!
        processor = DistopiaProcessor(self.num_blocks, self.num_actions)
        #memory = SequentialMemory(limit=50000, window_length=1)
        #policy = PatchedBoltzmannQPolicy(num_actions = self.num_actions, num_blocks = self.num_blocks)
        #test_policy = PatchedGreedyQPolicy(num_actions = self.num_actions, num_blocks = self.num_blocks)
        policy = BoltzmannQPolicy()
        test_policy = GreedyQPolicy()
        self.sarsa = SARSAAgent(model=self.model,
                                processor=processor,
                                nb_actions=self.nb_actions,
                                nb_steps_warmup=1000,
                                policy=policy,
                                test_policy=test_policy,
                                gamma=0.9)
        self.sarsa.compile(Adam(lr=1e-3), metrics=['mae'])
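
For reference, BoltzmannQPolicy samples actions with probability proportional to exp(Q / tau); a standalone numpy illustration using keras-rl's default temperature tau = 1.0:

import numpy as np

q_values = np.array([1.0, 2.0, 3.0])
tau = 1.0
probs = np.exp(q_values / tau) / np.sum(np.exp(q_values / tau))
print(probs)  # approx. [0.09, 0.24, 0.67]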
Example #11
#model = networks.lstm_network(window_length, input_shape[0], nb_actions)

####################################################################

memory = SequentialMemory(limit=memory_limit, window_length=window_length)

####################################################################

policy = EpsGreedyQPolicy(eps=eps)
policy = LinearAnnealedPolicy(policy,
                              attr='eps',
                              value_max=eps,
                              value_min=0,
                              value_test=0,
                              nb_steps=nb_steps)
test_policy = GreedyQPolicy()

####################################################################

dqn = DQNAgent(model=model,
               nb_actions=nb_actions,
               memory=memory,
               nb_steps_warmup=window_length + batch_size,
               target_model_update=0.02,
               policy=policy,
               test_policy=test_policy,
               batch_size=batch_size,
               train_interval=train_interval,
               gamma=gamma)

dqn.compile(Adam(lr=0.00025), metrics=['mae'])
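
The LinearAnnealedPolicy above decays eps linearly from value_max down to value_min over nb_steps; a standalone sketch of the equivalent schedule with illustrative numbers (the snippet itself anneals from eps to 0):

value_max, value_min, nb_steps = 1.0, 0.0, 10000
step = 2500
eps_now = max(value_min, value_max - (value_max - value_min) * step / nb_steps)
print(eps_now)  # 0.75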
Example #12
    def __init__(self, inputs, buffer, sess_id, sess, **kwargs):
        self.util = Utility()
        self.sess = sess
        self.sess_id = sess_id

        game = inputs['game']
        agnt = inputs['agent']
        sess = agnt['session']
        eps = sess['episode']
        mod = inputs['model']
        trn = mod['training']
        sv = mod['save']
        mem = inputs['memory']
        '''---Environment Parameters---'''
        self.env_name = game['name']
        self.fps = game['fps']
        self.mode = game['difficulty']
        self.target = game['target']
        self.tick = game['tick']
        '''---Episode Parameters---'''
        self.nb_episodes = sess['max_ep']
        self.nb_max_episode_steps = game['fps'] * 60 * eps['max_time']
        self.nb_steps = self.nb_max_episode_steps * self.nb_episodes
        self.nb_steps_warmup = trn['warmup']
        self.nb_max_start_steps = trn['max_ep_observe']
        self.max_start_steps = trn['warmup']
        self.keep_gif_score = eps['keep_gif_score']
        '''---Agent / Model Parameters---'''
        self.name = agnt['name']
        self.nb_actions = agnt['action_size']
        self.delta_clip = agnt['delta_clip']

        self.training = trn['training']
        self.verbose = trn['verbose']
        self.lr = trn['learn_rate']
        self.eps = trn['initial_epsilon']
        self.value_max = trn['initial_epsilon']
        self.value_min = trn['terminal_epsilon']
        self.anneal = trn['anneal']
        self.shuffle = trn['shuffle']
        self.train_interval = trn['interval']
        self.validate = trn['validate']
        self.split = trn['split']
        self.action_repetition = trn['action_repetition']
        self.epochs = trn['epochs']
        self.epoch = 1

        prec = km.binary_precision()
        re = km.binary_recall()
        f1 = km.binary_f1_score()
        self.metrics = ['accuracy', 'mse', prec, re, f1]
        self.H = mod['filter_size']
        self.alpha = mod['alpha']
        self.gamma = mod['gamma']
        self.momentum = mod['momentum']
        self.decay = mod['decay']
        self.target_model_update = mod['target_update']
        self.type = mod['type']
        self.enable_double_dqn = mod['double_dqn']
        self.enable_dueling_network = mod['dueling_network']
        self.dueling_type = mod['dueling_type']

        self.limit = mem['limit']
        self.batch_size = mem['batch_size']
        self.window_length = mem['state_size']
        self.memory_interval = mem['interval']

        self.ftype = sv['ftype']

        self.vizualize = sv['visualize']
        self.save_full = sv['save_full']
        self.save_weights = sv['save_weights']
        self.save_json = sv['save_json']
        self.save_plot = sv['save_plot']
        self.save_interval = sv['save_n']
        self.log_interval = sv['log_n']
        self.saves = sv['save_path']
        self.save_path = self.util.get_save_dir_struct(self.saves,
                                                       self.env_name)
        self.logs = sv['log_path']
        self.util.display_status('Hyperparameters Successfully Loaded')
        '''Reference/Excerpt:  keras-rl DQN Atari Example
        https://github.com/keras-rl/keras-rl/blob/master/examples/dqn_atari.py
        # Select a policy. 
        # We use eps-greedy action selection, which means that a random action
        # is selected with probability eps. We anneal eps from init to term over 
        # the course of (anneal) steps. This is done so that the agent initially 
        # explores the environment (high eps) and then gradually sticks to 
        # what it knows (low eps). We also set a dedicated eps value that is 
        # used during testing. Note that we set it to 0.05 so that the agent 
        # still performs some random actions. 
        # This ensures that the agent cannot get stuck.
        # '''
        self.custom_model_objects = {
            'S': self.window_length,
            'A': self.nb_actions,
            'H': self.H,
            'lr': self.lr,
            'name': self.name,
            'batch_size': self.batch_size,
            'sess': self.sess,
            #dueling_network=self.enable_dueling_network,
            #dueling_type=self.dueling_type,
        }

        with tf.device(gpu):
            self.policy = LinearAnnealedPolicy(
                inner_policy=EpsGreedyQPolicy(eps=self.value_max),
                attr='eps',
                value_max=self.value_max,
                value_min=self.value_min,
                value_test=self.alpha,
                nb_steps=self.anneal)
            self.test_policy = GreedyQPolicy()

            if mod['optimizer'].lower() == 'adamax':
                self.optimizer = Adamax(lr=self.lr)
            elif mod['optimizer'].lower() == 'adadelta':
                self.optimizer = Adadelta()
            elif mod['optimizer'].lower() == 'rmsprop':
                self.optimizer = RMSprop()
            elif mod['optimizer'].lower() == 'sgd':
                self.optimizer = SGD(
                    lr=self.lr,
                    momentum=self.momentum,
                    decay=self.decay,
                )
            else:
                self.optimizer = Adam(lr=self.lr)

        self.memory = buffer

        self.log_path = self.util.get_log_dir_struct(self.sess_id, self.logs,
                                                     self.ftype)

        self.util.display_status('Keras GPU Session {} Beginning'.format(
            self.sess_id))

        nn = NeuralNet(
            S=self.window_length,
            A=self.nb_actions,
            H=self.H,
            lr=self.lr,
            name=self.name,
            batch_size=self.batch_size,
            dueling_network=self.enable_dueling_network,
            dueling_type=self.dueling_type,
            sess=self.sess,
        )
        with tf.device(gpu):
            self.model = nn.get_model()

        self.util.display_status(
            '{} Keras Agent with {} Optimizer Built'.format(
                self.name, mod['optimizer']))
        '''---Compile the model with the chosen optimizer;
        the loss is calculated with a lambda function based on the model
        type selections (dueling or double DQN)'''
        with tf.device(gpu):
            self.compile(
                optimizer=self.optimizer,
                metrics=self.metrics,
            )

        self.util.display_status(
            '{} Agent Fully Initialized with Compiled Model'.format(self.name))

        super(BetaFlapDQN, self).__init__(
            model=self.model,
            nb_actions=self.nb_actions,
            memory=self.memory,
            policy=self.policy,
            test_policy=self.test_policy,
            enable_double_dqn=self.enable_double_dqn,
            enable_dueling_network=self.enable_dueling_network,
            dueling_type=self.dueling_type,
            **kwargs)
Example #13
def startDummy(env, Comm, tryHard=False):
    
    nb_actions = env.action_space.n


    layer0Size = 4096
    layer1Size = 4096
    layer2Size = 4096
    layer3Size = 0
    layer4Size = 0
    layer5Size = 1

    # Next, we build a very simple model. 
    model = Sequential()
    model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    model.add(Dense(layer0Size))
    model.add(LeakyReLU(alpha=0.003))
    model.add(Dense(layer1Size))
    model.add(LeakyReLU(alpha=0.003))
    model.add(Dense(layer2Size))
    model.add(LeakyReLU(alpha=0.003))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))

    #A little diagnosis of the model summary
    print(model.summary())

    # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
    # even the metrics!
    memory = SequentialMemory(limit=800000, window_length=1)
    policy = GreedyQPolicy()
    dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, policy=policy, enable_dueling_network=True)
    dqn.compile(nadam(lr=0.001), metrics=['mae']) 

    # Load previously trained weights before running inference.
    previousfileLength = 0
    load_file_number = 39
    loadFile = "Larger_Memeory_BOARDSIZE_" + str(max_board_size) + "_DQN_LAYERS_" + str(layer0Size) + "_" + str(layer1Size) + "_" + str(layer2Size) + "_" + str(layer3Size) + "_" + str(layer4Size) + "_" + str(layer5Size) +  "_SAVENUMBER_" + str(load_file_number) + ".h5f"
    dqn.load_weights(loadFile)
        
    while True:
        data = None
        while data is None:
            data = Comm.getNewData()
        observation, notUsed, currSafeMoves, headButtSafeMoves, noStuckMoves, foodMoves = env.findObservation(data=data)
        action = dqn.forward(observation)
        if action == 0:
            moveChosen = 'left'
        elif action == 1:
            moveChosen = 'right'
        elif action == 2:
            moveChosen = 'up'
        elif action == 3:
            moveChosen = 'down'
        if moveChosen not in currSafeMoves and len(currSafeMoves) > 0:
            moveChosen = choice(currSafeMoves)
        if moveChosen not in noStuckMoves and len(noStuckMoves) > 0:
            moveChosen = choice(noStuckMoves)
        if moveChosen not in headButtSafeMoves and len(headButtSafeMoves) > 0:
            moveChosen = choice(headButtSafeMoves)
        
        if moveChosen not in foodMoves and len(foodMoves) > 0:
            moveChosen = choice(foodMoves)


        Comm.giveNewMove(moveChosen)
Example #14
    def _build_dqn_agent(self, params):
        NB_ACTIONS = 7

        # ----------------------------------------------------------------------------------------------------------------
        inputShape = (params['width'], params['height'], 3)

        model = Sequential()
        model.add(
            Conv2D(16, (3, 3),
                   input_shape=inputShape,
                   padding='same',
                   activation='relu'))
        model.add(Conv2D(32, (3, 3), padding='same', activation='relu'))
        model.add(MaxPooling2D(pool_size=(2, 2), padding='same'))
        model.add(NoisyNetDense(16, activation='linear'))
        model.add(Flatten())
        model.add(NoisyNetDense(NB_ACTIONS, activation='linear'))

        model.summary()
        # ----------------------------------------------------------------------------------------------------------------

        # Memory replay
        if not params['prio_memory']:
            print("Using Sequential memory")
            memory = SequentialMemory(limit=params['mem_size'],
                                      window_length=1)
        else:
            print("Using Prioritized memory")
            params['lr'] = params['lr'] / 4
            memory = PrioritizedMemory(limit=params['mem_size'],
                                       alpha=0.6,
                                       start_beta=0.5,
                                       end_beta=1.0,
                                       steps_annealed=params['annealing'],
                                       window_length=1)

        # Epsilon Greedy policy, linearly decreasing
        if not params['noisy_layer']:
            print("Using Annealed Eps Greedy policy")
            self.policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                                               attr='eps',
                                               value_max=params['eps'],
                                               value_min=params['eps_final'],
                                               value_test=0.0,
                                               nb_steps=params['annealing'])

        # Or Greedy policy in case of noisy layers
        else:
            print("Using Q Greedy policy (with noisy layer)")
            self.policy = GreedyQPolicy()

        # Keras DQN agent
        self._dqn = DQNAgent(
            model=model,
            nb_actions=NB_ACTIONS,
            policy=self.policy,
            memory=memory,
            batch_size=params['batch_size'],
            processor=WindowProcessor(),
            enable_double_dqn=True,
            enable_dueling_network=True,
            nb_steps_warmup=params['train_start'],
            gamma=params['discount'],
            target_model_update=1000,
            train_interval=1,
            delta_clip=1.,
            custom_model_objects={"NoisyNetDense": NoisyNetDense})

        self._dqn.compile(Adam(lr=params['lr']), metrics=['mae'])

        if params['load_file']:
            print("file loaded")
            self._dqn.load_weights(params['load_file'])
Example #15
    def __init__(self,
                 model,
                 turn_left_agent,
                 go_straight_agent,
                 turn_right_agent,
                 policy=None,
                 test_policy=None,
                 enable_double_dqn=False,
                 enable_dueling_network=False,
                 dueling_type='avg',
                 *args,
                 **kwargs):
        super(DQNAgent4Hrl, self).__init__(*args, **kwargs)

        # Parameters.
        self.enable_double_dqn = enable_double_dqn
        self.enable_dueling_network = enable_dueling_network
        self.dueling_type = dueling_type
        if self.enable_dueling_network:
            # get the second last layer of the model, abandon the last layer
            layer = model.layers[-2]
            nb_action = model.output._keras_shape[-1]
            # layer y has a shape (nb_action+1,)
            # y[:,0] represents V(s;theta)
            # y[:,1:] represents A(s,a;theta)
            y = layers.Dense(nb_action + 1, activation='linear')(layer.output)
            # calculate the Q(s,a;theta)
            # dueling_type == 'avg'
            # Q(s,a;theta) = V(s;theta) + (A(s,a;theta)-Avg_a(A(s,a;theta)))
            # dueling_type == 'max'
            # Q(s,a;theta) = V(s;theta) + (A(s,a;theta)-max_a(A(s,a;theta)))
            # dueling_type == 'naive'
            # Q(s,a;theta) = V(s;theta) + A(s,a;theta)
            if self.dueling_type == 'avg':
                outputlayer = Lambda(
                    lambda a: tf.expand_dims(a[:, 0], -1) + a[:, 1:] - tf.
                    reduce_mean(a[:, 1:], axis=1, keepdims=True),
                    output_shape=(nb_action, ))(y)
            elif self.dueling_type == 'max':
                outputlayer = Lambda(
                    lambda a: tf.expand_dims(a[:, 0], -1) + a[:, 1:] - tf.
                    reduce_max(a[:, 1:], axis=1, keepdims=True),
                    output_shape=(nb_action, ))(y)
            elif self.dueling_type == 'naive':
                outputlayer = Lambda(
                    lambda a: tf.expand_dims(a[:, 0], -1) + a[:, 1:],
                    output_shape=(nb_action, ))(y)
            else:
                assert False, "dueling_type must be one of {'avg','max','naive'}"

            model = Model(inputs=model.input, outputs=outputlayer)

        # Related objects.
        self.model = model
        if policy is None:
            policy = EpsGreedyQPolicy()
        if test_policy is None:
            test_policy = GreedyQPolicy()
        self.policy = policy
        self.test_policy = test_policy

        self.turn_left_agent = turn_left_agent
        self.go_straight_agent = go_straight_agent
        self.turn_right_agent = turn_right_agent

        # State.
        self.reset_states()
Example #16
# -------------------------------------------------------------------------------------------

memory_file = os.path.join(variable_configs_folder, "memory.p")
memory = pickle.load(open(memory_file, "rb"))

model = build_convolutional_nn(all_configs["c_layers"],
                               all_configs["ff_layers"],
                               env.observation_space.shape, env.num_actions)
policy = LinearAnnealedPolicy(
    EpsGreedyQPolicy(masked_greedy=all_configs["masked_greedy"]),
    attr='eps',
    value_max=all_configs["max_eps"],
    value_min=all_configs["final_eps"],
    value_test=0.0,
    nb_steps=all_configs["exploration_fraction"])
test_policy = GreedyQPolicy(masked_greedy=True)

# ------------------------------------------------------------------------------------------

dqn = DQNAgent(model=model,
               nb_actions=env.num_actions,
               memory=memory,
               nb_steps_warmup=all_configs["learning_starts"],
               target_model_update=all_configs["target_network_update_freq"],
               policy=policy,
               test_policy=test_policy,
               gamma=all_configs["gamma"],
               enable_dueling_network=all_configs["dueling"])

dqn.compile(Adam(lr=all_configs["learning_rate"]))
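
A hedged continuation sketch: training the agent and writing the replay memory back to the same pickle file. The "total_timesteps" key is illustrative and not taken from all_configs:

dqn.fit(env, nb_steps=all_configs["total_timesteps"], visualize=False, verbose=1)
with open(memory_file, "wb") as f:
    pickle.dump(memory, f)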
Example #17
x = Dense(30)(x)
x = Activation('tanh')(x)
x = Dense(20)(x)
x = Activation('tanh')(x)
x = Dense(nb_actions)(x)
x = Activation('linear')(x)
criticModel = Model(inputs=[action_input, observation_input], outputs=x)
print(criticModel.summary())

#setup policy and memory
memory = SequentialMemory(limit=50000, window_length=1)
#setup agent, using defined keras model alog with the policy and actions from above

#Discrete actions:
policy = EpsGreedyQPolicy()
testPolicy = GreedyQPolicy()
#agent = DQNAgent(model=actorModel, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10, policy=policy, test_policy=testPolicy)

#continuous actions:
random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                          theta=.15,
                                          mu=0.,
                                          sigma=.3)
agent = DDPGAgent(actor=actorModel,
                  critic=criticModel,
                  nb_actions=nb_actions,
                  memory=memory,
                  nb_steps_warmup_actor=100,
                  nb_steps_warmup_critic=100,
                  critic_action_input=action_input,
                  random_process=random_process)
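
The DDPG agent above still needs to be compiled before training; a minimal sketch with illustrative learning rates, assuming a Gym-style env (keras-rl's DDPGAgent.compile accepts an [actor, critic] optimizer pair):

agent.compile([Adam(lr=1e-4), Adam(lr=1e-3)], metrics=['mae'])
agent.fit(env, nb_steps=50000, visualize=False, verbose=1)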
Example #18
class BetaFlapDQN(DQNAgent):
    def __init__(self, inputs, buffer, sess_id, sess, **kwargs):
        self.util = Utility()
        self.sess = sess
        self.sess_id = sess_id

        game = inputs['game']
        agnt = inputs['agent']
        sess = agnt['session']
        eps = sess['episode']
        mod = inputs['model']
        trn = mod['training']
        sv = mod['save']
        mem = inputs['memory']
        '''---Environment Parameters---'''
        self.env_name = game['name']
        self.fps = game['fps']
        self.mode = game['difficulty']
        self.target = game['target']
        self.tick = game['tick']
        '''---Episode Parameters---'''
        self.nb_episodes = sess['max_ep']
        self.nb_max_episode_steps = game['fps'] * 60 * eps['max_time']
        self.nb_steps = self.nb_max_episode_steps * self.nb_episodes
        self.nb_steps_warmup = trn['warmup']
        self.nb_max_start_steps = trn['max_ep_observe']
        self.max_start_steps = trn['warmup']
        self.keep_gif_score = eps['keep_gif_score']
        '''---Agent / Model Parameters---'''
        self.name = agnt['name']
        self.nb_actions = agnt['action_size']
        self.delta_clip = agnt['delta_clip']

        self.training = trn['training']
        self.verbose = trn['verbose']
        self.lr = trn['learn_rate']
        self.eps = trn['initial_epsilon']
        self.value_max = trn['initial_epsilon']
        self.value_min = trn['terminal_epsilon']
        self.anneal = trn['anneal']
        self.shuffle = trn['shuffle']
        self.train_interval = trn['interval']
        self.validate = trn['validate']
        self.split = trn['split']
        self.action_repetition = trn['action_repetition']
        self.epochs = trn['epochs']
        self.epoch = 1

        prec = km.binary_precision()
        re = km.binary_recall()
        f1 = km.binary_f1_score()
        self.metrics = ['accuracy', 'mse', prec, re, f1]
        self.H = mod['filter_size']
        self.alpha = mod['alpha']
        self.gamma = mod['gamma']
        self.momentum = mod['momentum']
        self.decay = mod['decay']
        self.target_model_update = mod['target_update']
        self.type = mod['type']
        self.enable_double_dqn = mod['double_dqn']
        self.enable_dueling_network = mod['dueling_network']
        self.dueling_type = mod['dueling_type']

        self.limit = mem['limit']
        self.batch_size = mem['batch_size']
        self.window_length = mem['state_size']
        self.memory_interval = mem['interval']

        self.ftype = sv['ftype']

        self.vizualize = sv['visualize']
        self.save_full = sv['save_full']
        self.save_weights = sv['save_weights']
        self.save_json = sv['save_json']
        self.save_plot = sv['save_plot']
        self.save_interval = sv['save_n']
        self.log_interval = sv['log_n']
        self.saves = sv['save_path']
        self.save_path = self.util.get_save_dir_struct(self.saves,
                                                       self.env_name)
        self.logs = sv['log_path']
        self.util.display_status('Hyperparameters Successfully Loaded')
        '''Reference/Excerpt:  keras-rl DQN Atari Example
        https://github.com/keras-rl/keras-rl/blob/master/examples/dqn_atari.py
        # Select a policy. 
        # We use eps-greedy action selection, which means that a random action
        # is selected with probability eps. We anneal eps from init to term over 
        # the course of (anneal) steps. This is done so that the agent initially 
        # explores the environment (high eps) and then gradually sticks to 
        # what it knows (low eps). We also set a dedicated eps value that is 
        # used during testing. Note that we set it to 0.05 so that the agent 
        # still performs some random actions. 
        # This ensures that the agent cannot get stuck.
        # '''
        self.custom_model_objects = {
            'S': self.window_length,
            'A': self.nb_actions,
            'H': self.H,
            'lr': self.lr,
            'name': self.name,
            'batch_size': self.batch_size,
            'sess': self.sess,
            #dueling_network=self.enable_dueling_network,
            #dueling_type=self.dueling_type,
        }

        with tf.device(gpu):
            self.policy = LinearAnnealedPolicy(
                inner_policy=EpsGreedyQPolicy(eps=self.value_max),
                attr='eps',
                value_max=self.value_max,
                value_min=self.value_min,
                value_test=self.alpha,
                nb_steps=self.anneal)
            self.test_policy = GreedyQPolicy()

            if mod['optimizer'].lower() == 'adamax':
                self.optimizer = Adamax(lr=self.lr)
            elif mod['optimizer'].lower() == 'adadelta':
                self.optimizer = Adadelta()
            elif mod['optimizer'].lower() == 'rmsprop':
                self.optimizer = RMSprop()
            elif mod['optimizer'].lower() == 'sgd':
                self.optimizer = SGD(
                    lr=self.lr,
                    momentum=self.momentum,
                    decay=self.decay,
                )
            else:
                self.optimizer = Adam(lr=self.lr)

        self.memory = buffer

        self.log_path = self.util.get_log_dir_struct(self.sess_id, self.logs,
                                                     self.ftype)

        self.util.display_status('Keras GPU Session {} Beginning'.format(
            self.sess_id))

        nn = NeuralNet(
            S=self.window_length,
            A=self.nb_actions,
            H=self.H,
            lr=self.lr,
            name=self.name,
            batch_size=self.batch_size,
            dueling_network=self.enable_dueling_network,
            dueling_type=self.dueling_type,
            sess=self.sess,
        )
        with tf.device(gpu):
            self.model = nn.get_model()

        self.util.display_status(
            '{} Keras Agent with {} Optimizer Built'.format(
                self.name, mod['optimizer']))
        '''---Compile the model with the chosen optimizer;
        the loss is calculated with a lambda function based on the model
        type selections (dueling or double DQN)'''
        with tf.device(gpu):
            self.compile(
                optimizer=self.optimizer,
                metrics=self.metrics,
            )

        self.util.display_status(
            '{} Agent Fully Initialized with Compiled Model'.format(self.name))

        super(BetaFlapDQN, self).__init__(
            model=self.model,
            nb_actions=self.nb_actions,
            memory=self.memory,
            policy=self.policy,
            test_policy=self.test_policy,
            enable_double_dqn=self.enable_double_dqn,
            enable_dueling_network=self.enable_dueling_network,
            dueling_type=self.dueling_type,
            **kwargs)

    def load_saved_model_weights(self):
        try:
            self.model.load_weights('saved/FlappyBird_weights.h5')
            self.util.display_status('Saved Keras Model Weights Loaded')
        except:
            self.util.display_status('No Saved Keras Model Weights Found')

    def fit(self, iteration=1, max_iteration=1):
        self.load_saved_model_weights()

        with tf.device(gpu):
            self.env = Environment(
                target_score=self.target,
                difficulty=self.mode,
                fps=self.fps,
                tick=self.tick,
            )
        self.util.display_status('{} Environment Emulation Initialized'.format(
            self.env_name))

        if self.action_repetition < 1:
            raise ValueError(
                'action_repetition must be >= 1, is {}'.\
                    format(self.action_repetition)
            )
        '''---Define Custom Callbacks and Processors BetaFlap'''
        FlappyCall = FlappySession()
        Flappy = FlappyProcessor()
        '''---Flag Agent as Training with on_train_begin()'''
        self._on_train_begin()
        FlappyCall.on_train_begin()

        self.training = True
        observation = None
        reward = None
        done = False
        info = None
        status = 'play'
        episode = np.int16(0)
        self.step = np.int16(0)
        action = np.int16(0)
        self.randQ = np.int16(0)
        self.reward = np.float16(0)
        idx = np.int16(0)
        flap = False
        episode_reward = None
        episode_score = None
        episode_step = None
        did_abort = False
        '''---Begin stepping through Episodes---'''
        # continue while global step is < max session steps
        while self.step < self.nb_steps:
            gc.collect()
            if observation is None:  # new episode
                '''---Initialize Environment with No Action'''
                FlappyCall.on_episode_begin(episode)
                self.reset_states()  # reset all episode tracking parameters
                reward = None
                done = False
                info = {}
                action = None
                episode_step = np.int16(0)
                episode_score = np.int16(0)
                episode_reward = np.float32(0)

                wake = np.zeros([self.nb_actions])  # [0, 0]
                wake[0] = 1  # [1, 0] --> don't flap
                o, r, done, info = self.env.step(wake)  # progress env 1 frame
                observation, r = Flappy.process_step(o, r, done, info)
                assert observation is not None
                '''---Each episode, begin with n random actions/steps'''
                if self.nb_max_start_steps == 0:
                    self.nb_random_start_steps = 0
                else:
                    self.nb_random_start_steps = \
                    np.random.randint(self.nb_max_start_steps)
                '''---Perform random nb steps w/ rand action 
                      without adding them to experience replay memory'''
                for _ in range(self.nb_random_start_steps):
                    action = np.zeros([self.nb_actions])
                    randQ = rand.randrange(self.nb_actions)
                    action[randQ] = 1  # flag selected action
                    o, r, done, info = self.env.step(
                        action)  # progress env 1 frame
                    episode_step += 1
                    '''---Process output of randomized actions
                          without updating cumulative episode totals'''
                    observation = deepcopy(o)
                    observation, r = \
                        Flappy.process_step(observation, r, done, info)
                    if info['status'] == 'exit':
                        done = True
                        did_abort = True
                    if done: break
                # warmup period complete
                assert episode_reward is not None
                assert episode_step is not None
                assert observation is not None
                gc.collect()
            '''---Begin Iteratively Training Model Each Step
                * predict Q values / action (forward step)
                * use reward to improve the model (backward step)
            '''
            FlappyCall.on_step_begin(episode_step)
            '''---Predict Q Values Using Forward Method'''
            with tf.device(gpu):
                idx = self.forward(observation)
            action, flap = Flappy.process_action(idx, self.nb_actions)
            #episode_step += 1
            reward = np.float32(0)
            done = False
            for _ in range(self.action_repetition):
                o, r, d, i = self.env.step(action)
                observation = deepcopy(o)
                observation, r = Flappy.process_step(o, r, d, i)
                reward += r
                done = d
                info = i
                status = info['status']
                episode_step += 1
                if info['status'] == 'exit':
                    done = True
                    did_abort = True
                if done: break  # game over, end episode
            '''---Train the Model using Backward Method
            This function covers the bulk of the algorithm logic
                * store experience in memory
                * create experience batch, and predict Qs
                * train model on a single batch with the selected optimizer
                * enable/disable double DQN or dueling network
                * update model target values
                * discount future reward and return model metrics
            '''
            with tf.device(gpu):
                metrics = self.backward(reward, terminal=done)
            episode_reward += reward
            self.reward = episode_reward
            episode_score = info['score']
            '''---Log Step Data---'''
            step_log = {
                'step': episode_step,  # track episode step nb
                'episode': episode,
                'metrics': metrics,
                'flap': flap,
                'action': action,
                'reward': reward,
                'done': done,
                'training': self.training,
                'q_values': self.q_values,
                'info': info,
                'x': o,
                'x_t': observation,
            }
            FlappyCall.on_step_end(episode_step, step_log)
            gc.collect()

            #episode_step += 1
            self.step += 1

            if (self.step % self.save_interval) == 0 \
            or status == 'save':
                self.save_model()
            if status == 'exit':
                done = True
                did_abort = True
            if self.nb_max_episode_steps and \
                (episode_step >= self.nb_max_episode_steps - 1):
                done = True  # max episode steps hit
            # We are in a terminal state but the agent hasn't yet seen it.
            # perform one more forward-backward call and ignore the action
            if done:
                with tf.device(gpu):
                    self.forward(observation)
                    self.backward(0., terminal=False)
                episode_log = {
                    'sess_id': self.sess_id,
                    'episode': episode,
                    'reward': episode_reward,
                    'score': episode_score,
                    'steps': episode_step,  # track global step nb   
                    'gif': self.keep_gif_score,
                    'log_path': self.logs,
                    'iteration': iteration,
                }
                '''Episode Complete, Proceed to Next Iteration'''
                FlappyCall.on_episode_end(episode, episode_log)

                episode += 1
                observation = None
                episode_step = None
                episode_reward = None
                episode_score = None
                gc.collect()

                if episode > self.nb_episodes or did_abort:
                    done = True  # max episode hit
                    break
        '''---Training Session Complete---'''
        self.save_model()
        session_log = {
            'id': self.sess_id,
            'nb_steps': self.step,
            'did_abort': did_abort
        }
        FlappyCall.on_train_end(session_log, self.sess_id, self.log_path)
        self._on_train_end()  # end training session
        if iteration >= max_iteration or did_abort:
            self.env.close()
            return True

    def forward(self, observation):
        # Select an action
        state = self.memory.get_recent_state(observation)
        with tf.device(gpu):
            self.q_values = self.compute_q_values(state)

        if self.training:  # LinearAnneal Greedy Epsilon
            with tf.device(gpu):
                action = self.policy.select_action(q_values=self.q_values)
        else:  #  GreedyQ
            with tf.device(gpu):
                action = self.test_policy.select_action(q_values=self.q_values)
        # Book-keeping for experience replay
        self.recent_observation = observation
        self.recent_action = action
        return action

    def backward(self, reward, terminal):
        '''Store latest step in experience replay tuple'''
        if self.step % self.memory_interval == 0 or self.reward > .011:
            if self.reward > .011:
                self.util.display_status(
                    'Step {} Replay Experience Memory Saved'.format(self.step))
            with tf.device(cpu):
                self.memory.append(np.array(self.recent_observation),
                                   np.int16(self.recent_action),
                                   np.float32(reward),
                                   terminal,
                                   training=self.training)
        metrics = []
        if not self.training:
            return metrics
        '''Begin Training on Batches of Stored Experiences'''
        if self.step > self.nb_steps_warmup \
        and self.step % self.train_interval == 0:
            with tf.device(gpu):
                batch = self.memory.sample(self.batch_size)
                assert len(batch) == self.batch_size

            state0_batch, reward_batch,action_batch, terminal1_batch, \
            state1_batch = \
                FlappyProcessor.process_state_batch(self, batch)

            assert reward_batch.shape == (self.batch_size, )
            assert terminal1_batch.shape == reward_batch.shape
            assert len(action_batch) == len(reward_batch)
            '''Compute the Q-Values for Mini-Batch of Samples
            "Deep Reinforcement Learning with Double Q-learning"
            (van Hasselt et al., 2015):
            Double DQN: 
                - online network predicts actions
                - target network estimates Q values.
            '''
            if self.enable_double_dqn:
                with tf.device(gpu):
                    q_values = self.model.predict_on_batch(state1_batch)
                assert q_values.shape == (self.batch_size, self.nb_actions)
                actions = np.argmax(q_values, axis=1)
                assert actions.shape == (self.batch_size, )
                # estimate Q values using the target network
                # select maxQ value with the online model (computed above)
                with tf.device(gpu):
                    target_q_values = \
                    self.target_model.predict_on_batch(state1_batch)

                assert target_q_values.shape == \
                    (self.batch_size, self.nb_actions)
                q_batch = target_q_values[range(self.batch_size), actions]
            # Compute the q_values for state1 and the maxQ of each sample.
            # The prediction is done on target_model as outlined in Mnih (2015);
            # it makes the algorithm significantly more stable.
            else:
                with tf.device(gpu):
                    target_q_values = \
                    self.target_model.predict_on_batch(state1_batch)

                assert target_q_values.shape == \
                    (self.batch_size, self.nb_actions)
                q_batch = np.max(target_q_values, axis=1).flatten()
            assert q_batch.shape == (self.batch_size, )

            targets = np.zeros((self.batch_size, self.nb_actions))
            dummy_targets = np.zeros((self.batch_size, ))
            masks = np.zeros((self.batch_size, self.nb_actions))

            # Compute r_t + gamma * max_a Q(s_t+1, a)
            # update the affected output targets accordingly
            # Set discounted reward to zero for all states that were terminal
            discounted_reward_batch = self.gamma * q_batch
            discounted_reward_batch *= terminal1_batch
            assert discounted_reward_batch.shape == reward_batch.shape

            Rs = reward_batch + discounted_reward_batch
            for idx, (target, mask, R, action) in enumerate(
                    zip(targets, masks, Rs, action_batch)):
                target[action] = R  # update with estimated accumulated reward
                dummy_targets[idx] = R
                mask[action] = 1.  # enable loss for specific action
            targets = np.array(targets).astype('float32')
            masks = np.array(masks).astype('float32')
            '''Train Using Sample Experience Batch'''
            # perform a single update on the entire batch
            # use a dummy target, as the loss is computed in a complex Lambda layer;
            # still useful to know the target to compute metrics properly
            if type(self.model.input) is not list:
                ins = [state0_batch]
            else:
                ins = state0_batch
            if self.validate:
                split = self.split
            else:
                split = 0

            with tf.device(gpu):
                metrics = self.trainable_model.train_on_batch(
                    ins + [targets, masks], [dummy_targets, targets])
                # THIS CAUSES A MEMORY LEAK IN CURRENT CONFIGURATION
                #metrics = self.trainable_model.fit(
                #    ins + [targets, masks],
                #    [dummy_targets, targets],
                #    batch_size=None,
                #    epochs=self.epochs,
                #    verbose=self.verbose,
                #    validation_split=split,
                #    shuffle=self.shuffle
                #)
                gc.collect()

            # throw away individual losses
            if type(metrics) is list:
                metrics = [m for idx, m in enumerate(metrics) if idx not in (1, 2)]
            else:
                metrics.history.update({'losses': self.policy.metrics})

        if self.target_model_update >= 1 \
        and self.step % self.target_model_update == 0:
            with tf.device(gpu):
                self.update_target_model_hard()
        return metrics

    def save_model(self):
        if self.save_full:
            '''---Save full model to single .h5 file---'''
            self.model.save(self.save_path + '_full.h5', overwrite=True)
            self.util.display_status('{} Model Saved to {}'.format(
                self.name, self.save_path + '_full.h5'))
        if self.save_weights:
            '''---Save model weights to separate .h5 file---'''
            self.model.save_weights(self.save_path + '_weights.h5',
                                    overwrite=True)
            self.util.display_status('{} Model Weights Saved to {}'.format(
                self.name, self.save_path + '_weights.h5'))
        if self.save_json:
            '''---Save model structure as JSON file---'''
            with open(self.save_path + '.json', 'a+') as f:
                f.write(self.model.to_json())
            self.util.display_status('{} Model Structure Saved to {}'.format(
                self.name, self.save_path + '.json'))
        if self.save_plot:
            plot_model(self.model, to_file=self.save_path + '_flow.png')
            self.util.display_status(
                '{} Neural Network Diagram Saved to {}'.format(
                    self.name, self.save_path + '_flow.png'))
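
A standalone numpy illustration of the target computation in backward() above (Rs = r + gamma * max_a Q(s', a), with the discounted term zeroed for terminal transitions, assuming the keras-rl convention where terminal1_batch holds 1.0 for non-terminal next states):

import numpy as np

gamma = 0.99
reward_batch = np.array([1.0, 0.5])
terminal1_batch = np.array([1.0, 0.0])                # second transition ended the episode
target_q_values = np.array([[0.2, 0.8], [1.0, 0.3]])  # Q(s', a) from the target network
q_batch = target_q_values.max(axis=1)
Rs = reward_batch + gamma * q_batch * terminal1_batch
print(Rs)  # approx. [1.792, 0.5]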
Example #19
    def __init__(self, model=None, policy=GreedyQPolicy()):
        """Initialisation of the groomer."""
        self.model = model
        self.policy = policy
Example #20
    def _build(self,
               model_fn,
               nb_states,
               memory=None,
               policy=None,
               test_policy=None,
               enable_double_dqn=True,
               enable_dueling_network=False,
               dueling_type='avg',
               gamma=0.99,
               inputs_batch_size=None,
               nb_steps_warmup=1000,
               train_interval=1,
               memory_interval=1,
               target_update=10000,
               delta_range=None,
               delta_clip=np.inf,
               scope="dqn",
               model_scope="model",
               target_model_scope="target_model",
               optimizer=tf.train.AdamOptimizer,
               inputs=None,
               inputs_class=DQNInputs,
               memory_max_length=100000,
               learning_rate=0.001,
               eps=0.1):

        self.memory = memory if memory else ExperienceReplay(
            4, max_length=memory_max_length)
        self.gamma = gamma
        self.nb_steps_warmup = nb_steps_warmup
        self.train_interval = train_interval
        self.memory_interval = memory_interval
        self.target_update = target_update
        self.delta_range = delta_range
        self.delta_clip = delta_clip

        self.inputs = inputs_class(
            nb_states, inputs_batch_size) if inputs is None else inputs
        self.global_step_update = self.global_step.assign_add(1)

        self.policy = policy(self) if policy else EpsGreedyQPolicy(self,
                                                                   eps=eps)
        self.test_policy = test_policy(
            self) if test_policy else GreedyQPolicy()

        with tf.variable_scope(model_scope):
            self.model = model_fn(self.inputs)
            self.model_variables = tf.get_collection(
                tf.GraphKeys.GLOBAL_VARIABLES, scope=model_scope)

        with tf.variable_scope(target_model_scope):
            self.target_model = model_fn(self.inputs)
            self.target_model_variables = tf.get_collection(
                tf.GraphKeys.GLOBAL_VARIABLES, scope=target_model_scope)

        with tf.variable_scope(scope):

            self.target_model_target = tf.where(
                self.inputs.done, self.inputs.r, self.inputs.r +
                self.gamma * tf.reduce_max(self.target_model.Qs, axis=1))

            self.model_Qsa = select_columns(
                self.model.Qs, self.inputs.a) if not hasattr(
                    self.model, "Qsa") else self.model.Qsa
            self.model_error = self.target_model_target - self.model_Qsa if not hasattr(
                self.model, "error") else self.model.error
            self.model_loss = tf.reduce_mean(
                huber_loss(self.model_error)) if not hasattr(
                    self.model, "loss") else self.model.loss
            self.model_learning_rate = self.model.learning_rate if hasattr(
                self.model, 'learning_rate') else learning_rate
            self.update = optimizer(self.model_learning_rate).minimize(
                self.model_loss, var_list=self.model_variables) if not hasattr(
                    self.model, "update") else self.model.update

            if self.target_update < 1:
                self.update = tf.group(
                    self.update, *[
                        tv.assign_add(self.target_update * (mv - tv))
                        for mv, tv in zip(self.target_model_variables,
                                          self.model_variables)
                    ])
                self.update_target_hard = None
            else:
                self.update_target_hard = tf.group(*[
                    tv.assign(mv) for mv, tv in zip(
                        self.target_model_variables, self.model_variables)
                ])
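
# Hypothetical sketch of the select_columns helper assumed by _build above
# (its definition is not shown in this excerpt): it picks Q(s, a) for each
# row's chosen action out of the [batch, nb_actions] Q-value matrix.
def select_columns(tensor, col_indices):
    # Pair every row index with its selected column index, then gather the
    # corresponding entries with gather_nd.
    row_indices = tf.range(tf.shape(tensor)[0])
    gather_idx = tf.stack([row_indices, tf.cast(col_indices, tf.int32)],
                          axis=1)
    return tf.gather_nd(tensor, gather_idx)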
Exemple #21
0
import logging
import sys

from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam
from rl.policy import EpsGreedyQPolicy, GreedyQPolicy, BoltzmannQPolicy
from rl.agents.dqn import DQNAgent
from rl.memory import EpisodeParameterMemory, SequentialMemory

sys.path.append(".")

from patternmatching.gray.incremental.query_call import load_graph, parse_args
from patternmatching.gray.incremental.rl_model import GraphEnv

logging.basicConfig(level=logging.INFO)

policies = {
    "bqp": BoltzmannQPolicy(),  # Unstable
    "gqp": GreedyQPolicy(),
    "egqp": EpsGreedyQPolicy(eps=0.1)  # eps should be around 0.1
}
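
# Hypothetical extra entry (not in the original script): a linearly annealed
# eps-greedy policy that decays eps from 1.0 to 0.1 during training; it would
# also require "from rl.policy import LinearAnnealedPolicy".
# policies["laegqp"] = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps',
#                                           value_max=1.0, value_min=0.1,
#                                           value_test=0.05, nb_steps=10000)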

window_length = 5  # Should be less than 20 (too large a value keeps the Q-values from converging)
memories = {
    "epm": EpisodeParameterMemory(limit=20,
                                  window_length=window_length),  # Non-episodic
    "sm": SequentialMemory(limit=20,
                           window_length=window_length)  # should use this
}

argv = sys.argv
if len(argv) < 4:
    print("Usage: python %s [ConfFile] [Policy] [Memory]" % argv[0])
    sys.exit(1)
Exemple #22
0
import gym
import keras as K
from keras import layers
from keras.optimizers import Adam
import numpy as np
from PIL import Image
from rl.core import Processor
from rl.agents.dqn import DQNAgent
from rl.memory import SequentialMemory
from rl.policy import GreedyQPolicy
from train import create_q_model, AtariProcessor

if __name__ == '__main__':
    """
    To run this on calling the method
    """
    env = gym.make('BreakoutNoFrameskip-v4')
    state = env.reset()
    actions = env.action_space.n
    model = K.models.load_model('policy.h5')
    memory = SequentialMemory(limit=1000000, window_length=4)
    policy = GreedyQPolicy()
    process = AtariProcessor()
    dqn = DQNAgent(model=model,
                   nb_actions=actions,
                   memory=memory,
                   policy=policy,
                   processor=process)
    dqn.compile(optimizer=Adam(lr=.00025, clipnorm=1.0), metrics=['mae'])
    dqn.test(env, nb_episodes=10, visualize=True)
def dqndef():
    # Get the environment and extract the number of actions.
    env = gym.make(args.env_name)
    np.random.seed(123)
    env.seed(123)
    nb_actions = env.action_space.n

    # Next, we build our model. We use the same model that was
    # described by Mnih et al. (2015).
    input_shape = (WINDOW_LENGTH, ) + INPUT_SHAPE
    model = Sequential()
    if K.image_dim_ordering() == 'tf':
        # (width, height, channels)
        model.add(Permute((2, 3, 1), input_shape=input_shape))
    elif K.image_dim_ordering() == 'th':
        # (channels, width, height)
        model.add(Permute((1, 2, 3), input_shape=input_shape))
    else:
        raise RuntimeError('Unknown image_dim_ordering.')

    model.add(Conv2D(32, (8, 8), strides=(4, 4)))
    model.add(Activation('relu'))
    model.add(Conv2D(64, (4, 4), strides=(2, 2)))
    model.add(Activation('relu'))
    model.add(Conv2D(64, (3, 3), strides=(1, 1)))
    model.add(Activation('relu'))
    model.add(Flatten())
    model.add(Dense(512))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))

    print(model.summary())
    # print(model.output_shape)
    # Finally, we configure and compile our agent. You can use
    # every built-in Keras optimizer and even the metrics!
    memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)
    processor = AtariProcessor()

    # Select a policy. Here we use a purely greedy policy: the agent always
    # picks the action with the highest Q-value. The original Atari example
    # instead anneals an eps-greedy policy from eps=1.0 to eps=0.1 over 1M
    # steps, so the agent explores first (high eps) and then exploits what it
    # has learned (low eps), with a dedicated eps=0.05 during testing so the
    # agent cannot get stuck; that variant is shown commented out below.

    policy = GreedyQPolicy()

    # The trade-off between exploration and exploitation is difficult and an
    # ongoing research topic. If you want, you can experiment with the
    # parameters or use a different policy. Another popular choice is
    # Boltzmann-style exploration:
    # policy = BoltzmannQPolicy(tau=1.)
    # policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps',
    #                               value_max=1., value_min=.1,
    #                               value_test=.05, nb_steps=1000000)
    # Feel free to give it a try!

    # print(model.output_shape)

    dqn = DQNAgent(model=model,
                   nb_actions=nb_actions,
                   policy=policy,
                   memory=memory,
                   processor=processor,
                   nb_steps_warmup=50000,
                   gamma=.99,
                   target_model_update=10000,
                   train_interval=4,
                   delta_clip=1.)

    dqn.compile(Adam(lr=.00025), metrics=['mae'])

    return dqn, env, args
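

def train_and_evaluate():
    # Hedged usage sketch (not part of the original excerpt): train the agent
    # built by dqndef() above and then evaluate it. The step and episode
    # counts here are illustrative, not taken from the original script.
    dqn, env, _ = dqndef()
    dqn.fit(env, nb_steps=1750000, log_interval=10000)
    dqn.test(env, nb_episodes=10, visualize=False)
    return dqn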