    def nextAction(self, beliefstate):
        '''
        Select the next action.

        :param beliefstate: the current belief state
        :returns: the master action and the (int) summary action index
        '''
        if self.architecture != 'dip2':
            beliefVec = flatten_belief(beliefstate, self.domainUtil)
        else:
            dip_state = DIP_state(
                beliefstate.domainStates[beliefstate.currentdomain],
                self.domainString)
        execMask = self.summaryaction.getExecutableMask(
            beliefstate, self.lastSystemAction)
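        # execMask contains 0.0 for executable summary actions and a large
        # negative value (effectively -inf) for all others, so adding it to
        # the Q-values rules inadmissible actions out of the argmax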

        if self.exploration_type == 'e-greedy':
            # epsilon greedy
            if self.is_training and \
                    utils.Settings.random.rand() < self.epsilon:
                admissible = [i for i, x in enumerate(execMask) if x == 0.0]
                random.shuffle(admissible)
                nextaIdex = admissible[0]
            else:
                if self.architecture != 'dip' and self.architecture != 'dip2':
                    action_Q = self.dqn.predict(
                        np.reshape(
                            beliefVec,
                            (1, len(beliefVec))))  # + (1. / (1. + i + j))
                    admissible = np.add(action_Q, np.array(execMask))
                    logger.info('action Q...')
                    #print admissible.shape
                    #print admissible
                    nextaIdex = np.argmax(admissible)

                    # add current max Q to self.episode_ave_max_q
                    #print 'current maxQ', np.max(admissible)
                    self.episode_ave_max_q.append(np.max(admissible))
                elif self.architecture == 'dip2':
                    admissible = []
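                    # dip2: build a slot-specific belief vector for every
                    # action and query its Q-value with a separate forward pass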
                    for idx, v in enumerate(execMask):
                        action_name = self.actions.action_names[idx]
                        act_slot = 'general'
                        for slot in dip_state.slots:
                            if slot in action_name:
                                act_slot = slot
                        beliefVec = dip_state.get_beliefStateVec(act_slot)
                        action_Q = self.dqn.predict(
                            np.reshape(
                                beliefVec,
                                (1, len(beliefVec))))  # + (1. / (1. + i + j))
                        if v == 0:
                            admissible.append(action_Q[0][idx])
                        else:
                            admissible.append(v)
                    nextaIdex = np.argmax(admissible)
                    self.episode_ave_max_q.append(np.max(admissible))

                else:
                    admissible = []
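                    # dip: score each executable action separately by feeding
                    # a one-hot action encoding to the network together with
                    # the belief vector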
                    for idx, v in enumerate(execMask):
                        if v > -sys.maxint:
                            Action_idx = np.eye(self.action_dim,
                                                self.action_dim)[[idx]]
                            Qidx = self.dqn.predict_dip(
                                np.reshape(beliefVec, (1, len(beliefVec))),
                                Action_idx)
                            #print 'argmax Q',Qidx[0]
                            admissible.append(Qidx[0])
                        else:
                            admissible.append(-sys.maxint)
                    # action_Q = self.dqn.predict(np.reshape(beliefVec, (1, len(beliefVec))))# + (1. / (1. + i + j))
                    # admissible = np.add(action_Q, np.array(execMask))
                    logger.info('action Q...')
                    #print admissible
                    nextaIdex = np.argmax(admissible)

                    # add current max Q to self.episode_ave_max_q
                    #print 'current maxQ', np.max(admissible)
                    self.episode_ave_max_q.append(np.max(admissible))

        elif self.exploration_type == 'Boltzman':
            # softmax
            if not self.is_training:
                self.epsilon = 0.001
            # here self.epsilon serves as the softmax temperature
            action_Q = self.dqn.predict(
                np.reshape(beliefVec,
                           (1, len(beliefVec))))  # + (1. / (1. + i + j))
            action_Q_admissible = np.add(action_Q, np.array(
                execMask))  # enforce Q of inadmissible actions to be -inf

            action_prob = drlutils.softmax(action_Q_admissible / self.epsilon)
            logger.info('action Q...')
            #print action_Q_admissible
            logger.info('action prob...')
            #print action_prob
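            # sample a probability value and recover its index via argmax;
            # this assumes no two actions share the same probability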
            sampled_prob = np.random.choice(action_prob[0], p=action_prob[0])
            nextaIdex = np.argmax(action_prob[0] == sampled_prob)

        self.stats[nextaIdex] += 1
        summaryAct = self.action_names[nextaIdex]
        beliefstate = beliefstate.getDomainState(self.domainUtil.domainString)
        masterAct = self.summaryaction.Convert(beliefstate, summaryAct,
                                               self.lastSystemAction)
        return masterAct, nextaIdex
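
Both examples mask out inadmissible actions the same way: the executability
mask (0.0 for executable actions, effectively -inf otherwise) is added to the
network's scores before the argmax, and exploration only samples from the
executable set. The snippet below is a minimal standalone sketch of that
masked epsilon-greedy step; select_action, q_values, exec_mask and rng are
hypothetical names, not part of the policy classes above.

import numpy as np

def select_action(q_values, exec_mask, epsilon, rng=np.random):
    """Sketch of masked epsilon-greedy selection (assumed helper)."""
    if rng.rand() < epsilon:
        # explore: sample uniformly among the executable actions only
        admissible = [i for i, m in enumerate(exec_mask) if m == 0.0]
        return int(rng.choice(admissible))
    # exploit: adding the mask drives inadmissible actions towards -inf,
    # so the argmax can only pick an executable action
    return int(np.argmax(np.asarray(q_values) + np.asarray(exec_mask)))
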
Example #2
    def nextAction(self, beliefstate):
        '''
        Select the next action.

        :param beliefstate: the current belief state
        :returns: the master action and the (int) summary action index
        '''
        beliefVec = flatten_belief(beliefstate, self.domainUtil)

        execMask = self.summaryaction.getExecutableMask(
            beliefstate, self.lastSystemAction)

        if self.exploration_type == 'e-greedy':

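            # score every action with the ENAC policy network and mask out
            # inadmissible actions before taking the greedy choice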
            action_prob = self.enac.predict_policy(
                np.reshape(beliefVec, (1, len(beliefVec))))
            admissibleCnt = [i for i, x in enumerate(execMask) if x == 0.0]
            admissible = np.add(action_prob, np.array(execMask))
            greedyNextaIdex = np.argmax(admissible)

            # epsilon greedy
            if self.is_training and \
                    utils.Settings.random.rand() < self.epsilon:
                admissible = [i for i, x in enumerate(execMask) if x == 0.0]
                random.shuffle(admissible)
                nextaIdex = admissible[0]

                # Importance sampling
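                # mu_prob is the probability of the chosen action under the
                # epsilon-greedy behaviour policy: eps/|A| + (1 - eps) when it
                # matches the greedy action, eps/|A| otherwise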
                if nextaIdex == greedyNextaIdex:
                    self.mu_prob = self.epsilon / float(
                        self.action_dim) + 1 - self.epsilon
                else:
                    self.mu_prob = self.epsilon / float(self.action_dim)
            else:
                nextaIdex = greedyNextaIdex

                # add current max Q to self.episode_ave_max_q
                #print 'current maxQ', np.max(admissible)
                self.episode_ave_max_q.append(np.max(admissible))

                # Importance sampling
                self.mu_prob = self.epsilon / float(
                    self.action_dim) + 1 - self.epsilon

        elif self.exploration_type == 'Boltzman':
            # softmax
            if not self.is_training:
                self.epsilon = 0.001
            # here self.epsilon serves as the softmax temperature
            #action_prob, value = self.a2c.predict_action_value(np.reshape(beliefVec, (1, len(beliefVec))))# + (1. / (1. + i + j))
            action_prob = self.enac.predict_policy(
                np.reshape(beliefVec,
                           (1, len(beliefVec))))  # + (1. / (1. + i + j))
            action_Q_admissible = np.add(action_prob, np.array(
                execMask))  # push scores of inadmissible actions to -inf

            action_prob = drlutils.softmax(action_Q_admissible / self.epsilon)
            logger.info('action Q...')
            print(action_Q_admissible)
            logger.info('action prob...')
            print(action_prob)
            sampled_prob = np.random.choice(action_prob[0], p=action_prob[0])
            nextaIdex = np.argmax(action_prob[0] == sampled_prob)

        self.stats[nextaIdex] += 1
        summaryAct = self.summaryaction.action_names[nextaIdex]
        beliefstate = beliefstate.getDomainState(self.domainUtil.domainString)
        masterAct = self.summaryaction.Convert(beliefstate, summaryAct,
                                               self.lastSystemAction)
        return masterAct, nextaIdex
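
The Boltzmann branch turns the masked scores into a distribution with a
temperature-scaled softmax (drlutils.softmax(action_Q_admissible /
self.epsilon)) and then samples. Recovering the sampled index with
np.argmax(action_prob[0] == sampled_prob) relies on all probabilities being
distinct; sampling an index directly avoids that corner case. Below is a small
sketch with hypothetical names (boltzmann_action, scores, exec_mask) that
reimplements the softmax instead of calling drlutils.

import numpy as np

def boltzmann_action(scores, exec_mask, temperature, rng=np.random):
    """Sketch of Boltzmann (softmax) exploration over masked action scores."""
    masked = np.asarray(scores, dtype=float) + np.asarray(exec_mask, dtype=float)
    # temperature-scaled softmax; subtracting the max keeps exp() stable
    z = (masked - np.max(masked)) / temperature
    probs = np.exp(z) / np.sum(np.exp(z))
    # sample an action index directly instead of matching probability values
    return int(rng.choice(len(probs), p=probs))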