Code Example #1
    def learn(self):
        args = easyDomains.convert(self.cmp, self.rewardSet, self.phi)
        rewardCandNum = len(self.rewardSet)
        k = config.NUMBER_OF_RESPONSES

        hValues = util.Counter()

        for sIdx in range(config.SAMPLE_TIMES):
            s = self.cmp.sampleState()
            indices = numpy.random.choice(range(rewardCandNum),
                                          k,
                                          replace=False)
            trajs = [
                self.sampleTrajFromRewardCandidate(idx, s) for idx in indices
            ]
            if any(len(u) < config.TRAJECTORY_LENGTH for u in trajs):
                continue

            for i in xrange(k):
                for j in xrange(k):
                    hValues[(s, tuple(indices))] += \
                        self.cmp.getTrajectoryDistance(trajs[i], trajs[j])

        maxH = max(hValues.values())
        maxStatesIndices = filter(lambda _: hValues[_] == maxH, hValues.keys())
        maxState, maxIndices = random.choice(maxStatesIndices)
        trajs = [
            self.sampleTrajFromRewardCandidate(idx, maxState)
            for idx in maxIndices
        ]
        return trajs, None
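The example above scores each sampled query state by the summed pairwise distance between the k trajectories generated there and returns the trajectories of a highest-scoring state. Below is a minimal, self-contained sketch of that idea, not the project's code: sample_state, sample_traj, and traj_distance are hypothetical stand-ins for self.cmp.sampleState, self.sampleTrajFromRewardCandidate, and self.cmp.getTrajectoryDistance, states are assumed hashable, and each unordered pair of trajectories is counted once instead of summing over all (i, j).

import itertools
import random

def pick_query_by_trajectory_distance(sample_state, sample_traj, traj_distance,
                                      reward_cand_num, k, sample_times, traj_length):
    # score each sampled (state, reward-candidate subset) by the summed pairwise
    # distance between the k trajectories generated from that state
    h_values = {}
    cached_trajs = {}
    for _ in range(sample_times):
        s = sample_state()
        indices = tuple(random.sample(range(reward_cand_num), k))
        trajs = [sample_traj(idx, s) for idx in indices]
        # skip query states where some trajectory terminated too early
        if any(len(t) < traj_length for t in trajs):
            continue
        score = sum(traj_distance(t1, t2)
                    for t1, t2 in itertools.combinations(trajs, 2))
        h_values[(s, indices)] = h_values.get((s, indices), 0) + score
        cached_trajs[(s, indices)] = trajs
    if not h_values:
        return None
    best = max(h_values, key=h_values.get)
    return cached_trajs[best]
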
Code Example #2
    def learn(self):
        args = easyDomains.convert(self.cmp, self.rewardSet, self.phi)
        rewardCandNum = len(self.rewardSet)
        k = config.NUMBER_OF_RESPONSES

        hValues = util.Counter()

        for sIdx in range(config.SAMPLE_TIMES):
            s = self.cmp.sampleState()
            indices = numpy.random.choice(range(rewardCandNum),
                                          k,
                                          replace=False)
            trajs = [
                self.sampleTrajFromRewardCandidate(idx, s) for idx in indices
            ]
            if any(len(u) < config.TRAJECTORY_LENGTH for u in trajs):
                continue

            # compute the difference between the new psi and the old psi
            psiProbs = self.getPossiblePhiAndProbs(trajs)
            for psi, prob in psiProbs:
                # note that we need to keep track of which state the queries are generated from
                # and which reward candidates the policies are optimizing
                hValues[(s, tuple(indices))] += prob * sum(
                    abs(p1 - p2) for p1, p2 in zip(psi, self.phi))

        maxH = max(hValues.values())
        maxStatesIndices = filter(lambda _: hValues[_] == maxH, hValues.keys())
        maxState, maxIndices = random.choice(maxStatesIndices)
        trajs = [
            self.sampleTrajFromRewardCandidate(idx, maxState)
            for idx in maxIndices
        ]
        return trajs, None
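This variant scores a candidate query by how much the posterior belief over reward candidates is expected to move away from the prior. A small sketch of just that criterion, assuming getPossiblePhiAndProbs returns (psi, prob) pairs as above; the query names and the toy numbers below are made up for illustration.

def l1_shift(psi, phi):
    # L1 distance between a posterior belief psi and the prior phi
    return sum(abs(p1 - p2) for p1, p2 in zip(psi, phi))

def expected_belief_shift(posteriors, phi):
    # posteriors: list of (psi, prob) pairs, one per possible response to the query
    return sum(prob * l1_shift(psi, phi) for psi, prob in posteriors)

# toy usage: a uniform prior over three reward candidates and two hypothetical queries
phi = [1.0 / 3] * 3
query_a = [([1.0, 0.0, 0.0], 0.5), ([0.0, 0.5, 0.5], 0.5)]  # informative responses
query_b = [([0.4, 0.3, 0.3], 0.5), ([0.3, 0.4, 0.3], 0.5)]  # barely moves the belief
best = max([query_a, query_b], key=lambda q: expected_belief_shift(q, phi))  # query_a
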
Code Example #3
  def learn(self):
    args = easyDomains.convert(self.cmp, self.rewardSet, self.phi)
    rewardCandNum = len(self.rewardSet)

    if self.queryType == QueryType.DEMONSTRATION:
      k = config.NUMBER_OF_RESPONSES
    else:
      raise Exception("query type not implemented")

    # now q is a set of TRAJECTORIES
    q = []
    for i in range(k):
      if i == 0:
        args['maxV'] = [0] * rewardCandNum
      else:
        # for each reward candidate, record the best value achieved by the policies chosen so far
        args['maxV'] = []
        for rewardId in xrange(rewardCandNum):
          args['maxV'].append(max(
              self.computeV(pi, args['S'], args['A'], args['R'][rewardId], self.cmp.horizon)
              for pi in q))
      x = lp.milp(**args)
      if self.heuristic:
        #TODO what to do with this x for demonstration purposes
        pass
      q.append(self.sampleTrajectory(x))
    
    objValue = self.getQValue(self.cmp.state, None, q)

    if self.queryType == QueryType.DEMONSTRATION:
      return q, None, objValue
    else:
      raise Exception("query type not implemented")
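The i > 0 branch builds args['maxV'], the per-candidate baseline that the next lp.milp call has to beat. A sketch of just that bookkeeping, with a hypothetical evaluate_policy(pi, reward_id) standing in for self.computeV(pi, args['S'], args['A'], args['R'][rewardId], self.cmp.horizon):

def best_values_so_far(policies, reward_cand_num, evaluate_policy):
    # for each reward candidate, the best value achieved by the policies chosen so far;
    # zeros before any policy has been chosen, matching the i == 0 case above
    if not policies:
        return [0] * reward_cand_num
    return [max(evaluate_policy(pi, reward_id) for pi in policies)
            for reward_id in range(reward_cand_num)]

# toy usage with two reward candidates and a value table instead of real policies
values = {('pi0', 0): 1.0, ('pi0', 1): 0.2,
          ('pi1', 0): 0.3, ('pi1', 1): 0.9}
print(best_values_so_far(['pi0', 'pi1'], 2, lambda pi, rid: values[(pi, rid)]))  # [1.0, 0.9]
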
Code Example #4
    def learn(self):
        args = easyDomains.convert(self.cmp, self.rewardSet, self.phi)
        rewardCandNum = len(args['R'])
        horizon = self.cmp.horizon
        k = config.NUMBER_OF_RESPONSES

        maxV = -numpy.inf
        maxQ = None

        for iterIdx in range(config.SAMPLE_TIMES):
            q = []
            for i in xrange(k):
                theta = [
                    -0.5 + random.random() for _ in xrange(self.featLength)
                ]
                q.append(self.thetaToOccupancy(theta))

            maxVs = []
            for rewardId in xrange(rewardCandNum):
                maxVs.append(
                    max([
                        self.computeV(pi, args['S'], args['A'],
                                      args['R'][rewardId], horizon) for pi in q
                    ]))
            objValue = sum(maxVs[idx] * self.phi[idx]
                           for idx in range(rewardCandNum))

            if objValue > maxV:
                maxV = objValue
                maxQ = q

        return maxQ, None
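This example is essentially random-restart search: draw k random feature-weight vectors, turn each into a policy, score the resulting query, and keep the best over SAMPLE_TIMES restarts. A hedged sketch of that loop, with hypothetical weights_to_policy and score_query callables standing in for self.thetaToOccupancy and the maxVs/phi computation above:

import random

def random_query_search(k, feat_length, sample_times, weights_to_policy, score_query):
    best_score, best_query = float('-inf'), None
    for _ in range(sample_times):
        # k random weight vectors with entries in [-0.5, 0.5), one policy each
        query = [weights_to_policy([random.random() - 0.5 for _ in range(feat_length)])
                 for _ in range(k)]
        score = score_query(query)
        if score > best_score:
            best_score, best_query = score, query
    return best_query, best_score
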
Code Example #5
File: trajAgents.py  Project: shunzh/RLCodeBase
  def learn(self):
    args = easyDomains.convert(self.cmp, self.rewardSet, self.phi)
    rewardCandNum = len(self.rewardSet)
    k = config.NUMBER_OF_RESPONSES

    hValues = util.Counter()

    for sIdx in range(config.SAMPLE_TIMES):
      s = self.cmp.sampleState()
      indices = numpy.random.choice(range(rewardCandNum), k, replace=False)
      trajs = [self.sampleTrajFromRewardCandidate(idx, s) for idx in indices]
      if any(len(u) < config.TRAJECTORY_LENGTH for u in trajs):
        continue

      # compute the difference between the new psi and the old psi
      psiProbs = self.getPossiblePhiAndProbs(trajs)
      for psi, prob in psiProbs:
        # note that we need to keep track of which state the queries are generated from
        # and which reward candidates the policies are optimizing
        hValues[(s, tuple(indices))] += prob * sum(abs(p1 - p2) for p1, p2 in zip(psi, self.phi))

    maxH = max(hValues.values())
    maxStatesIndices = filter(lambda _: hValues[_] == maxH, hValues.keys())
    maxState, maxIndices = random.choice(maxStatesIndices)
    trajs = [self.sampleTrajFromRewardCandidate(idx, maxState) for idx in maxIndices]
    return trajs, None
Code Example #6
File: trajAgents.py  Project: shunzh/RLCodeBase
  def learn(self):
    args = easyDomains.convert(self.cmp, self.rewardSet, self.phi)
    k = config.NUMBER_OF_RESPONSES

    # find an arbitrary state to generate trajectory queries from,
    # making sure that each trajectory in the query has length TRAJECTORY_LENGTH
    while True:
      s = self.cmp.sampleState()
      q = [tuple(self.sampleTrajectory(None, s, hori=config.TRAJECTORY_LENGTH, to='trajectory')) for _ in xrange(k)]
      if all(len(u) >= config.TRAJECTORY_LENGTH for u in q):
        break
    return q, None
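The loop above is plain rejection sampling: resample the start state until all k trajectories reach TRAJECTORY_LENGTH. A sketch with hypothetical sample_state and sample_traj callables; the max_tries cap is an addition to avoid looping forever, which the original does not have.

def sample_full_length_query(sample_state, sample_traj, k, traj_length, max_tries=1000):
    for _ in range(max_tries):
        s = sample_state()
        query = [tuple(sample_traj(s, traj_length)) for _ in range(k)]
        if all(len(t) >= traj_length for t in query):
            return query
    raise RuntimeError('no sampled state produced %d full-length trajectories' % k)
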
Code Example #7
    def learn(self):
        rewardSet = [self.mdp.getReward]
        psi = [1]

        # the agent is certain on the reward functions
        args = easyDomains.convert(self.mdp, rewardSet, psi)
        args['maxV'] = [-numpy.inf]

        self.agent = PolicyGradientQueryAgent(self.mdp, rewardSet, psi,
                                              QueryType.POLICY, self.feat,
                                              self.featLength, self.discount)
        self.optPi = self.agent.findNextPolicy(**args)
        self.x = lambda s, a: self.optPi(s, a)

        return self.optPi
Code Example #8
    def learn(self):
        args = easyDomains.convert(self.cmp, self.rewardSet, self.phi)
        k = config.NUMBER_OF_RESPONSES

        # find an arbitrary state to generate trajectory queries from,
        # making sure that each trajectory in the query has length TRAJECTORY_LENGTH
        while True:
            s = self.cmp.sampleState()
            q = [
                tuple(
                    self.sampleTrajectory(None,
                                          s,
                                          hori=config.TRAJECTORY_LENGTH,
                                          to='trajectory')) for _ in xrange(k)
            ]
            if all(len(u) >= config.TRAJECTORY_LENGTH for u in q):
                break
        return q, None
Code Example #9
File: QTPAgent.py  Project: shunzh/RLCodeBase
  def learn(self):
    args = easyDomains.convert(self.cmp, self.rewardSet, self.phi)
    rewardCandNum = len(args['R'])
    k = config.NUMBER_OF_RESPONSES
    assert k == 2 # not going to work for k > 2

    maxV = -numpy.inf
    maxQ = None
    for iterIdx in range(config.SAMPLE_TIMES):
      selector = [random.random() > .5 for _ in xrange(rewardCandNum)]
      # got two psis
      psi0 = [self.phi[_] if selector[_] else 0 for _ in xrange(rewardCandNum)]
      psi1 = [self.phi[_] if not selector[_] else 0 for _ in xrange(rewardCandNum)]
      agent0 = self.getFiniteVIAgent(psi0, self.cmp.horizon - self.cmp.getResponseTime(), self.cmp.terminalReward, posterior=True)
      agent1 = self.getFiniteVIAgent(psi1, self.cmp.horizon - self.cmp.getResponseTime(), self.cmp.terminalReward, posterior=True)
      
      v = agent0.getValue(self.cmp.state) + agent1.getValue(self.cmp.state) 
      if v > maxV:
        maxV = v
        maxQ = [agent0.x, agent1.x]

    return maxQ, None
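The core of this example is the random bipartition of the prior: every reward candidate's weight is assigned to exactly one of the two psi vectors, so each pair always partitions the candidates. A self-contained sketch of just that step (the agent construction and value comparison are omitted):

import random

def random_bipartition(phi):
    # split the prior phi into two disjoint psi vectors (the k = 2 case above)
    selector = [random.random() > 0.5 for _ in phi]
    psi0 = [p if chosen else 0 for p, chosen in zip(phi, selector)]
    psi1 = [p if not chosen else 0 for p, chosen in zip(phi, selector)]
    return psi0, psi1

# every candidate's weight ends up in exactly one of the two vectors
phi = [0.4, 0.3, 0.2, 0.1]
psi0, psi1 = random_bipartition(phi)
assert [a + b for a, b in zip(psi0, psi1)] == phi
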
Code Example #10
File: trajAgents.py  Project: shunzh/RLCodeBase
  def learn(self):
    args = easyDomains.convert(self.cmp, self.rewardSet, self.phi)
    rewardCandNum = len(self.rewardSet)
    k = config.NUMBER_OF_RESPONSES

    hValues = util.Counter()

    for sIdx in range(config.SAMPLE_TIMES):
      s = self.cmp.sampleState()
      indices = numpy.random.choice(range(rewardCandNum), k, replace=False)
      trajs = [self.sampleTrajFromRewardCandidate(idx, s) for idx in indices]
      if any(len(u) < config.TRAJECTORY_LENGTH for u in trajs):
        continue
   
      for i in xrange(k):
        for j in xrange(k):
          hValues[(s, tuple(indices))] += self.cmp.getTrajectoryDistance(trajs[i], trajs[j])
    maxH = max(hValues.values())
    maxStatesIndices = filter(lambda _: hValues[_] == maxH, hValues.keys())
    maxState, maxIndices = random.choice(maxStatesIndices)
    trajs = [self.sampleTrajFromRewardCandidate(idx, maxState) for idx in maxIndices]
    return trajs, None
Code Example #11
    def learn(self):
        args = easyDomains.convert(self.cmp, self.rewardSet, self.phi)
        rewardCandNum = len(args['R'])
        k = config.NUMBER_OF_RESPONSES
        assert k == 2  # not going to work for k > 2

        maxV = -numpy.inf
        maxQ = None
        for iterIdx in range(config.SAMPLE_TIMES):
            selector = [random.random() > .5 for _ in xrange(rewardCandNum)]
            # got two psis
            psi0 = [
                self.phi[_] if selector[_] else 0
                for _ in xrange(rewardCandNum)
            ]
            psi1 = [
                self.phi[_] if not selector[_] else 0
                for _ in xrange(rewardCandNum)
            ]
            agent0 = self.getFiniteVIAgent(psi0,
                                           self.cmp.horizon -
                                           self.cmp.getResponseTime(),
                                           self.cmp.terminalReward,
                                           posterior=True)
            agent1 = self.getFiniteVIAgent(psi1,
                                           self.cmp.horizon -
                                           self.cmp.getResponseTime(),
                                           self.cmp.terminalReward,
                                           posterior=True)

            v = agent0.getValue(self.cmp.state) + agent1.getValue(
                self.cmp.state)
            if v > maxV:
                maxV = v
                maxQ = [agent0.x, agent1.x]

        return maxQ, None
Code Example #12
    def learn(self):
        args = easyDomains.convert(self.cmp, self.rewardSet, self.phi)
        rewardCandNum = len(self.rewardSet)

        if self.queryType == QueryType.DEMONSTRATION:
            k = config.NUMBER_OF_RESPONSES
        else:
            raise Exception("query type not implemented")

        # now q is a set of TRAJECTORIES
        q = []
        for i in range(k):
            if i == 0:
                args['maxV'] = [0] * rewardCandNum
            else:
                # for each reward candidate, record the best value achieved by the policies chosen so far
                args['maxV'] = []
                for rewardId in xrange(rewardCandNum):
                    args['maxV'].append(
                        max([
                            self.computeV(pi, args['S'], args['A'],
                                          args['R'][rewardId],
                                          self.cmp.horizon) for pi in q
                        ]))
            x = lp.milp(**args)
            if self.heuristic:
                #TODO what to do with this x for demonstration purposes
                pass
            q.append(self.sampleTrajectory(x))

        objValue = self.getQValue(self.cmp.state, None, q)

        if self.queryType == QueryType.DEMONSTRATION:
            return q, None, objValue
        else:
            raise Exception("query type not implemented")
Code Example #13
  def learn(self):
    args = easyDomains.convert(self.cmp, self.rewardSet, self.phi)
    self.args = args # save a copy
    horizon = self.cmp.horizon
    terminalReward = self.cmp.terminalReward

    if self.queryType == QueryType.ACTION:
      k = len(args['A'])
    else:
      k = config.NUMBER_OF_RESPONSES

    # now q is a set of policy queries
    bestQ = None
    bestEUS = -numpy.inf
    
    # keep a copy of the currently added policies. may not be used.
    # note that this list is passed by reference
    
    # start with the prior optimal policy
    q = [self.getFiniteVIAgent(self.phi, horizon, terminalReward, posterior=True).x]
    args['q'] = q
    objValue = None # k won't be 1, fine

    # start adding following policies
    for i in range(1, k):
      if config.VERBOSE: print 'iter.', i
      x = self.findNextPolicy(**args)
      q.append(x)

      # query iteration
      # for each x \in q, what is q -> x; \psi? replace x with the optimal posterior policy
      if self.qi: q, objValue = self.queryIteration(args, q)

      args['q'] = q

    if self.queryType == QueryType.POLICY:
      # if asking policies directly, then return q
      #return q, objValue # THIS RETURNS EUS, NOT EPU
      return q, objValue
    if self.queryType == QueryType.PARTIAL_POLICY:
      idx = 0
      objValue = self.getQValue(self.cmp.state, None, q)
      qP = copy.copy(q)

      while True:
        # iterate over all the policies, removing one state-action pair from each,
        # but make sure the EUS of the new set is unchanged
        x = qP[idx]
        xOld = x.copy()
        
        success = False
        for key in util.randomly(x.keys()):
          x.pop(key)
          print self.getQValue(self.cmp.state, None, qP), objValue 
          if self.getQValue(self.cmp.state, None, qP) == objValue:
            success = True
            break
          else:
            x = xOld.copy()
        
        if not success: break
        #print idx, len(x)
        idx = (idx + 1) % len(q)
      
      return qP
    elif self.queryType == QueryType.DEMONSTRATION:
      # if we have already built a set of policies but the query type is demonstration,
      # we sample trajectories from these policies as the query.
      # note that another way is implemented in MILPDemoAgent, which chooses the next policy based on the demonstrated trajectories.
      qu = [self.sampleTrajectory(x) for x in q]
      return qu
    elif self.queryType in [QueryType.SIMILAR, QueryType.ACTION]:
      # implemented in a subclass, do nothing here
      pass
    else:
      raise Exception('Query type not implemented for MILP.')

    return args, q
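The PARTIAL_POLICY branch prunes state-action pairs from each policy for as long as the query's value stays at its original level. A self-contained sketch of that pruning loop, with a hypothetical query_value callable in place of self.getQValue(self.cmp.state, None, q) and policies represented as state -> action dicts as above:

import random

def prune_policies(query, query_value):
    # greedily drop state -> action entries from each policy dict in `query`
    # as long as query_value(query) stays equal to its original value
    target = query_value(query)
    pruned = [dict(pi) for pi in query]
    idx = 0
    while True:
        pi = pruned[idx]
        keys = list(pi.keys())
        random.shuffle(keys)
        removed_one = False
        for key in keys:
            action = pi.pop(key)
            if query_value(pruned) == target:
                removed_one = True
                break
            pi[key] = action  # removing this entry changed the value; put it back
        if not removed_one:
            return pruned     # stop once some policy cannot lose another entry
        idx = (idx + 1) % len(pruned)
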
Code Example #14
File: QTPAgent.py  Project: shunzh/RLCodeBase
  def learn(self):
    args = easyDomains.convert(self.cmp, self.rewardSet, self.phi)
    k = config.NUMBER_OF_RESPONSES
    responseTime = self.cmp.getResponseTime()
    horizon = self.cmp.horizon
    terminalReward = self.cmp.terminalReward
    from itertools import combinations

    rewardCandNum = len(self.rewardSet)
    maxObjValue = -numpy.inf
    optQ = None
    optPsis = None
    
    values = util.Counter()
    for i in xrange(1, rewardCandNum + 1):
      for l in combinations(range(rewardCandNum), i):
        l = [self.phi[i] if i in l else 0 for i in range(rewardCandNum)]
        if config.VERBOSE: print l
        agent = self.getFiniteVIAgent(l, horizon - responseTime, terminalReward, posterior=True)
        values[tuple(l)] = agent.getValue(self.cmp.state)
    
    for subset in combinations(values.items(), k):
      psis = map(lambda _: _[0], subset)
      qs = map(lambda _: _[1], subset)
      # make sure that such a query partitions the reward candidates
      if sum(sum(_ > 0 for _ in psi) for psi in psis) == rewardCandNum and\
         all(sum(psi[i] for psi in psis) > 0 for i in xrange(rewardCandNum)):
        objValue = sum(qs)
        if objValue > maxObjValue:
          maxObjValue = objValue
          optQ = qs
          optPsis = psis
    
    q = None
    if self.queryType == QueryType.POLICY:
      return q, maxObjValue
    elif self.queryType == QueryType.ACTION:
      hList = []
      
      # FIXME has a problem here!
      policyBins = self.computeDominatingPis(args, q)

      for s in args['S']:
        hValue = 0
        for a in args['A']:
          resProb = 0
          bins = [0] * len(q)
          for idx in xrange(rewardCandNum):
            if a in self.viAgentSet[idx].getPolicies(s):
              # increase the probability of observing this 
              resProb += self.phi[idx]
              # put opt policies into bins
              bins = [sum(_) for _ in zip(bins, policyBins[idx])]

          hValue += resProb * scipy.stats.entropy(bins)

        hList.append((s, hValue))

      # sort in nondecreasing order and keep only the smallest entry
      hList = filter(lambda _: not scipy.isnan(_[1]), hList)
      hList = sorted(hList, key=lambda _: _[1])
      hList = hList[:1]
    else:
      raise Exception('Query type not implemented for MILP.')

    qList = []
    for q, h in hList:
      # FIXME ignored transient phase
      qValue = self.getQValue(self.cmp.state, None, q)
      qList.append((q, None, qValue))

    maxQValue = max(map(lambda _:_[2], qList))
    qList = filter(lambda _: _[2] == maxQValue, qList)

    return random.choice(qList)
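The exhaustive search here first evaluates a psi vector for every nonempty subset of reward candidates, then scans every k-subset of those vectors and keeps the best one whose supports exactly partition the candidates. A sketch under the assumption of strictly positive prior weights, with a hypothetical value_of(psi) standing in for the finite-horizon VI evaluation above:

from itertools import combinations

def best_partition_query(phi, k, value_of):
    n = len(phi)
    # value of the optimal policy under each restricted belief, keyed by its psi vector
    values = {}
    for size in range(1, n + 1):
        for subset in combinations(range(n), size):
            psi = tuple(phi[i] if i in subset else 0 for i in range(n))
            values[psi] = value_of(psi)

    best_value, best_psis = float('-inf'), None
    for chosen in combinations(values.items(), k):
        psis = [psi for psi, _ in chosen]
        # accept only sets of psis whose supports are disjoint and cover all candidates
        partitions = (sum(sum(1 for p in psi if p > 0) for psi in psis) == n and
                      all(any(psi[i] > 0 for psi in psis) for i in range(n)))
        if partitions:
            total = sum(v for _, v in chosen)
            if total > best_value:
                best_value, best_psis = total, psis
    return best_psis, best_value
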
Code Example #15
    def learn(self):
        args = easyDomains.convert(self.cmp, self.rewardSet, self.phi)
        k = config.NUMBER_OF_RESPONSES
        responseTime = self.cmp.getResponseTime()
        horizon = self.cmp.horizon
        terminalReward = self.cmp.terminalReward
        from itertools import combinations

        rewardCandNum = len(self.rewardSet)
        maxObjValue = -numpy.inf
        optQ = None
        optPsis = None

        values = util.Counter()
        for i in xrange(1, rewardCandNum + 1):
            for l in combinations(range(rewardCandNum), i):
                l = [
                    self.phi[i] if i in l else 0 for i in range(rewardCandNum)
                ]
                if config.VERBOSE: print l
                agent = self.getFiniteVIAgent(l,
                                              horizon - responseTime,
                                              terminalReward,
                                              posterior=True)
                values[tuple(l)] = agent.getValue(self.cmp.state)

        for subset in combinations(values.items(), k):
            psis = map(lambda _: _[0], subset)
            qs = map(lambda _: _[1], subset)
            # make sure that such a query partitions the reward candidates
            if sum(sum(_ > 0 for _ in psi) for psi in psis) == rewardCandNum and\
               all(sum(psi[i] for psi in psis) > 0 for i in xrange(rewardCandNum)):
                objValue = sum(qs)
                if objValue > maxObjValue:
                    maxObjValue = objValue
                    optQ = qs
                    optPsis = psis

        q = None
        if self.queryType == QueryType.POLICY:
            return q, maxObjValue
        elif self.queryType == QueryType.ACTION:
            hList = []

            # FIXME has a problem here!
            policyBins = self.computeDominatingPis(args, q)

            for s in args['S']:
                hValue = 0
                for a in args['A']:
                    resProb = 0
                    bins = [0] * len(q)
                    for idx in xrange(rewardCandNum):
                        if a in self.viAgentSet[idx].getPolicies(s):
                            # increase the probability of observing this
                            resProb += self.phi[idx]
                            # put opt policies into bins
                            bins = [sum(_) for _ in zip(bins, policyBins[idx])]

                    hValue += resProb * scipy.stats.entropy(bins)

                hList.append((s, hValue))

            # sort in nondecreasing order and keep only the smallest entry
            hList = filter(lambda _: not scipy.isnan(_[1]), hList)
            hList = sorted(hList, key=lambda _: _[1])
            hList = hList[:1]
        else:
            raise Exception('Query type not implemented for MILP.')

        qList = []
        for q, h in hList:
            # FIXME ignored transient phase
            qValue = self.getQValue(self.cmp.state, None, q)
            qList.append((q, None, qValue))

        maxQValue = max(map(lambda _: _[2], qList))
        qList = filter(lambda _: _[2] == maxQValue, qList)

        return random.choice(qList)
Code Example #16
    def learn(self):
        args = easyDomains.convert(self.cmp, self.rewardSet, self.phi)
        self.args = args  # save a copy
        horizon = self.cmp.horizon
        terminalReward = self.cmp.terminalReward

        if self.queryType == QueryType.ACTION:
            k = len(args['A'])
        else:
            k = config.NUMBER_OF_RESPONSES

        # now q is a set of policy queries
        bestQ = None
        bestEUS = -numpy.inf

        # keep a copy of the currently added policies. may not be used.
        # note that this list is passed by reference

        # start with the prior optimal policy
        q = [
            self.getFiniteVIAgent(self.phi,
                                  horizon,
                                  terminalReward,
                                  posterior=True).x
        ]
        args['q'] = q
        objValue = None  # k won't be 1, fine

        # start adding following policies
        for i in range(1, k):
            if config.VERBOSE: print 'iter.', i
            x = self.findNextPolicy(**args)
            q.append(x)

            # query iteration
            # for each x \in q, what is q -> x; \psi? replace x with the optimal posterior policy
            if self.qi: q, objValue = self.queryIteration(args, q)

            args['q'] = q

        if self.queryType == QueryType.POLICY:
            # if asking policies directly, then return q
            #return q, objValue # THIS RETURNS EUS, NOT EPU
            return q, objValue
        if self.queryType == QueryType.PARTIAL_POLICY:
            idx = 0
            objValue = self.getQValue(self.cmp.state, None, q)
            qP = copy.copy(q)

            while True:
                # iterate over all the policies, removing one state-action pair from each,
                # but make sure the EUS of the new set is unchanged
                x = qP[idx]
                xOld = x.copy()

                success = False
                for key in util.randomly(x.keys()):
                    x.pop(key)
                    print self.getQValue(self.cmp.state, None, qP), objValue
                    if self.getQValue(self.cmp.state, None, qP) == objValue:
                        success = True
                        break
                    else:
                        x = xOld.copy()

                if not success: break
                #print idx, len(x)
                idx = (idx + 1) % len(q)

            return qP
        elif self.queryType == QueryType.DEMONSTRATION:
            # if we have already built a set of policies but the query type is demonstration,
            # we sample trajectories from these policies as the query.
            # note that another way is implemented in MILPDemoAgent, which chooses the next policy based on the demonstrated trajectories.
            qu = [self.sampleTrajectory(x) for x in q]
            return qu
        elif self.queryType in [QueryType.SIMILAR, QueryType.ACTION]:
            # implemented in a subclass, do nothing here
            pass
        else:
            raise Exception('Query type not implemented for MILP.')

        return args, q