class DemonExperiment(object):
    """Nexting: one on-policy TD(lambda) prediction demon per sensor."""
    Latency = 100  # duration of one time step, in milliseconds
    
    def __init__(self):
        command = CritterbotSimulator.startSimulator()
        self.environment = CritterbotSimulator(command)
        self.latencyTimer = Chrono()
        self.rewards = self.createRewardFunction()
        self.actions = XYThetaAction.sevenActions()
        self.behaviourPolicy = RandomPolicy(Random(0), self.actions)
        self.representation = TileCodersNoHashing(self.environment.legend().nbLabels(), -2000, 2000)
        self.representation.includeActiveFeature()
        self.demons = []
        for rewardFunction in self.rewards:
            demon = self.createOnPolicyPredictionDemon(rewardFunction)
            self.demons.append(demon)
        self.horde = Horde()
        self.horde.demons().addAll(self.demons)
        self.horde.beforeFunctions().addAll(self.rewards)
        self.x_t = None
        self.clock = zepy.clock("Nexting Clock")

    def createRewardFunction(self):
        # One reward function per sensor declared in the environment's legend
        legend = self.environment.legend()
        return [SensorRewardFunction(legend, label) for label in legend.getLabels()]

    def createOnPolicyPredictionDemon(self, rewardFunction):
        # TD(lambda) with a step size normalized by the representation's vector norm
        gamma = .9
        alpha = .1 / self.representation.vectorNorm()
        nbFeatures = self.representation.vectorSize()
        lambda_ = .3
        td = TDLambda(lambda_, gamma, alpha, nbFeatures)
        return PredictionDemon(rewardFunction, td)
        
    def learn(self, a_t, o_tp1):
        # Project the new observation into feature space, then update every demon
        x_tp1 = self.representation.project(o_tp1.doubleValues())
        self.horde.update(o_tp1, self.x_t, a_t, x_tp1)
        self.x_t = Vectors.bufferedCopy(x_tp1, self.x_t)
        
    def run(self):
        a_t = None
        while self.clock.tick():
            self.latencyTimer.start()
            o_tp1 = self.environment.waitNewRawObs()
            self.learn(a_t, o_tp1)
            self.behaviourPolicy.update(None)
            a_tp1 = self.behaviourPolicy.sampleAction()
            self.environment.sendAction(a_tp1)
            a_t = a_tp1
            # Sleep for whatever remains of the Latency-millisecond time step
            waitingTime = self.Latency - self.latencyTimer.getCurrentMillis()
            if waitingTime > 0:
                time.sleep(waitingTime / 1000.0)
        self.environment.close()
                
    def zephyrize(self):
        zepy.advertise(self.clock, self.environment)
        zepy.advertise(self.clock, self.horde)
        for rewardFunction in self.rewards:
            zepy.monattr(self.clock, rewardFunction, 'rewardValue', label=rewardFunction.label)
class DemonExperiment(object):
    """Off-policy control: one GreedyGQ control demon per motor-current sensor."""
    Latency = 100  # duration of one time step, in milliseconds
    
    def __init__(self):
        command = CritterbotSimulator.startSimulator()
        self.environment = CritterbotSimulator(command)
        self.latencyTimer = Chrono()
        self.rewards = self.createRewardFunction()
        self.actions = XYThetaAction.sevenActions()
        self.behaviourPolicy = RandomPolicy(Random(0), self.actions)
        self.representation = TileCodersNoHashing(self.environment.legend().nbLabels(), -2000, 2000)
        self.representation.includeActiveFeature()
        self.demons = []
        for rewardFunction in self.rewards:
            self.demons.append(self.createOffPolicyControlDemon(rewardFunction))
        self.horde = Horde()
        self.horde.demons().addAll(self.demons)
        self.horde.beforeFunctions().addAll(self.rewards)
        self.x_t = None
        self.clock = zepy.clock("Horde Off-policy Control demons")

    def createRewardFunction(self):
        legend = self.environment.legend()
        return [ SensorRewardFunction(legend, 'MotorCurrent0'),
                 SensorRewardFunction(legend, 'MotorCurrent1'),
                 SensorRewardFunction(legend, 'MotorCurrent2') ]

    def createOffPolicyControlDemon(self, rewardFunction):
        # GQ learns state-action values over a tabular-action encoding of the
        # tile-coded features; the greedy policy on those values is the target
        # of off-policy learning while the random behaviour policy acts
        toStateAction = TabularAction(self.actions, self.representation.vectorNorm(), self.representation.vectorSize())
        nbFeatures = toStateAction.vectorSize()
        lambda_ = 0.1
        beta = .1
        alpha_v = .1 / toStateAction.vectorNorm()
        alpha_w = .001 / toStateAction.vectorNorm()
        gq = GQ(alpha_v, alpha_w, beta, lambda_, nbFeatures)
        targetPolicy = Greedy(gq, self.actions, toStateAction)
        controlGQ = GreedyGQ(gq, self.actions, toStateAction, targetPolicy, self.behaviourPolicy)
        return ControlOffPolicyDemon(rewardFunction, controlGQ)
        
    def learn(self, a_t, o_tp1):
        x_tp1 = self.representation.project(o_tp1.doubleValues())
        self.horde.update(o_tp1, self.x_t, a_t, x_tp1)
        self.x_t = Vectors.bufferedCopy(x_tp1, self.x_t)
        
    def run(self):
        a_t = None
        while self.clock.tick():
            self.latencyTimer.start()
            o_tp1 = self.environment.waitNewRawObs()
            self.learn(a_t, o_tp1)
            self.behaviourPolicy.update(None)
            a_tp1 = self.behaviourPolicy.sampleAction()
            self.environment.sendAction(a_tp1)
            a_t = a_tp1
            waitingTime = self.Latency - self.latencyTimer.getCurrentMillis()
            if waitingTime > 0:
                time.sleep(waitingTime / 1000.0)
        self.environment.close()
                
    def zephyrize(self):
        zepy.advertise(self.clock, self.environment)
        zepy.advertise(self.clock, self.horde)
        for rewardFunction in self.rewards:
            zepy.monattr(self.clock, rewardFunction, 'rewardValue', label=rewardFunction.label)
class DemonExperiment(object):
    """Off-policy prediction: GTD(lambda) demons learning, while following the
    random behaviour policy, predictions about a fixed target policy (always
    the Left action)."""
    Latency = 100  # duration of one time step, in milliseconds
    
    def __init__(self):
        command = CritterbotSimulator.startSimulator()
        self.environment = CritterbotSimulator(command)
        self.latencyTimer = Chrono()
        self.rewards = self.createRewardFunction()
        self.actions = XYThetaAction.sevenActions()
        self.behaviourPolicy = RandomPolicy(Random(0), self.actions)
        self.representation = TileCodersNoHashing(self.environment.legend().nbLabels(), -2000, 2000)
        self.representation.includeActiveFeature()
        self.demonsScheduler = DemonScheduler()
        self.demons = []
        for rewardFunction in self.rewards:
            targetPolicy = SingleActionPolicy(XYThetaAction.Left)
            demon = self.createOffPolicyPredictionDemon(rewardFunction, targetPolicy)
            self.demons.append(demon)
        self.x_t = None
        self.clock = zepy.clock("Horde Off-policy Predictions")

    def createRewardFunction(self):
        legend = self.environment.legend()
        return [ SensorRewardFunction(legend, 'MotorCurrent0'),
                 SensorRewardFunction(legend, 'MotorCurrent1'),
                 SensorRewardFunction(legend, 'MotorCurrent2') ]

    def createOffPolicyPredictionDemon(self, rewardFunction, targetPolicy):
        # GTD(lambda) uses a secondary weight vector (step size alpha_w) for
        # its off-policy correction
        gamma = .9
        lambda_ = .2
        alpha_v = .1 / self.representation.vectorNorm()
        alpha_w = .001 / self.representation.vectorNorm()
        nbFeatures = self.representation.vectorSize()
        gtd = GTDLambda(lambda_, gamma, alpha_v, alpha_w, nbFeatures)
        return PredictionOffPolicyDemon(targetPolicy, self.behaviourPolicy, gtd, rewardFunction)
        
    def learn(self, a_t, o_tp1):
        # With no Horde here, the reward functions are updated by hand before
        # the scheduler runs the demons
        for rewardFunction in self.rewards:
            rewardFunction.update(o_tp1)
        x_tp1 = self.representation.project(o_tp1)
        self.demonsScheduler.update(self.demons, self.x_t, a_t, x_tp1)
        self.x_t = x_tp1
        
    def run(self):
        a_t = None
        while self.clock.tick():
            self.latencyTimer.start()
            o_tp1 = self.environment.waitNewObs()
            self.learn(a_t, o_tp1)
            a_tp1 = self.behaviourPolicy.decide(None)
            self.environment.sendAction(a_tp1)
            a_t = a_tp1
            waitingTime = self.Latency - self.latencyTimer.getCurrentMillis()
            if waitingTime > 0:
                time.sleep(waitingTime / 1000.0)
        self.environment.close()
                
    def zephyrize(self):
        zepy.advertise(self.clock, self.environment)
        zepy.advertise(self.clock, self.demonsScheduler)
        for rewardFunction in self.rewards:
            zepy.monattr(self.clock, rewardFunction, 'rewardValue', label=rewardFunction.label)
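
# A minimal usage sketch (assuming whichever DemonExperiment class above is
# the one defined in the running module): advertise the monitored objects to
# Zephyr, then enter the experiment loop.
if __name__ == '__main__':
    experiment = DemonExperiment()
    experiment.zephyrize()
    experiment.run()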