Example #1
def combine_files():
    path = os.path.join("..", "..", "data", "delivery", "flat", "dataoutput_2")

    data = []
    for i in range(10):
        try:
            data += [HRLutils.load_data(path + ".%s.txt" % i)]
        except IOError:
            continue

    print "found %s files to combine" % len(data)
    print len(data[0]), "records"

    starttime = 0.0
    newdata = [[] for _ in data[0]]
    for d in data:
        if len(d) != len(newdata):
            print "uh oh, number of records is wrong"
            print len(d), len(newdata)
        for i, record in enumerate(d):
            for entry in record:
                newdata[i] += [[entry[0] + starttime, entry[1]]]
        starttime = newdata[0][-1][0]

    HRLutils.save_data(path + "_combined.txt", newdata)
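
Note: the key step in combine_files is offsetting each file's timestamps by
the end time of the previous file, so the combined series is continuous. A
minimal standalone sketch of that logic (simplified to one record per file;
the nested data layout is assumed, not taken from HRLutils):

runs = [[[0.0, 1], [1.0, 2]], [[0.0, 3], [1.0, 4]]]  # two hypothetical files
combined, offset = [], 0.0
for run in runs:
    combined += [[t + offset, v] for t, v in run]
    offset = combined[-1][0]
print combined  # [[0.0, 1], [1.0, 2], [1.0, 3], [2.0, 4]]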
Example #2
    def gen_encoders(self, N, contextD, context_scale):
        """Generate encoders for state population of learning agent.

        :param N: number of neurons in state population
        :param contextD: dimension of context vector representation
        :param context_scale: weight on context representation relative to
            state (1.0 = equal weighting)
        """

        if contextD > 0:
            contexts = MU.I(contextD)
        else:
            contexts = [[]]

        # neurons each sensitive to different combinations of stimuli
        encs = (list(MU.I(self.stateD)) +
                [o + s + c
                 for o in MU.I(self.num_orientations)
                 for s in MU.I(self.num_shapes)
                 for c in MU.I(self.num_colours)])

        return [HRLutils.normalize(
            HRLutils.normalize(random.choice(encs)) +
            [x * context_scale for x in random.choice(contexts)])
            for _ in range(N)]
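
A standalone sketch of the encoder construction above, assuming
HRLutils.normalize is ordinary vector normalization and MU.I returns an
identity matrix (both assumptions; only the standard library is used):

import math
import random

def normalize(v):
    norm = math.sqrt(sum(x * x for x in v))
    return [x / norm for x in v] if norm > 0 else v

def identity(d):
    return [[1.0 if i == j else 0.0 for j in range(d)] for i in range(d)]

encs = identity(3)       # stand-in for the stimulus-combination encoders
contexts = identity(2)   # stand-in for MU.I(contextD)
context_scale = 1.0
encoder = normalize(normalize(random.choice(encs)) +
                    [x * context_scale for x in random.choice(contexts)])
print encoder  # a unit-length vector over stimulus + context dimensions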
Example #3
    def __init__(self, N, d, name="PositiveBias"):
        """Builds the PositiveBias network.

        :param N: base number of neurons
        :param d: dimension of input signal
        :param name: name for network
        """

        self.name = name
        net = nef.Network(self, seed=HRLutils.SEED, quick=False)

        tauPSC = 0.007
        biaslevel = 0.03  # the value to be output for negative inputs

        # threshold the input signal to detect positive values
        nfac = HRLutils.node_fac()
        nfac.setIntercept(IndicatorPDF(0, 0.1))
        neg_thresh = net.make_array("neg_thresh",
                                    N,
                                    d,
                                    encoders=[[1]],
                                    node_factory=nfac)
        neg_thresh.addDecodedTermination("input", MU.I(d), tauPSC, False)

        # create a population that tries to output biaslevel across
        # all dimensions
        bias_input = net.make_input("bias_input", [biaslevel])
        bias_pop = net.make_array(
            "bias_pop",
            N,
            d,
            node_factory=HRLutils.node_fac(),
            eval_points=[[x * 0.01] for x in range(0, biaslevel * 200)])

        net.connect(bias_input, bias_pop, pstc=tauPSC)

        # the individual dimensions of bias_pop are then inhibited by the
        # output of neg_thresh (so any positive values don't get the bias)
        net.connect(neg_thresh,
                    bias_pop,
                    pstc=tauPSC,
                    func=lambda x: [1.0] if x[0] > 0 else [0.0],
                    transform=[[-10 if i == k else 0 for k in range(d)]
                               for i in range(d)
                               for _ in range(bias_pop.getNeurons() / d)])

        # the whole population is inhibited by the learn signal, so that it
        # outputs 0 if the system isn't supposed to be learning
        bias_pop.addTermination("learn",
                                [[-10] for _ in range(bias_pop.getNeurons())],
                                tauPSC, False)

        self.exposeTermination(neg_thresh.getTermination("input"), "input")
        self.exposeTermination(bias_pop.getTermination("learn"), "learn")
        self.exposeOrigin(bias_pop.getOrigin("X"), "X")
Example #4
def optimal_run(seed=None):
    if seed is not None:
        HRLutils.set_seed(seed)
    seed = HRLutils.SEED

    net = nef.Network("optimal_run")

    actions = [("up", [0, 1]), ("right", [1, 0]), ("down", [0, -1]),
               ("left", [-1, 0])]

    env = deliveryenvironment.DeliveryEnvironment(
        actions,
        HRLutils.datafile("contextmap.bmp"),
        colormap={
            -16777216: "wall",
            -1: "floor",
            -256: "a",
            -2088896: "b"
        },
        imgsize=(5, 5),
        dx=0.001,
        placedev=0.5)
    net.add(env)

    class ActionRelay(nef.SimpleNode):
        def __init__(self):
            self.action = actions[0]

            nef.SimpleNode.__init__(self, "ActionRelay")

        def tick(self):
            pass

        def termination_action_in(self, x, dimensions=4):
            self.action = actions[x.index(max(x))]

        def origin_action_out(self):
            return self.action[1]

    em = ActionRelay()
    net.add(em)

    net.connect(env.getOrigin("optimal_move"), em.getTermination("action_in"))
    net.connect(em.getOrigin("action_out"), env.getTermination("action"))

    data = datanode.DataNode(period=5,
                             filename=HRLutils.datafile("dataoutput_%s.txt" %
                                                        seed))
    net.add(data)
    data.record(env.getOrigin("reward"))

    #     net.add_to_nengo()
    net.run(1000)
Example #5
    def tick(self):
        cond_active = False
        for c in self.conds:
            if isinstance(c, Timer):
                # if it is a timer entry, just update the timer and check if it
                # has expired
                c.tick()
                if c.ring():
                    self.reward = self.rewardval
                    self.activate()
                    c.reset()
                    cond_active = True

            elif (self.env.is_in(self.env.state, c) and
                  (self.conds[c] is None or
                   HRLutils.similarity(HRLutils.normalize(self.context),
                                       self.conds[c]) > 0.3)):
                # if it is a state entry, check if the agent is in the region
                # associated with that state, and check if that region is the
                # one corresponding to the currently selected context

                self.reward = self.rewardval

                self.rewardamount += 1
                if self.rewardamount > self.rewardresetamount:
                    self.activate()
                    self.rewardamount = 0

                cond_active = True

        # if no termination conditions met, just give default reward
        if not cond_active:
            self.reward = self.defaultreward

        # reset rewardamount when the reset signal is sent (so that there won't
        # be any leftover rewardamount from the agent's previous decision)
        if self.t > self.resettime[0] and self.t < self.resettime[1]:
            self.rewardamount = 0

        # add a penalty if the state hasn't changed (to help prevent agent from
        # getting stuck)
        if sum(self.prev_state) != 0 and \
                HRLutils.similarity(HRLutils.normalize(self.env.state),
                                    HRLutils.normalize(self.prev_state)) < 1.0:
            self.state_penalty = 0.0
        else:
            self.state_penalty += 0.0001
        self.prev_state = copy.deepcopy(self.env.state)

        self.reward = self.reward - self.state_penalty
Example #6
    def tick(self):
        cond_active = False
        for c in self.conds:
            if isinstance(c, Timer):
                # if it is a timer entry, just update the timer and check if it
                # has expired
                c.tick()
                if c.ring():
                    self.reward = self.rewardval
                    self.activate()
                    c.reset()
                    cond_active = True

            elif (self.env.is_in(self.env.state, c)
                  and (self.conds[c] is None or HRLutils.similarity(
                      HRLutils.normalize(self.context), self.conds[c]) > 0.3)):
                # if it is a state entry, check if the agent is in the region
                # associated with that state, and check if that region is the
                # one corresponding to the currently selected context

                self.reward = self.rewardval

                self.rewardamount += 1
                if self.rewardamount > self.rewardresetamount:
                    self.activate()
                    self.rewardamount = 0

                cond_active = True

        # if no termination conditions met, just give default reward
        if not cond_active:
            self.reward = self.defaultreward

        # reset rewardamount when the reset signal is sent (so that there won't
        # be any leftover rewardamount from the agent's previous decision)
        if self.t > self.resettime[0] and self.t < self.resettime[1]:
            self.rewardamount = 0

        # add a penalty if the state hasn't changed (to help prevent agent from
        # getting stuck)
        if sum(self.prev_state) != 0 and \
                HRLutils.similarity(HRLutils.normalize(self.env.state),
                                    HRLutils.normalize(self.prev_state)) < 1.0:
            self.state_penalty = 0.0
        else:
            self.state_penalty += 0.0001
        self.prev_state = copy.deepcopy(self.env.state)

        self.reward = self.reward - self.state_penalty
Example #7
    def __init__(self, N, d, name="PositiveBias"):
        """Builds the PositiveBias network.

        :param N: base number of neurons
        :param d: dimension of input signal
        :param name: name for network
        """

        self.name = name
        net = nef.Network(self, seed=HRLutils.SEED, quick=False)

        tauPSC = 0.007
        biaslevel = 0.03  # the value to be output for negative inputs

        # threshold the input signal to detect positive values
        nfac = HRLutils.node_fac()
        nfac.setIntercept(IndicatorPDF(0, 0.1))
        neg_thresh = net.make_array("neg_thresh", N, d, encoders=[[1]],
                                    node_factory=nfac)
        neg_thresh.addDecodedTermination("input", MU.I(d), tauPSC, False)

        # create a population that tries to output biaslevel across
        # all dimensions
        bias_input = net.make_input("bias_input", [biaslevel])
        bias_pop = net.make_array("bias_pop", N, d,
                                  node_factory=HRLutils.node_fac(),
                                  eval_points=[[x * 0.01] for x in
                                               range(0, biaslevel * 200)])

        net.connect(bias_input, bias_pop, pstc=tauPSC)

        # the individual dimensions of bias_pop are then inhibited by the
        # output of neg_thresh (so any positive values don't get the bias)
        net.connect(neg_thresh, bias_pop, pstc=tauPSC,
                    func=lambda x: [1.0] if x[0] > 0 else [0.0],
                    transform=[[-10 if i == k else 0 for k in range(d)]
                               for i in range(d) for _ in
                               range(bias_pop.getNeurons() / d)])

        # the whole population is inhibited by the learn signal, so that it
        # outputs 0 if the system isn't supposed to be learning
        bias_pop.addTermination("learn", [[-10] for _ in
                                          range(bias_pop.getNeurons())],
                                tauPSC, False)

        self.exposeTermination(neg_thresh.getTermination("input"), "input")
        self.exposeTermination(bias_pop.getTermination("learn"), "learn")
        self.exposeOrigin(bias_pop.getOrigin("X"), "X")
Example #8
    def calc_optimal_move(self):
        """Calculate the optimal move for the agent to take in the current
        state/context."""

        # basically the same as PlaceCellEnvironment.calc_optimal_move, except
        # we look at the current context to find the goal

        goal = [c for c in self.contexts
                if self.contexts[c] == self.context][0]

        stepsize = 0.1
        self.optimal_move = None
        for y in [v * stepsize for v in range(int(-self.imgsize[1] /
                                                  (2 * stepsize)) + 1,
                                              int(self.imgsize[1] /
                                                  (2 * stepsize)) - 1)]:
            for x in [v * stepsize for v in range(int(-self.imgsize[0] /
                                                      (2 * stepsize)) + 1,
                                                  int(self.imgsize[0] /
                                                      (2 * stepsize)) - 1)]:
                if self.is_in((x, y), goal):
                    angle = math.atan2(y - self.state[1], x - self.state[0])
                    pt = (math.cos(angle), math.sin(angle))
                    self.optimal_move = max(
                        self.actions, key=lambda x:-1 if
                        self.is_in((x[1][0] * self.dx + self.state[0],
                                    x[1][1] * self.dx + self.state[1]),
                                   "wall")
                        else HRLutils.similarity(x[1], pt))[0]
                    return
Example #9
    def calc_optimal_move(self):
        """Calculate the optimal move for the agent to take in the current
        state/context."""

        # basically the same as PlaceCellEnvironment.calc_optimal_move, except
        # we look at whether or not we have the package to pick a goal state

        stepsize = 0.1
        self.optimal_move = None
        for y in [
                v * stepsize for v in range(
                    int(-self.imgsize[1] / (2 * stepsize)) + 1,
                    int(self.imgsize[1] / (2 * stepsize)) - 1)
        ]:
            for x in [
                    v * stepsize for v in range(
                        int(-self.imgsize[0] / (2 * stepsize)) + 1,
                        int(self.imgsize[0] / (2 * stepsize)) - 1)
            ]:
                if ((self.is_in((x, y), "a") and not self.in_hand)
                        or (self.is_in((x, y), "b") and self.in_hand)):
                    angle = math.atan2(y - self.state[1], x - self.state[0])
                    pt = (math.cos(angle), math.sin(angle))
                    self.optimal_move = max(
                        self.actions,
                        key=lambda x: -1
                        if self.is_in((x[1][0] * self.dx + self.state[0], x[1][
                            1] * self.dx + self.state[1]), "wall"
                                      ) else HRLutils.similarity(x[1], pt))[0]

                    return
Example #10
    def calc_optimal_move(self):
        """Calculates the optimal move for the agent to make in the current state.

        Used for debugging mainly.
        """

        # grid search the image with the given stepsize
        stepsize = 0.1
        self.optimal_move = None
        for y in [v * stepsize for v in range(int(-self.imgsize[1] / (2 * stepsize)) + 1,
                                            int(self.imgsize[1] / (2 * stepsize)) - 1)]:
            for x in [v * stepsize for v in range(int(-self.imgsize[0] / (2 * stepsize)) + 1,
                                                int(self.imgsize[0] / (2 * stepsize)) - 1)]:
                # if the pt you're looking at is in the region you're looking for
                if self.is_in((x, y), "target"):
                    # generate a target point in the direction from current location to target
                    angle = math.atan2(y - self.state[1], x - self.state[0])
                    pt = (math.cos(angle), math.sin(angle))

                    # pick the action that is closest to the target point
                    # note: penalize actions that would involve moving through a wall
                    self.optimal_move = max(self.actions, key=lambda x:-1
                                            if self.is_in((x[1][0] * self.dx + self.state[0],
                                                           x[1][1] * self.dx + self.state[1]),
                                                          "wall")
                                            else HRLutils.similarity(x[1], pt))[0]
                    return
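
The selection rule in calc_optimal_move amounts to: score each action by how
well it points toward the target and veto actions that would step into a
wall. A self-contained sketch of that rule (is_wall stands in for
self.is_in(..., "wall"), and HRLutils.similarity is assumed to be a
dot-product-style measure):

import math

def pick_move(state, target, actions, dx, is_wall):
    angle = math.atan2(target[1] - state[1], target[0] - state[0])
    goal_dir = (math.cos(angle), math.sin(angle))

    def score(action):
        vec = action[1]
        nxt = (state[0] + vec[0] * dx, state[1] + vec[1] * dx)
        if is_wall(nxt):
            return -1  # penalize moving through a wall
        return vec[0] * goal_dir[0] + vec[1] * goal_dir[1]

    return max(actions, key=score)[0]

actions = [("up", [0, 1]), ("right", [1, 0]), ("down", [0, -1]),
           ("left", [-1, 0])]
print pick_move((0.0, 0.0), (1.0, 2.0), actions, 0.001, lambda p: False)
# -> "up" (the target lies mostly above the agent)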
Example #11
    def calc_optimal_move(self):
        """Calculate the optimal move for the agent to take in the current
        state/context."""

        # basically the same as PlaceCellEnvironment.calc_optimal_move, except
        # we look at the current context to find the goal

        goal = [c for c in self.contexts
                if self.contexts[c] == self.context][0]

        stepsize = 0.1
        self.optimal_move = None
        for y in [
                v * stepsize for v in range(
                    int(-self.imgsize[1] / (2 * stepsize)) + 1,
                    int(self.imgsize[1] / (2 * stepsize)) - 1)
        ]:
            for x in [
                    v * stepsize for v in range(
                        int(-self.imgsize[0] / (2 * stepsize)) + 1,
                        int(self.imgsize[0] / (2 * stepsize)) - 1)
            ]:
                if self.is_in((x, y), goal):
                    angle = math.atan2(y - self.state[1], x - self.state[0])
                    pt = (math.cos(angle), math.sin(angle))
                    self.optimal_move = max(
                        self.actions,
                        key=lambda x: -1
                        if self.is_in((x[1][0] * self.dx + self.state[0], x[1][
                            1] * self.dx + self.state[1]), "wall"
                                      ) else HRLutils.similarity(x[1], pt))[0]
                    return
Example #12
    def calc_optimal_move(self):
        """Calculate the optimal move for the agent to take in the current
        state/context."""

        # basically the same as PlaceCellEnvironment.calc_optimal_move, except
        # we look at whether or not we have the package to pick a goal state

        stepsize = 0.1
        self.optimal_move = None
        for y in [v * stepsize for v in
                  range(int(-self.imgsize[1] / (2 * stepsize)) + 1,
                        int(self.imgsize[1] / (2 * stepsize)) - 1)]:
            for x in [v * stepsize for v in
                      range(int(-self.imgsize[0] / (2 * stepsize)) + 1,
                            int(self.imgsize[0] / (2 * stepsize)) - 1)]:
                if ((self.is_in((x, y), "a") and not self.in_hand) or
                        (self.is_in((x, y), "b") and self.in_hand)):
                    angle = math.atan2(y - self.state[1], x - self.state[0])
                    pt = (math.cos(angle), math.sin(angle))
                    self.optimal_move = max(
                        self.actions, key=lambda x:-1
                        if self.is_in((x[1][0] * self.dx + self.state[0],
                                       x[1][1] * self.dx + self.state[1]),
                                      "wall")
                        else HRLutils.similarity(x[1], pt))[0]

                    return
Example #13
def test_terminationnode():
    net = nef.Network("testTerminationNode")

    actions = [("up", [0, 1]), ("right", [1, 0]), ("down", [0, -1]), ("left", [-1, 0])]
    env = deliveryenvironment.DeliveryEnvironment(
        actions,
        HRLutils.datafile("contextmap.bmp"),
        colormap={-16777216: "wall", -1: "floor", -256: "a", -2088896: "b"},
        imgsize=(5, 5),
        dx=0.001,
        placedev=0.5,
    )
    net.add(env)

    term_node = terminationnode.TerminationNode(
        {"a": [0, 1], "b": [1, 0], terminationnode.Timer((30, 30)): None}, env, contextD=2, rewardval=1
    )
    net.add(term_node)

    print term_node.conds

    context_input = net.make_input("contextinput", {0.0: [0, 0.1], 0.5: [1, 0], 1.0: [0, 1]})
    net.connect(context_input, term_node.getTermination("context"))

    net.add_to_nengo()
    net.view()
Example #14
    def __init__(self,
                 actions,
                 mapname,
                 colormap,
                 name="PlaceCellEnvironment",
                 imgsize=(1.0, 1.0),
                 dx=0.01,
                 placedev=0.1,
                 num_places=None):
        """Initialize environment variables.

        :param actions: actions available to the system
            :type actions: list of tuples (action_name,action_vector)
        :param mapname: name of file describing environment map
        :param colormap: dict mapping pixel colours to labels
        :param name: name for environment
        :param imgsize: (width, height) of the space represented by the map
            image
        :param dx: distance agent moves each timestep
        :param placedev: standard deviation of gaussian place cell activations
        :param num_places: number of placecells to use (if None it will attempt
            to fill the space)
        """

        EnvironmentTemplate.__init__(self, name, 2, actions)

        # parameters
        self.colormap = colormap
        self.rewardamount = 0  # number of timesteps spent in reward

        # number of timesteps to spend in reward before agent is reset
        # note: convenient to express this as time_in_reward / dt
        self.rewardresetamount = 0.6 / 0.001

        self.num_actions = len(actions)
        self.imgsize = [float(x) for x in imgsize]
        self.dx = dx
        self.placedev = placedev
        self.num_places = num_places
        self.optimal_move = None
        self.defaultreward = -0.075

        # load environment
        self.map = ImageIO.read(File(HRLutils.datafile(mapname)))

        # generate place cells
        self.gen_placecells(min_spread=1.0 * placedev)

        # initial conditions
        self.state = self.random_location(avoid=["wall", "target"])
        self.place_activations = [0 for _ in self.placecells]

        self.create_origin("place", lambda: self.place_activations)

        # note: making the value small, so that the noise node will give us
        # some random exploration as well
        self.create_origin(
            "optimal_move", lambda:
            [0.1 if self.optimal_move == a[0] else 0.0 for a in self.actions])
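
The docstring above describes placedev as the standard deviation of Gaussian
place cell activations; a generic sketch of that kind of tuning curve
(illustrative only, not taken from this class):

import math

def place_activation(agent_xy, cell_xy, placedev):
    dist_sq = ((agent_xy[0] - cell_xy[0]) ** 2 +
               (agent_xy[1] - cell_xy[1]) ** 2)
    return math.exp(-dist_sq / (2 * placedev ** 2))

print place_activation((0.0, 0.0), (0.1, 0.0), 0.1)  # exp(-0.5) ~= 0.61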
Example #15
def run_gridworld(args, seed=None):

    if seed is not None:
        HRLutils.set_seed(seed)
    seed = HRLutils.SEED

    net = nef.Network("run_gridworld")

    stateN = 400
    stateD = 2
    actions = [("up", [0, 1]), ("right", [1, 0]), ("down", [0, -1]),
               ("left", [-1, 0])]

    agent = smdpagent.SMDPAgent(stateN, stateD, actions, stateradius=3, **args)
    net.add(agent)

    env = gridworldenvironment.GridWorldEnvironment(
        stateD,
        actions,
        HRLutils.datafile("potjansgrid.txt"),
        cartesian=True,
        delay=(0.6, 0.9),
        datacollection=False)
    net.add(env)

    net.connect(env.getOrigin("state"), agent.getTermination("state_input"))
    net.connect(env.getOrigin("reward"), agent.getTermination("reward"))
    net.connect(env.getOrigin("reset"), agent.getTermination("reset"))
    net.connect(env.getOrigin("learn"), agent.getTermination("learn"))
    net.connect(env.getOrigin("reset"), agent.getTermination("save_state"))
    net.connect(env.getOrigin("reset"), agent.getTermination("save_action"))

    net.connect(agent.getOrigin("action_output"), env.getTermination("action"))
    net.connect(agent.getOrigin("Qs"), env.getTermination("Qs"))

    # net.add_to_nengo()
    # view = timeview.View(net.network, update_frequency=5)
    # view.add_watch(gridworldwatch.GridWorldWatch())
    # view.restore()

    net.network.simulator.run(0, 1000, 0.001)

    print "latencies"
    print len(env.latencies)
    print env.latencies
Example #16
    def saveParams(self, prefix):
        # save connection weights
        if self.neuron_learning:
            self.getNode("actionvals").saveWeights(prefix)
            self.getNode("old_actionvals").saveWeights(prefix)
        else:
            dec = self.getNode("state_pop").getOrigin("vals").getDecoders()
            with open(HRLutils.datafile(prefix + "_state_decoders.txt"), "w") as f:
                f.write("\n".join([" ".join([str(x) for x in d]) for d in dec]))

            dec = self.getNode("old_state_pop").getOrigin("vals").getDecoders()
            with open(HRLutils.datafile(prefix + "_old_state_decoders.txt"), "w") as f:
                f.write("\n".join([" ".join([str(x) for x in d]) for d in dec]))

        # save state encoders
        enc = self.getNode("state_pop").getEncoders()
        with open(HRLutils.datafile(prefix + "_state_encoders.txt"), "w") as f:
            f.write("\n".join([" ".join([str(x) for x in e]) for e in enc]))
Example #17
    def loadParams(self, prefix):
        print "loading params: %s" % prefix

        # load connection weights
        if self.neuron_learning:
            self.getNode("actionvals").loadWeights(prefix)
            self.getNode("old_actionvals").loadWeights(prefix)
        else:
            with open(HRLutils.datafile(prefix + "_state_decoders.txt")) as f:
                self.getNode("state_pop").getOrigin("vals").setDecoders(
                            [[float(x) for x in d.split(" ")] for d in f.readlines()])

            with open(HRLutils.datafile(prefix + "_old_state_decoders.txt")) as f:
                self.getNode("old_state_pop").getOrigin("vals").setDecoders(
                            [[float(x) for x in d.split(" ")] for d in f.readlines()])

        # load state encoders
        with open(HRLutils.datafile(prefix + "_state_encoders.txt")) as f:
            enc = [[float(x) for x in e.split(" ")] for e in f.readlines()]
        self.getNode("state_pop").setEncoders(enc)
        self.getNode("old_state_pop").setEncoders(enc) #note we assume that state_pop and old_state_pop use the same encoders
Example #18
def test_bmp():
    from javax.imageio import ImageIO
    from java.io import File

    img = ImageIO.read(File(HRLutils.datafile("contextmap.bmp")))

    colours = [int(val) for val in
               img.getRGB(0, 0, img.getWidth(), img.getHeight(), None, 0,
                          img.getWidth())]
    unique_colours = []
    for c in colours:
        if c not in unique_colours:
            unique_colours += [c]

    print unique_colours
Example #19
    def saveParams(self, prefix):
        # save connection weights
        if self.neuron_learning:
            self.getNode("actionvals").saveWeights(prefix)
            self.getNode("old_actionvals").saveWeights(prefix)
        else:
            dec = self.getNode("state_pop").getOrigin("vals").getDecoders()
            with open(HRLutils.datafile(prefix + "_state_decoders.txt"),
                      "w") as f:
                f.write("\n".join([" ".join([str(x) for x in d])
                                   for d in dec]))

            dec = self.getNode("old_state_pop").getOrigin("vals").getDecoders()
            with open(HRLutils.datafile(prefix + "_old_state_decoders.txt"),
                      "w") as f:
                f.write("\n".join([" ".join([str(x) for x in d])
                                   for d in dec]))

        # save state encoders
        enc = self.getNode("state_pop").getEncoders()
        with open(HRLutils.datafile(prefix + "_state_encoders.txt"), "w") as f:
            f.write("\n".join([" ".join([str(x) for x in e]) for e in enc]))
Example #20
def test_actionvalues():
    net = nef.Network("testActionValues")

    stateN = 200
    N = 100
    stateD = 2
    stateradius = 1.0
    statelength = math.sqrt(2 * stateradius**2)
    init_Qs = 0.5
    learningrate = 0.0
    Qradius = 1
    tauPSC = 0.007
    actions = [("up", [0, 1]), ("right", [1, 0]), ("down", [0, -1]),
               ("left", [-1, 0])]

    # state
    state_pop = net.make(
        "state_pop",
        stateN,
        stateD,
        radius=statelength,
        node_factory=HRLutils.node_fac(),
        eval_points=[[x / statelength, y / statelength]
                     for x in range(-int(stateradius), int(stateradius))
                     for y in range(-int(stateradius), int(stateradius))])
    state_pop.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])
    state_pop.addDecodedTermination("state_input", MU.I(stateD), tauPSC, False)

    # set up action nodes
    decoders = state_pop.addDecodedOrigin("init_decoders",
                                          [ConstantFunction(stateD, init_Qs)],
                                          "AXON").getDecoders()

    actionvals = actionvalues.ActionValues("testActionValues",
                                           N,
                                           stateN,
                                           actions,
                                           learningrate,
                                           Qradius=Qradius,
                                           init_decoders=decoders)
    net.add(actionvals)

    net.connect(state_pop.getOrigin("AXON"),
                actionvals.getTermination("state"))

    # input
    inp = net.make_input("input", [0, 0])
    net.connect(inp, state_pop.getTermination("state_input"))

    net.add_to_nengo()
    net.view()
Example #21
    def __init__(self, actions, mapname, colormap, name="PlaceCellEnvironment",
                 imgsize=(1.0, 1.0), dx=0.01, placedev=0.1, num_places=None):
        """Initialize environment variables.

        :param actions: actions available to the system
            :type actions: list of tuples (action_name,action_vector)
        :param mapname: name of file describing environment map
        :param colormap: dict mapping pixel colours to labels
        :param name: name for environment
        :param imgsize: (width, height) of the space represented by the map
            image
        :param dx: distance agent moves each timestep
        :param placedev: standard deviation of gaussian place cell activations
        :param num_places: number of placecells to use (if None it will attempt
            to fill the space)
        """

        EnvironmentTemplate.__init__(self, name, 2, actions)

        # parameters
        self.colormap = colormap
        self.rewardamount = 0  # number of timesteps spent in reward

        # number of timesteps to spend in reward before agent is reset
        # note: convenient to express this as time_in_reward / dt
        self.rewardresetamount = 0.6 / 0.001

        self.num_actions = len(actions)
        self.imgsize = [float(x) for x in imgsize]
        self.dx = dx
        self.placedev = placedev
        self.num_places = num_places
        self.optimal_move = None
        self.defaultreward = -0.075

        # load environment
        self.map = ImageIO.read(File(HRLutils.datafile(mapname)))

        # generate place cells
        self.gen_placecells(min_spread=1.0 * placedev)

        # initial conditions
        self.state = self.random_location(avoid=["wall", "target"])
        self.place_activations = [0 for _ in self.placecells]

        self.create_origin("place", lambda: self.place_activations)

        # note: making the value small, so that the noise node will give us
        # some random exploration as well
        self.create_origin("optimal_move",
                           lambda: [0.1 if self.optimal_move == a[0] else 0.0
                                    for a in self.actions])
Example #22
    def loadParams(self, prefix):
        print "loading params: %s" % prefix

        # load connection weights
        if self.neuron_learning:
            self.getNode("actionvals").loadWeights(prefix)
            self.getNode("old_actionvals").loadWeights(prefix)
        else:
            with open(HRLutils.datafile(prefix + "_state_decoders.txt")) as f:
                self.getNode("state_pop").getOrigin("vals").setDecoders(
                    [[float(x) for x in d.split(" ")] for d in f.readlines()])

            with open(HRLutils.datafile(prefix +
                                        "_old_state_decoders.txt")) as f:
                self.getNode("old_state_pop").getOrigin("vals").setDecoders(
                    [[float(x) for x in d.split(" ")] for d in f.readlines()])

        # load state encoders
        with open(HRLutils.datafile(prefix + "_state_encoders.txt")) as f:
            enc = [[float(x) for x in e.split(" ")] for e in f.readlines()]
        self.getNode("state_pop").setEncoders(enc)
        # note we assume that state_pop and old_state_pop use the same encoders
        self.getNode("old_state_pop").setEncoders(enc)
Example #23
    def saveWeights(self, prefix):
        """Save the connection weights to file."""

        prefix = prefix + "_" + self.name
        for n in self.getNodes():
            if n.getName().startswith("action"):
                term = n.getTermination("learning")
                weights = [t.getWeights() for t in term.getNodeTerminations()]

                f = open(HRLutils.datafile(prefix + "_" + n.getName() + ".txt"), "w")
                f.write(str(HRLutils.SEED) + "\n")
                for row in weights:
                    f.write(" ".join([str(x) for x in row]) + "\n")
                f.close()
Example #24
def run_gridworld(args, seed=None):

    if seed is not None:
        HRLutils.set_seed(seed)
    seed = HRLutils.SEED

    net = nef.Network("run_gridworld")

    stateN = 400
    stateD = 2
    actions = [("up", [0, 1]), ("right", [1, 0]),
               ("down", [0, -1]), ("left", [-1, 0])]

    agent = smdpagent.SMDPAgent(stateN, stateD, actions, stateradius=3,
                                **args)
    net.add(agent)

    env = gridworldenvironment.GridWorldEnvironment(
        stateD, actions, HRLutils.datafile("smallgrid.txt"), cartesian=True,
        delay=(0.6, 0.9), datacollection=False)
    net.add(env)

    net.connect(env.getOrigin("state"), agent.getTermination("state_input"))
    net.connect(env.getOrigin("reward"), agent.getTermination("reward"))
    net.connect(env.getOrigin("reset"), agent.getTermination("reset"))
    net.connect(env.getOrigin("learn"), agent.getTermination("learn"))
    net.connect(env.getOrigin("reset"), agent.getTermination("save_state"))
    net.connect(env.getOrigin("reset"), agent.getTermination("save_action"))

    net.connect(agent.getOrigin("action_output"), env.getTermination("action"))
    net.connect(agent.getOrigin("Qs"), env.getTermination("Qs"))

    net.add_to_nengo()
    view = timeview.View(net.network, update_frequency=5)
    view.add_watch(gridworldwatch.GridWorldWatch())
    view.restore()
Example #25
    def saveWeights(self, prefix):
        """Save the connection weights to file."""

        prefix = prefix + "_" + self.name
        for n in self.getNodes():
            if n.getName().startswith("action"):
                term = n.getTermination("learning")
                weights = [t.getWeights() for t in term.getNodeTerminations()]

                f = open(
                    HRLutils.datafile(prefix + "_" + n.getName() + ".txt"),
                    "w")
                f.write(str(HRLutils.SEED) + "\n")
                for row in weights:
                    f.write(" ".join([str(x) for x in row]) + "\n")
                f.close()
Example #26
def test_bmp():
    from javax.imageio import ImageIO
    from java.io import File

    img = ImageIO.read(File(HRLutils.datafile("contextmap.bmp")))

    colours = [
        int(val)
        for val in img.getRGB(0, 0, img.getWidth(), img.getHeight(), None, 0,
                              img.getWidth())
    ]
    unique_colours = []
    for c in colours:
        if c not in unique_colours:
            unique_colours += [c]

    print unique_colours
Example #27
def test_actionvalues():
    net = nef.Network("testActionValues")

    stateN = 200
    N = 100
    stateD = 2
    stateradius = 1.0
    statelength = math.sqrt(2 * stateradius ** 2)
    init_Qs = 0.5
    learningrate = 0.0
    Qradius = 1
    tauPSC = 0.007
    actions = [("up", [0, 1]), ("right", [1, 0]), ("down", [0, -1]), ("left", [-1, 0])]

    # state
    state_pop = net.make(
        "state_pop",
        stateN,
        stateD,
        radius=statelength,
        node_factory=HRLutils.node_fac(),
        eval_points=[
            [x / statelength, y / statelength]
            for x in range(-int(stateradius), int(stateradius))
            for y in range(-int(stateradius), int(stateradius))
        ],
    )
    state_pop.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])
    state_pop.addDecodedTermination("state_input", MU.I(stateD), tauPSC, False)

    # set up action nodes
    decoders = state_pop.addDecodedOrigin("init_decoders", [ConstantFunction(stateD, init_Qs)], "AXON").getDecoders()

    actionvals = actionvalues.ActionValues(
        "testActionValues", N, stateN, actions, learningrate, Qradius=Qradius, init_decoders=decoders
    )
    net.add(actionvals)

    net.connect(state_pop.getOrigin("AXON"), actionvals.getTermination("state"))

    # input
    inp = net.make_input("input", [0, 0])
    net.connect(inp, state_pop.getTermination("state_input"))

    net.add_to_nengo()
    net.view()
Example #28
    def loadWeights(self, prefix):
        """Load the connection weights from file."""

        prefix = prefix + "_" + self.name
        for n in self.getNodes():
            if n.getName().startswith("action"):
                f = open(HRLutils.datafile(prefix + "_" + n.getName() + ".txt"), "r")
                seed = int(f.readline())
                if seed != HRLutils.SEED:
                    print "Warning, loading weights with a seed (" + seed + ") that doesn't match current (" + HRLutils.SEED + ")"
                weights = []
                for line in f:
                    weights += [[float(x) for x in line.split()]]
                f.close()

                term = n.getTermination("learning")
                for i, t in enumerate(term.getNodeTerminations()):
                    t.setWeights(weights[i], True)
Example #29
def test_placecell_bmp():
    net = nef.Network("TestPlacecellBmp")

    actions = [("up", [0, 1]), ("right", [1, 0]), ("down", [0, -1]), ("left", [-1, 0])]

    env = placecell_bmp.PlaceCellEnvironment(
        actions,
        HRLutils.datafile("contextmap.bmp"),
        colormap={-16777216: "wall", -1: "floor", -256: "target", -2088896: "b"},
        imgsize=(5, 5),
        dx=0.001,
        placedev=0.5,
    )
    net.add(env)

    print "generated", len(env.placecells), "placecells"

    net.add_to_nengo()
    net.view()
Example #30
    def tick(self):
        # check if env is currently giving reward (we want to give
        # pseudoreward at the same time)
        if self.env.reward != 0:
            if self.target_answer is None:
                self.reward = 0
            else:
                # check if the selected action matches the correct action
                self.reward = (self.rewardval if
                               HRLutils.similarity(self.target_answer,
                                                   self.action) > 0.5
                               else -self.rewardval)
        else:
            self.reward = 0

            # update the target_answer (the action the low level should be
            # selecting given the current context)
            if self.context[0] == "orientation":
                self.target_answer = self.env.state[:self.env.num_orientations]
            elif self.context[0] == "shape":
                self.target_answer = self.env.state[
                    self.env.num_orientations:-self.env.num_colours]
            else:
                self.target_answer = None
Example #31
def test_terminationnode():
    net = nef.Network("testTerminationNode")

    actions = [("up", [0, 1]), ("right", [1, 0]), ("down", [0, -1]),
               ("left", [-1, 0])]
    env = deliveryenvironment.DeliveryEnvironment(
        actions,
        HRLutils.datafile("contextmap.bmp"),
        colormap={
            -16777216: "wall",
            -1: "floor",
            -256: "a",
            -2088896: "b"
        },
        imgsize=(5, 5),
        dx=0.001,
        placedev=0.5)
    net.add(env)

    term_node = terminationnode.TerminationNode(
        {
            "a": [0, 1],
            "b": [1, 0],
            terminationnode.Timer((30, 30)): None
        },
        env,
        contextD=2,
        rewardval=1)
    net.add(term_node)

    print term_node.conds

    context_input = net.make_input("contextinput", {
        0.0: [0, 0.1],
        0.5: [1, 0],
        1.0: [0, 1]
    })
    net.connect(context_input, term_node.getTermination("context"))

    net.add_to_nengo()
    net.view()
Example #32
    def loadWeights(self, prefix):
        """Load the connection weights from file."""

        prefix = prefix + "_" + self.name
        for n in self.getNodes():
            if n.getName().startswith("action"):
                f = open(
                    HRLutils.datafile(prefix + "_" + n.getName() + ".txt"),
                    "r")
                seed = int(f.readline())
                if seed != HRLutils.SEED:
                    print("Warning, loading weights with a seed (" + seed +
                          ") that doesn't match current (" + HRLutils.SEED +
                          ")")
                weights = []
                for line in f:
                    weights += [[float(x) for x in line.split()]]
                f.close()

                term = n.getTermination("learning")
                for i, t in enumerate(term.getNodeTerminations()):
                    t.setWeights(weights[i], True)
Example #33
    def calc_optimal_move(self):
        """Calculates the optimal move for the agent to make in the current
        state.

        Used for debugging.
        """

        # grid search the image with the given stepsize
        stepsize = 0.1
        self.optimal_move = None
        for y in [
                v * stepsize for v in range(
                    int(-self.imgsize[1] / (2 * stepsize)) + 1,
                    int(self.imgsize[1] / (2 * stepsize)) - 1)
        ]:
            for x in [
                    v * stepsize for v in range(
                        int(-self.imgsize[0] / (2 * stepsize)) + 1,
                        int(self.imgsize[0] / (2 * stepsize)) - 1)
            ]:
                # if the pt you're looking at is in the region you're
                # looking for
                if self.is_in((x, y), "target"):
                    # generate a target point in the direction from current
                    # location to target
                    angle = math.atan2(y - self.state[1], x - self.state[0])
                    pt = (math.cos(angle), math.sin(angle))

                    # pick the action that is closest to the target point
                    # note: penalize actions that would involve moving through
                    # a wall
                    self.optimal_move = max(
                        self.actions,
                        key=lambda x: -1
                        if self.is_in((x[1][0] * self.dx + self.state[0], x[1][
                            1] * self.dx + self.state[1]), "wall"
                                      ) else HRLutils.similarity(x[1], pt))[0]
                    return
Example #34
def test_placecell_bmp():
    net = nef.Network("TestPlacecellBmp")

    actions = [("up", [0, 1]), ("right", [1, 0]), ("down", [0, -1]),
               ("left", [-1, 0])]

    env = placecell_bmp.PlaceCellEnvironment(
        actions,
        HRLutils.datafile("contextmap.bmp"),
        colormap={
            -16777216: "wall",
            -1: "floor",
            -256: "target",
            -2088896: "b"
        },
        imgsize=(5, 5),
        dx=0.001,
        placedev=0.5)
    net.add(env)

    print "generated", len(env.placecells), "placecells"

    net.add_to_nengo()
    net.view()
Example #35
    def __init__(self, actions, Qradius=1, noiselevel=0.03):
        """Builds the BGNetwork.

        :param actions: actions available to the system
            :type actions: list of tuples (action_name,action_vector)
        :param Qradius: expected radius of Q values
        :param noiselevel: standard deviation of noise added to Q values for
            exploration
        """

        self.name = "BGNetwork"
        net = nef.Network(self, seed=HRLutils.SEED, quick=False)

        self.N = 50
        self.d = len(actions)
        self.mut_inhib = 1.0  # mutual inhibition between actions
        self.tauPSC = 0.007

        # make basal ganglia
        netbg = nef.Network("bg")

        bginput = netbg.make("bginput", 1, self.d, mode="direct")
        bginput.fixMode()
        bginput.addDecodedTermination("input",
                                      MU.diag([1.0 / Qradius for _ in
                                               range(self.d)]), 0.001, False)
        # divide by Q radius to get values back into 0 -- 1 range

        bgoutput = netbg.make("bgoutput", 1, self.d, mode="direct")
        bgoutput.fixMode()

        basalganglia.make_basal_ganglia(netbg, bginput, bgoutput,
                                        dimensions=self.d, neurons=200)
        bg = netbg.network
        net.add(bg)
        bg.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

        bg.exposeTermination(bginput.getTermination("input"), "input")
        bg.exposeOrigin(bgoutput.getOrigin("X"), "X")

        # insert noise (used to give some randomness to drive exploration)
        noiselevel = net.make_input("noiselevel", [noiselevel])

        noise = noisenode.NoiseNode(1, dimension=len(actions))
        net.add(noise)

        net.connect(noiselevel, noise.getTermination("scale"))
        net.connect(noise.getOrigin("noise"), "bg.bginput", pstc=0.001)

        # add bias to shift everything up to 0.5--1.5
        biasinput = net.make_input("biasinput", [0.5])
        net.connect(biasinput, "bg.bginput",
                    transform=[[1] for _ in range(self.d)], pstc=0.001)

        # invert BG output (so the "selected" action will have a positive value
        # and the rest zero)
        invert = thalamus.make(net, name="invert", neurons=self.N,
                               dimensions=self.d, useQuick=False)
        invert.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])
        net.connect(bg, invert.getTermination("bg_input"))

        # add mutual inhibition
        net.connect(invert.getOrigin("xBiased"), invert, pstc=self.tauPSC,
                    transform=[[0 if i == j else -self.mut_inhib
                                for j in range(self.d)]
                               for i in range(self.d)])

        # threshold output values so that you get a nice clean 0 for
        # non-selected and 1 for selected
        threshf = HRLutils.node_fac()
        threshold = 0.1
        threshf.setIntercept(IndicatorPDF(threshold, 1.0))
        val_threshold = net.make_array("val_threshold", self.N * 2, self.d,
                                       node_factory=threshf, encoders=[[1]])
        val_threshold.addDecodedOrigin(
            "output",
            [PiecewiseConstantFunction([threshold], [0, 1])
             for _ in range(self.d)], "AXON", True)

        net.connect(invert.getOrigin("xBiased"), val_threshold,
                    pstc=self.tauPSC)

        # output action (action vectors weighted by BG output)
        weight_actions = net.make_array("weight_actions", 50,
                                        len(actions[0][1]), intercept=(0, 1))
        net.connect(val_threshold.getOrigin("output"), weight_actions,
                    transform=MU.transpose([actions[i][1]
                                            for i in range(self.d)]),
                    pstc=0.007)

        # save the BG output (selected action and selected action value)
        save_relay = net.make("save_relay", 1, 1, mode="direct")
        save_relay.fixMode()
        save_relay.addDecodedTermination("input", [[1]], 0.001, False)

        saved_action = memory.Memory("saved_action", self.N * 2,
                                     len(actions[0][1]), inputscale=75)
        net.add(saved_action)
        net.connect(weight_actions, saved_action.getTermination("target"))
        net.connect(save_relay, saved_action.getTermination("transfer"))

        saved_vals = memory.Memory("saved_values", self.N * 2, self.d,
                                   inputscale=75)
        net.add(saved_vals)
        net.connect(val_threshold.getOrigin("output"),
                    saved_vals.getTermination("target"))
        net.connect(save_relay, saved_vals.getTermination("transfer"))

        # put the saved values through a threshold (we want a nice clean
        # zero for non-selected values)
        nfac = HRLutils.node_fac()
        nfac.setIntercept(IndicatorPDF(0.2, 1))
        saved_vals_threshold = net.make_array("saved_vals_threshold", self.N,
                                              self.d, node_factory=nfac,
                                              encoders=[[1]])
        saved_vals_threshold.addDecodedOrigin(
            "output", [PiecewiseConstantFunction([0.3], [0, 1])
                       for _ in range(self.d)], "AXON", True)

        net.connect(saved_vals, saved_vals_threshold, pstc=self.tauPSC)

        self.exposeTermination(bg.getTermination("input"), "input")
        self.exposeTermination(save_relay.getTermination("input"),
                               "save_output")
        self.exposeOrigin(val_threshold.getOrigin("output"), "curr_vals")
        self.exposeOrigin(weight_actions.getOrigin("X"), "curr_action")
        self.exposeOrigin(saved_vals_threshold.getOrigin("output"),
                          "saved_vals")
        self.exposeOrigin(saved_action.getOrigin("X"), "saved_action")
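
The input scaling in the network above maps Q values into the range the basal
ganglia expects: divide by Qradius, add the 0.5 bias, and mix in exploratory
noise. A plain arithmetic sketch (illustrative only; random.gauss is just a
stand-in for the NoiseNode output):

import random

def bg_input(q_values, q_radius, noiselevel):
    return [q / q_radius + 0.5 + random.gauss(0, noiselevel)
            for q in q_values]

print bg_input([0.2, 0.8, 0.5, 0.1], 1.0, 0.03)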
Example #36
    def __init__(self, gamma, rewardradius=1.0):
        """Builds the ErrorCalc network.

        :param gamma: discount factor
        :param rewardradius: expected radius of reward values
        """

        self.name = "ErrorCalc"
        tauPSC = 0.007
        intPSC = 0.1
        N = 50

        ef = HRLutils.defaultEnsembleFactory()

        # current Q input
        currQ = ef.make("currQ", 1, 1)
        currQ.addDecodedTermination("input", [[1]], 0.001, False)
        self.addNode(currQ)
        currQ.setMode(SimulationMode.DIRECT)
        currQ.fixMode()
        self.exposeTermination(currQ.getTermination("input"), "currQ")

        # input population for resetting the network
        resetef = HRLutils.defaultEnsembleFactory()
        resetef.setEncoderFactory(vectorgenerators.DirectedVectorGenerator([1]))
        resetef.getNodeFactory().setIntercept(IndicatorPDF(0.3, 1.0))
        reset = resetef.make("reset", N, 1)
        reset.addDecodedTermination("input", [[1]], tauPSC, False)
        self.addNode(reset)
        reset.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])
        self.exposeTermination(reset.getTermination("input"), "reset")

        # store previous value of Q
        storeQ = memory.Memory("storeQ", N * 4, 1, inputscale=50)
        self.addNode(storeQ)
        self.addProjection(reset.getOrigin("X"), storeQ.getTermination("transfer"))
        self.addProjection(currQ.getOrigin("X"), storeQ.getTermination("target"))

        # calculate discount
        biasInput = FunctionInput("biasinput", [ConstantFunction(1, 1)], Units.UNK)
        self.addNode(biasInput)

        discount = memory.Memory("discount", N * 4, 1, inputscale=50, recurweight=gamma)
        self.addNode(discount)
        self.addProjection(biasInput.getOrigin("origin"), discount.getTermination("target"))
        self.addProjection(reset.getOrigin("X"), discount.getTermination("transfer"))

        # accumulate discounted reward
        # do we really need gamma to make this all work? if it proves to be a
        # problem, could try removing it and just use un-discounted reward; we
        # can just use the fact that the reward integrator will saturate to
        # prevent rewards from going to infinity
        discountreward = eprod.Eprod("discountreward", N * 4, 1, weights=[[[1.0 / rewardradius]], [[1.0]]], oneDinput=True)
        self.addNode(discountreward)
        self.exposeTermination(discountreward.getTermination("A"), "reward")
        self.addProjection(discount.getOrigin("X"), discountreward.getTermination("B"))

        reward = ef.make("reward", N * 4, 1)
        reward.addDecodedTermination("input", [[intPSC]], intPSC, False)
        reward.addDecodedTermination("feedback", [[1]], intPSC, False)
        reward.addTermination("gate", [[-8] for _ in range(reward.getNodeCount())], intPSC, False)
        self.addNode(reward)
        reward.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])
        self.addProjection(reward.getOrigin("X"), reward.getTermination("feedback"))
        self.addProjection(discountreward.getOrigin("X"), reward.getTermination("input"))
        self.addProjection(reset.getOrigin("X"), reward.getTermination("gate"))

        # weight currQ by discount
        discountcurrQ = eprod.Eprod("discountcurrQ", N * 4, 1, oneDinput=True)
        self.addNode(discountcurrQ)
        self.addProjection(currQ.getOrigin("X"), discountcurrQ.getTermination("A"))
        self.addProjection(discount.getOrigin("X"), discountcurrQ.getTermination("B"))

        # error calculation
        # radius of 2 since max error = maxQ + maxreward - 0 (unless we let Q
        # values go negative)
        error = ef.make("error", N * 2, [2])
        error.addDecodedTermination("currQ", [[1]], tauPSC, False)
        error.addDecodedTermination("reward", [[1]], tauPSC, False)
        error.addDecodedTermination("storeQ", [[-1]], tauPSC, False)
        self.addNode(error)
        self.addProjection(discountcurrQ.getOrigin("X"), error.getTermination("currQ"))
        self.addProjection(reward.getOrigin("X"), error.getTermination("reward"))
        self.addProjection(storeQ.getOrigin("X"), error.getTermination("storeQ"))
        self.exposeOrigin(error.getOrigin("X"), "X")
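
Tracing the projections above, the exposed error signal is approximately
discount * currQ + accumulated (discounted) reward - storedQ. A plain
arithmetic sketch of that combination (names are illustrative only):

def td_style_error(curr_q, stored_q, accumulated_reward, discount):
    return discount * curr_q + accumulated_reward - stored_q

print td_style_error(0.8, 0.5, 0.1, 0.95)  # ~0.36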
Example #37
def run_badreenvironment(nav_args, ctrl_args, bias=0.0, seed=None, flat=False,
                         label="tmp"):
    """Runs the model on the Badre et al. (2010) task."""

    if seed is not None:
        HRLutils.set_seed(seed)
    seed = HRLutils.SEED

    net = nef.Network("run_badreenvironment")

    env = badreenvironment.BadreEnvironment(flat=flat)
    net.add(env)

    # ##NAV AGENT
    stateN = 500
    max_state_input = 3
    enc = env.gen_encoders(stateN, 0, 0.0)

    # generate evaluation points
    orientations = MU.I(env.num_orientations)
    shapes = MU.I(env.num_shapes)
    colours = MU.I(env.num_colours)
    evals = (list(MU.diag([3 for _ in range(env.stateD)])) +
             [o + s + c
              for o in orientations for s in shapes for c in colours])

    # create lower level
    nav_agent = smdpagent.SMDPAgent(stateN, env.stateD, env.actions,
                                    name="NavAgent",
                                    stateradius=max_state_input,
                                    state_encoders=enc, state_evals=evals,
                                    discount=0.5, **nav_args)
    net.add(nav_agent)

    print "agent neurons:", nav_agent.countNeurons()

    # actions terminate on fixed schedule (aligned with environment)
    nav_term_node = terminationnode.TerminationNode(
        {terminationnode.Timer((0.6, 0.6)): None}, env, name="NavTermNode",
        state_delay=0.1, reset_delay=0.05, reset_interval=0.1)
    net.add(nav_term_node)

    net.connect(nav_term_node.getOrigin("reset"),
                nav_agent.getTermination("reset"))
    net.connect(nav_term_node.getOrigin("learn"),
                nav_agent.getTermination("learn"))
    net.connect(nav_term_node.getOrigin("reset"),
                nav_agent.getTermination("save_state"))
    net.connect(nav_term_node.getOrigin("reset"),
                nav_agent.getTermination("save_action"))

    net.connect(nav_agent.getOrigin("action_output"),
                env.getTermination("action"))

    # ##CTRL AGENT
    stateN = 500
    enc = RandomHypersphereVG().genVectors(stateN, env.stateD)
    actions = [("shape", [0, 1]), ("orientation", [1, 0]), ("null", [0, 0])]
    ctrl_agent = smdpagent.SMDPAgent(stateN, env.stateD, actions,
                                     name="CtrlAgent", state_encoders=enc,
                                     stateradius=max_state_input,
                                     state_evals=evals, discount=0.4,
                                     **ctrl_args)
    net.add(ctrl_agent)

    print "agent neurons:", ctrl_agent.countNeurons()

    net.connect(env.getOrigin("state"),
                ctrl_agent.getTermination("state_input"))

    ctrl_term_node = terminationnode.TerminationNode(
        {terminationnode.Timer((0.6, 0.6)): None}, env, name="CtrlTermNode",
        state_delay=0.1, reset_delay=0.05, reset_interval=0.1)
    net.add(ctrl_term_node)

    net.connect(ctrl_term_node.getOrigin("reset"),
                ctrl_agent.getTermination("reset"))
    net.connect(ctrl_term_node.getOrigin("learn"),
                ctrl_agent.getTermination("learn"))
    net.connect(ctrl_term_node.getOrigin("reset"),
                ctrl_agent.getTermination("save_state"))
    net.connect(ctrl_term_node.getOrigin("reset"),
                ctrl_agent.getTermination("save_action"))

    # ctrl gets a slight bonus if it selects a rule (as opposed to null), to
    # encourage it to not just pick null all the time
    reward_relay = net.make("reward_relay", 1, 3, mode="direct")
    reward_relay.fixMode()
    net.connect(env.getOrigin("reward"), reward_relay,
                transform=[[1], [0], [0]])
    net.connect(ctrl_agent.getOrigin("action_output"), reward_relay,
                transform=[[0, 0], [1, 0], [0, 1]])

    net.connect(reward_relay, ctrl_agent.getTermination("reward"),
                func=lambda x: ((x[0] + bias * abs(x[0]))
                                if x[1] + x[2] > 0.5 else x[0]),
                origin_name="ctrl_reward")
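    # e.g. with bias=0.2 and an environment reward of 1.0: selecting "shape" or
    # "orientation" (x[1] + x[2] > 0.5) yields 1.0 + 0.2 * 1.0 = 1.2, while
    # selecting "null" yields just 1.0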

    # ideal reward function (for testing)
#     def ctrl_reward_func(x):
#         if abs(x[0]) < 0.5:
#             return 0.0
#
#         if flat:
#             return 1.5 if x[1] + x[2] < 0.5 else -1.5
#         else:
#             if x[1] + x[2] < 0.5:
#                 return -1.5
#             if [round(a) for a in env.state[-2:]] == [round(b)
#                                                       for b in x[1:]]:
#                 return 1.5
#             else:
#                 return -1.5
#     net.connect(reward_relay, ctrl_agent.getTermination("reward"),
#                 func=ctrl_reward_func)

    # nav rewarded for picking ctrl target
    def nav_reward_func(x):
        if abs(x[0]) < 0.5 or env.action is None:
            return 0.0

        if x[1] + x[2] < 0.5:
            return x[0]

        if x[1] > x[2]:
            return (1.5 if env.action[1] == env.state[:env.num_orientations]
                    else -1.5)
        else:
            return (1.5 if env.action[1] == env.state[env.num_orientations:
                                                      - env.num_colours]
                    else -1.5)
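    # (x[1] > x[2] indicates the ctrl agent selected the "orientation" rule, so
    # the nav agent earns 1.5 only if its action matches the orientation
    # segment of the state; otherwise the comparison is against the shape
    # segment.  this assumes env.state concatenates orientation, shape, and
    # colour blocks in that order)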
    net.connect(reward_relay, nav_agent.getTermination("reward"),
                func=nav_reward_func)

    # state for navagent controlled by ctrlagent
    ctrl_state_inhib = net.make_array("ctrl_state_inhib", 50, env.stateD,
                                      radius=2, mode=HRLutils.SIMULATION_MODE)
    ctrl_state_inhib.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

    inhib_matrix = ([[0, -5]] * 50 * env.num_orientations +
                    [[-5, 0]] * 50 * env.num_shapes +
                    [[-5, -5]] * 50 * env.num_colours)
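    # (each state dimension gets 50 neurons, and each of those neurons receives
    # a 2-D inhibitory connection from the ctrl action vector: orientation
    # dimensions are suppressed when "shape" ([0, 1]) is selected, shape
    # dimensions when "orientation" ([1, 0]) is selected, and colour dimensions
    # in either case)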

    # ctrl output inhibits all the non-selected aspects of the state
    net.connect(env.getOrigin("state"), ctrl_state_inhib)
    net.connect(ctrl_agent.getOrigin("action_output"), ctrl_state_inhib,
                transform=inhib_matrix)

    # also give a boost to the selected aspects (so that neurons are roughly
    # equally activated).
    def boost_func(x):
        if x[0] > 0.5:
            return [3 * v for v in x[1:]]
        else:
            return x[1:]
    boost = net.make("boost", 1, 1 + env.stateD, mode="direct")
    boost.fixMode()
    net.connect(ctrl_state_inhib, boost,
                transform=([[0 for _ in range(env.stateD)]] +
                           list(MU.I(env.stateD))))
    net.connect(ctrl_agent.getOrigin("action_output"), boost,
                transform=[[1, 1]] + [[0, 0] for _ in range(env.stateD)])

    net.connect(boost, nav_agent.getTermination("state_input"),
                func=boost_func)
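    # (dimension 0 of boost is the sum of the ctrl action vector, so it is ~1
    # when a rule is selected and ~0 for "null"; in the former case boost_func
    # multiplies the remaining, un-inhibited state dimensions by 3 on their way
    # into the nav agent)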

    # save weights
    # period to save weights (realtime, not simulation time)
    weight_save = 1.0
    threads = [
        HRLutils.WeightSaveThread(nav_agent.getNode("QNetwork").saveParams,
                                  os.path.join("weights", "%s_%s" %
                                               (nav_agent.name, seed)),
                                  weight_save),
        HRLutils.WeightSaveThread(ctrl_agent.getNode("QNetwork").saveParams,
                                  os.path.join("weights", "%s_%s" %
                                               (ctrl_agent.name, seed)),
                                  weight_save)]
    for t in threads:
        t.start()

    # data collection node
    data = datanode.DataNode(period=1,
                             filename=HRLutils.datafile("dataoutput_%s.txt" %
                                                        label),
                             header="%s %s %s %s %s" % (nav_args, ctrl_args,
                                                        bias, seed, flat))
    print "saving data to", data.filename
    print "header", data.header
    net.add(data)
    nav_q = nav_agent.getNode("QNetwork")
    ctrl_q = ctrl_agent.getNode("QNetwork")
    ctrl_bg = ctrl_agent.getNode("BGNetwork").getNode("weight_actions")
    data.record_avg(env.getOrigin("reward"))
    data.record_avg(ctrl_q.getNode("actionvals").getOrigin("X"))
    data.record_sparsity(ctrl_q.getNode("state_pop").getOrigin("AXON"))
    data.record_sparsity(nav_q.getNode("state_pop").getOrigin("AXON"))
    data.record_avg(ctrl_q.getNode("valdiff").getOrigin("X"))
    data.record_avg(ctrl_agent.getNode("ErrorNetwork").getOrigin("error"))
    data.record_avg(ctrl_bg.getNode("0").getOrigin("AXON"))
    data.record_avg(ctrl_bg.getNode("1").getOrigin("AXON"))
    data.record(env.getOrigin("score"))

#     net.add_to_nengo()
#     net.network.simulator.run(0, 300, 0.001)
    net.view()

    for t in threads:
        t.stop()
Example #38
0
def run_contextenvironment(args, seed=None):
    """Runs the model on the context task.

    :param args: kwargs for the agent
    :param seed: random seed
    """

    if seed is not None:
        HRLutils.set_seed(seed)
    seed = HRLutils.SEED

    net = nef.Network("runContextEnvironment")

    if "load_weights" in args and args["load_weights"] is not None:
        args["load_weights"] += "_%s" % seed

    stateN = 1200  # number of neurons to use in state population
    contextD = 2  # dimension of context vector
    context_scale = 1.0  # scale of context representation
    max_state_input = 2  # max length of input vector for state population
    # actions (label and vector) available to the system
    actions = [("up", [0, 1]), ("right", [1, 0]),
               ("down", [0, -1]), ("left", [-1, 0])]

    # context labels and rewards for achieving those context goals
    rewards = {"a": 1.5, "b": 1.5}

    env = contextenvironment.ContextEnvironment(
        actions, HRLutils.datafile("contextmap.bmp"), contextD, rewards,
        colormap={-16777216: "wall", -1: "floor", -256: "a", -2088896: "b"},
        imgsize=(5, 5), dx=0.001, placedev=0.5)
    net.add(env)

    print "generated", len(env.placecells), "placecells"

    # termination node for agent (just goes off on some regular interval)
    term_node = terminationnode.TerminationNode(
        {terminationnode.Timer((0.6, 0.9)): 0.0}, env)
    net.add(term_node)

    # generate encoders and divide by max_state_input (so that all inputs
    # will end up being radius 1)
    enc = env.gen_encoders(stateN, contextD, context_scale)
    enc = MU.prod(enc, 1.0 / max_state_input)

    # load eval points from file
    with open(HRLutils.datafile("contextbmp_evalpoints_%s.txt" % seed)) as f:
        print "loading contextbmp_evalpoints_%s.txt" % seed
        evals = [[float(x) for x in l.split(" ")] for l in f.readlines()]
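    # (each line of the eval points file is expected to hold one evaluation
    # point as a space-separated list of floats)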

    agent = smdpagent.SMDPAgent(stateN, len(env.placecells) + contextD,
                                actions, state_encoders=enc, state_evals=evals,
                                state_threshold=0.8, **args)
    net.add(agent)

    print "agent neurons:", agent.countNeurons()

    # period to save weights (realtime, not simulation time)
    weight_save = 600.0
    t = HRLutils.WeightSaveThread(agent.getNode("QNetwork").saveParams,
                                  os.path.join("weights", "%s_%s" %
                                               (agent.name, seed)),
                                  weight_save)
    t.start()

    # data collection node
    data = datanode.DataNode(period=5,
                             filename=HRLutils.datafile("dataoutput_%s.txt" %
                                                        seed))
    net.add(data)
    q_net = agent.getNode("QNetwork")
    data.record(env.getOrigin("reward"))
    data.record(q_net.getNode("actionvals").getOrigin("X"), func=max)
    data.record(q_net.getNode("actionvals").getOrigin("X"), func=min)
    data.record_sparsity(q_net.getNode("state_pop").getOrigin("AXON"))
    data.record_avg(q_net.getNode("valdiff").getOrigin("X"))
    data.record_avg(env.getOrigin("state"))

    net.connect(env.getOrigin("placewcontext"),
                agent.getTermination("state_input"))
    net.connect(env.getOrigin("reward"), agent.getTermination("reward"))
    net.connect(term_node.getOrigin("reset"), agent.getTermination("reset"))
    net.connect(term_node.getOrigin("learn"), agent.getTermination("learn"))
    net.connect(term_node.getOrigin("reset"),
                agent.getTermination("save_state"))
    net.connect(term_node.getOrigin("reset"),
                agent.getTermination("save_action"))

    net.connect(agent.getOrigin("action_output"), env.getTermination("action"))

#    net.add_to_nengo()
#    net.run(2000)
    net.view()

    t.stop()
Example #39
0
    def __init__(self, discount, rewardradius=1.0, Qradius=1.0):
        """Builds the ErrorCalc2 network.

        :param discount: discount factor, controls rate of integration
        :param rewardradius: expected radius of reward value
        :param Qradius: expected radius of Q values
        """

        self.name = "ErrorCalc"
        net = nef.Network(self, seed=HRLutils.SEED, quick=False)

        tauPSC = 0.007
        intPSC = 0.1
        N = 50

        # relay for current Q input
        currQ = net.make("currQ", 1, 1, node_factory=HRLutils.node_fac(),
                         mode="direct", radius=Qradius)
        currQ.fixMode()
        currQ.addDecodedTermination("input", [[1]], 0.001, False)

        # input population for resetting the network
        reset_nodefac = HRLutils.node_fac()
        reset_nodefac.setIntercept(IndicatorPDF(0.3, 1.0))
        reset = net.make("reset", N, 1, encoders=[[1]],
                         node_factory=reset_nodefac)
        reset.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])
        # this population will begin outputting a value once the reset
        # signal exceeds the threshold, and that output will then be
        # used to reset the rest of the network

        reset.addDecodedTermination("input", [[1]], tauPSC, False)

        # relay for stored previous value of Q
        storeQ = net.make("storeQ", 1, 1, node_factory=HRLutils.node_fac(),
                          mode="direct", radius=Qradius)
        storeQ.fixMode()
        storeQ.addDecodedTermination("input", [[1]], 0.001, False)

        # calculate "discount" by integrating output of storeQ
        acc_storeQ = memory.Memory("acc_storeQ", N * 8, 1, inputscale=50)
        net.add(acc_storeQ)

        zero_input = net.make_input("zero_input", [0])

        net.connect(zero_input, acc_storeQ.getTermination("target"))
        net.connect(reset, acc_storeQ.getTermination("transfer"))

        # threshold storeQ value so it won't go below zero.  that is, if we
        # have negative Q values, we don't want to have a negative discount,
        # or that will just drive the highest (negative) Q value upwards, and
        # it will always be selected.  negative Q values are instead pushed
        # upwards by the PositiveBias mechanism.
        Qthresh = net.make("Qthresh", N * 2, 1, encoders=[[1]],
                           eval_points=[[x * 0.001] for x in range(1000)],
                           radius=Qradius, intercept=(0, 1))
        net.connect(storeQ, Qthresh, pstc=tauPSC)
        net.connect(Qthresh, acc_storeQ, pstc=intPSC,
                    transform=[[discount * intPSC]], func=lambda x: max(x[0],
                                                                        0.0))

        # accumulate reward
        reward = memory.Memory("reward", N * 4, 1, radius=rewardradius,
                               inputscale=50)
        net.add(reward)

        reward.addDecodedTermination("input", [[intPSC]], intPSC, False)

        net.connect(zero_input, reward.getTermination("target"))
        net.connect(reset, reward.getTermination("transfer"))

        # put reward, currQ, storeQ, and discount together to calculate error
        error = net.make("error", N * 2, 1, node_factory=HRLutils.node_fac())

        net.connect(currQ, error, pstc=tauPSC)
        net.connect(reward, error, pstc=tauPSC)
        net.connect(storeQ, error, pstc=tauPSC, transform=[[-1]])
        net.connect(acc_storeQ, error, pstc=tauPSC, transform=[[-1]])

        self.exposeTermination(reward.getTermination("input"), "reward")
        self.exposeTermination(reset.getTermination("input"), "reset")
        self.exposeTermination(currQ.getTermination("input"), "currQ")
        self.exposeTermination(storeQ.getTermination("input"), "storeQ")
        self.exposeOrigin(error.getOrigin("X"), "X")
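
Roughly, this network accumulates reward and a discount term between resets and combines them with the current and stored Q values. A non-neural sketch of that computation (purely illustrative: the step size stands in for intPSC, and the neural gating and saturation details are ignored):

def errorcalc2_sketch(currQ, storeQ, reward_samples, discount, dt=0.1):
    acc_reward = 0.0    # analogue of the "reward" memory
    acc_discount = 0.0  # analogue of "acc_storeQ" (integrated discount term)
    for r in reward_samples:
        acc_reward += dt * r
        acc_discount += dt * discount * max(storeQ, 0.0)
    # error = currQ + accumulated reward - storeQ - accumulated discount term
    return currQ + acc_reward - storeQ - acc_discount

print(errorcalc2_sketch(0.5, 0.4, [1.0] * 6, 0.3))  # -> roughly 0.63
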
Example #40
0
    def __init__(self, num_actions, Qradius=1.0, rewardradius=1.0,
                 discount=0.3):
        """Builds the ErrorNetwork.

        :param num_actions: the number of actions available to the system
        :param Qradius: expected radius of Q values
        :param rewardradius: expected radius of reward signal
        :param discount: discount factor
        """

        self.name = "ErrorNetwork"
        net = nef.Network(self, seed=HRLutils.SEED, quick=False)

        N = 50
        tauPSC = 0.007
        errorcap = 0.1  # soft cap on error magnitude (large errors seem to
        # cause problems with overly-generalizing the learning)

        # set up relays
        vals_relay = net.make("vals_relay", 1, num_actions, mode="direct")
        vals_relay.fixMode()
        vals_relay.addDecodedTermination("input", MU.I(num_actions), 0.001,
                                         False)

        old_vals_relay = net.make("old_vals_relay", 1, num_actions,
                                  mode="direct")
        old_vals_relay.fixMode()
        old_vals_relay.addDecodedTermination("input", MU.I(num_actions), 0.001,
                                             False)

        curr_bg_relay = net.make("curr_bg_relay", 1, num_actions,
                                 mode="direct")
        curr_bg_relay.fixMode()
        curr_bg_relay.addDecodedTermination("input", MU.I(num_actions), 0.001,
                                            False)

        saved_bg_relay = net.make("saved_bg_relay", 1, num_actions,
                                  mode="direct")
        saved_bg_relay.fixMode()
        saved_bg_relay.addDecodedTermination("input", MU.I(num_actions), 0.001,
                                             False)

        # select out only the currently chosen Q value
        gatedQ = net.make_array("gatedQ", N, num_actions,
                                node_factory=HRLutils.node_fac(),
                                radius=Qradius)
        gatedQ.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

        net.connect(vals_relay, gatedQ, pstc=tauPSC)

        net.connect(curr_bg_relay, gatedQ,
                    transform=[[-3 if i != k else 0
                                for k in range(num_actions)]
                               for i in range(num_actions)
                               for _ in range(gatedQ.getNeurons() /
                                              num_actions)],
                    pstc=tauPSC)

        currQ = net.make("currQ", 1, 1, mode="direct")
        currQ.fixMode()
        net.connect(gatedQ, currQ,
                    transform=[[1 for _ in range(num_actions)]], pstc=0.001)

        # select out only the previously chosen Q value
        gatedstoreQ = net.make_array("gatedstoreQ", N, num_actions,
                                     node_factory=HRLutils.node_fac(),
                                     radius=Qradius)
        gatedstoreQ.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

        net.connect(old_vals_relay, gatedstoreQ, pstc=tauPSC)

        net.connect(saved_bg_relay, gatedstoreQ,
                    transform=[[-3 if i != k else 0
                                for k in range(num_actions)]
                               for i in range(num_actions)
                               for _ in range(gatedstoreQ.getNeurons() /
                                              num_actions)],
                    pstc=tauPSC)

        storeQ = net.make("storeQ", 1, 1, mode="direct")
        storeQ.fixMode()
        net.connect(gatedstoreQ, storeQ,
                    transform=[[1 for _ in range(num_actions)]], pstc=0.001)

        # create error calculation network
        error = errorcalc2.ErrorCalc2(discount, rewardradius=rewardradius,
                                      Qradius=Qradius)
        net.add(error)

        net.connect(currQ, error.getTermination("currQ"))
        net.connect(storeQ, error.getTermination("storeQ"))

        # gate error by learning signal and saved BG output (we only want
        # error when the system is supposed to be learning, and we only want
        # error related to the action that was selected)
        gatederror = net.make_array("gatederror", N * 2, num_actions,
                                    radius=errorcap,
                                    node_factory=HRLutils.node_fac())
        gatederror.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

        net.connect(error, gatederror,
                    transform=[[1.0 / Qradius] for _ in range(num_actions)],
                    pstc=tauPSC)
        # scale the error by Qradius, so that we don't get super huge errors
        # (screws up the gating)

        learninggate = net.make("learninggate", N, 1,
                                node_factory=HRLutils.node_fac())
        learninggate.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])
        learninggate.addTermination("gate", [[-10] for _ in range(N)], tauPSC,
                                    False)

        net.connect(learninggate, gatederror, func=lambda x: [1.0],
                    transform=[[-12] for _ in range(gatederror.getNeurons())],
                    pstc=tauPSC)

        net.connect(saved_bg_relay, gatederror,
                    transform=[[-12 if i != k else 0
                                for k in range(num_actions)]
                               for i in range(num_actions)
                               for _ in range(gatederror.getNeurons() /
                                              num_actions)],
                    pstc=tauPSC)

        # add a positive bias to the error anywhere the Q values are negative
        # (to stop Q values from getting too negative, which screws up the
        # action selection)
        posbias = positivebias.PositiveBias(N, num_actions)
        net.add(posbias)
        net.connect(old_vals_relay, posbias.getTermination("input"))
        net.connect(learninggate, posbias.getTermination("learn"),
                    func=lambda x: [1.0])

        biasederror = net.make("biasederror", 1, num_actions, mode="direct")
        biasederror.fixMode()
        net.connect(gatederror, biasederror, pstc=0.001)
        net.connect(posbias, biasederror, pstc=0.001)

        self.exposeTermination(curr_bg_relay.getTermination("input"),
                               "curr_bg_input")
        self.exposeTermination(saved_bg_relay.getTermination("input"),
                               "saved_bg_input")
        self.exposeTermination(vals_relay.getTermination("input"), "vals")
        self.exposeTermination(old_vals_relay.getTermination("input"),
                               "old_vals")
        self.exposeTermination(error.getTermination("reward"), "reward")
        self.exposeTermination(error.getTermination("reset"), "reset")
        self.exposeTermination(learninggate.getTermination("gate"), "learn")
        self.exposeOrigin(biasederror.getOrigin("X"), "error")
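
A rough sketch of what the BG gating above achieves, assuming the signal arriving at curr_bg_relay / saved_bg_relay is approximately one-hot over actions (high for the selected action, near zero elsewhere); names are illustrative:

def select_chosen_q(q_values, bg_output):
    # every Q dimension except the selected one is strongly inhibited, so
    # summing the gated array recovers the chosen action's Q value
    return sum(q for q, g in zip(q_values, bg_output) if g > 0.5)

print(select_chosen_q([0.2, 0.7, 0.1], [0.0, 1.0, 0.0]))  # -> 0.7
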
Example #41
0
def run_deliveryenvironment(navargs, ctrlargs, tag=None, seed=None):
    """Runs the model on the delivery task.

    :param navargs: kwargs for the nav_agent (see SMDPAgent.__init__)
    :param ctrlargs: kwargs for the ctrl_agent (see SMDPAgent.__init__)
    :param tag: string appended to datafiles associated with this run
    :param seed: random seed used for this run
    """

    if seed is not None:
        HRLutils.set_seed(seed)
    seed = HRLutils.SEED

    if tag is None:
        tag = str(seed)

    net = nef.Network("runDeliveryEnvironment", seed=seed)

    stateN = 1200  # number of neurons to use in state population
    contextD = 2  # dimension of context vector
    context_scale = 1.0  # relative scale of context vector vs state vector
    max_state_input = 2  # maximum length of input vector to state population

    # labels and vectors corresponding to basic actions available to the system
    actions = [("up", [0, 1]), ("right", [1, 0]), ("down", [0, -1]),
               ("left", [-1, 0])]

    if "load_weights" in navargs and navargs["load_weights"] is not None:
        navargs["load_weights"] += "_%s" % tag
    if "load_weights" in ctrlargs and ctrlargs["load_weights"] is not None:
        ctrlargs["load_weights"] += "_%s" % tag

    # ##ENVIRONMENT

    env = deliveryenvironment.DeliveryEnvironment(
        actions,
        HRLutils.datafile("contextmap.bmp"),
        colormap={
            -16777216: "wall",
            -1: "floor",
            -256: "a",
            -2088896: "b"
        },
        imgsize=(5, 5),
        dx=0.001,
        placedev=0.5)
    net.add(env)

    print "generated", len(env.placecells), "placecells"

    # ##NAV AGENT

    # generate encoders and divide them by max_state_input (so that inputs
    # will be scaled down to radius 1)
    enc = env.gen_encoders(stateN, contextD, context_scale)
    enc = MU.prod(enc, 1.0 / max_state_input)

    # read in eval points from file
    with open(HRLutils.datafile("contextbmp_evalpoints_%s.txt" % tag)) as f:
        evals = [[float(x) for x in l.split(" ")] for l in f.readlines()]

    nav_agent = smdpagent.SMDPAgent(stateN,
                                    len(env.placecells) + contextD,
                                    actions,
                                    name="NavAgent",
                                    state_encoders=enc,
                                    state_evals=evals,
                                    state_threshold=0.8,
                                    **navargs)
    net.add(nav_agent)

    print "agent neurons:", nav_agent.countNeurons()

    # output of nav_agent is what goes to the environment
    net.connect(nav_agent.getOrigin("action_output"),
                env.getTermination("action"))

    # termination node for nav_agent (just a timer that goes off regularly)
    nav_term_node = terminationnode.TerminationNode(
        {terminationnode.Timer((0.6, 0.9)): None},
        env,
        contextD=2,
        name="NavTermNode")
    net.add(nav_term_node)

    net.connect(nav_term_node.getOrigin("reset"),
                nav_agent.getTermination("reset"))
    net.connect(nav_term_node.getOrigin("learn"),
                nav_agent.getTermination("learn"))
    net.connect(nav_term_node.getOrigin("reset"),
                nav_agent.getTermination("save_state"))
    net.connect(nav_term_node.getOrigin("reset"),
                nav_agent.getTermination("save_action"))

    # ##CTRL AGENT

    # actions corresponding to "go to A" or "go to B"
    actions = [("a", [0, 1]), ("b", [1, 0])]
    ctrl_agent = smdpagent.SMDPAgent(stateN,
                                     len(env.placecells) + contextD,
                                     actions,
                                     name="CtrlAgent",
                                     state_encoders=enc,
                                     state_evals=evals,
                                     state_threshold=0.8,
                                     **ctrlargs)
    net.add(ctrl_agent)

    print "agent neurons:", ctrl_agent.countNeurons()

    # ctrl_agent gets environmental state and reward
    net.connect(env.getOrigin("placewcontext"),
                ctrl_agent.getTermination("state_input"))
    net.connect(env.getOrigin("reward"), ctrl_agent.getTermination("reward"))

    # termination node for ctrl_agent (terminates whenever the agent is in the
    # state targeted by the ctrl_agent)
    # also has a long timer so that ctrl_agent doesn't get permanently stuck
    # in one action
    ctrl_term_node = terminationnode.TerminationNode(
        {
            "a": [0, 1],
            "b": [1, 0],
            terminationnode.Timer((30, 30)): None
        },
        env,
        contextD=2,
        name="CtrlTermNode",
        rewardval=1.5)
    net.add(ctrl_term_node)

    # reward for nav_agent is the pseudoreward from ctrl_agent termination
    net.connect(ctrl_term_node.getOrigin("pseudoreward"),
                nav_agent.getTermination("reward"))

    net.connect(ctrl_term_node.getOrigin("reset"),
                ctrl_agent.getTermination("reset"))
    net.connect(ctrl_term_node.getOrigin("learn"),
                ctrl_agent.getTermination("learn"))
    net.connect(ctrl_term_node.getOrigin("reset"),
                ctrl_agent.getTermination("save_state"))
    net.connect(ctrl_term_node.getOrigin("reset"),
                ctrl_agent.getTermination("save_action"))

    # connect ctrl_agent action to termination context
    # this is used so that ctrl_term_node knows what the current goal is (to
    # determine termination and pseudoreward)
    net.connect(ctrl_agent.getOrigin("action_output"),
                ctrl_term_node.getTermination("context"))

    # state input for nav_agent is the environmental state + the output of
    # ctrl_agent
    ctrl_output_relay = net.make("ctrl_output_relay",
                                 1,
                                 len(env.placecells) + contextD,
                                 mode="direct")
    ctrl_output_relay.fixMode()
    trans = (list(MU.I(len(env.placecells))) +
             [[0 for _ in range(len(env.placecells))]
              for _ in range(contextD)])
    net.connect(env.getOrigin("place"), ctrl_output_relay, transform=trans)
    net.connect(ctrl_agent.getOrigin("action_output"),
                ctrl_output_relay,
                transform=([[0 for _ in range(contextD)]
                            for _ in range(len(env.placecells))] +
                           list(MU.I(contextD))))
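    # (the first transform stacks the place-cell identity block on top of zero
    # rows for the context dimensions, and the second does the reverse, so
    # ctrl_output_relay carries [place cells, ctrl action] as a single vector)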

    net.connect(ctrl_output_relay, nav_agent.getTermination("state_input"))

    # periodically save the weights

    # period to save weights (realtime, not simulation time)
    weight_save = 600.0

    threads = [
        HRLutils.WeightSaveThread(
            nav_agent.getNode("QNetwork").saveParams,
            os.path.join("weights", "%s_%s" % (nav_agent.name, tag)),
            weight_save),
        HRLutils.WeightSaveThread(
            ctrl_agent.getNode("QNetwork").saveParams,
            os.path.join("weights", "%s_%s" % (ctrl_agent.name, tag)),
            weight_save)
    ]

    for t in threads:
        t.start()

    # data collection node
    data = datanode.DataNode(period=5,
                             filename=HRLutils.datafile("dataoutput_%s.txt" %
                                                        tag))
    net.add(data)
    data.record(env.getOrigin("reward"))
    q_net = ctrl_agent.getNode("QNetwork")
    data.record(q_net.getNode("actionvals").getOrigin("X"), func=max)
    data.record(q_net.getNode("actionvals").getOrigin("X"), func=min)
    data.record_sparsity(q_net.getNode("state_pop").getOrigin("AXON"))
    data.record_avg(q_net.getNode("valdiff").getOrigin("X"))
    data.record_avg(ctrl_agent.getNode("ErrorNetwork").getOrigin("error"))

    #     net.add_to_nengo()
    #     net.run(10000)
    net.view()

    for t in threads:
        t.stop()
Example #42
0
    def __init__(self, name, N, d, radius=1.0, inputscale=1.0, recurweight=1.0,
                 direct_storage=False):
        """Builds the Memory network.

        :param name: name of network
        :param N: base number of neurons
        :param d: dimension of stored value
        :param radius: radius of stored value
        :param inputscale: controls how fast the stored value moves to the
            target
        :param recurweight: controls the preservation of the stored value
        :param direct_storage: if True, use directmode for the memory
        """

        self.name = name
        net = nef.Network(self, seed=HRLutils.SEED, quick=False)
        self.dimension = d
        self.radius = radius

        tauPSC = 0.007
        intPSC = 0.1

        # population that will store the value
        if not direct_storage:
            storage = net.make_array("storage", N, d,
                                     node_factory=HRLutils.node_fac(),
                                     eval_points=[[x * 0.001]
                                                  for x in range(-1000, 1000)])
        else:
            storage = net.make("storage", 1, d, mode="direct")
            storage.fixMode()

        net.connect(storage, storage, transform=MU.diag([recurweight
                                                         for _ in range(d)]),
                    pstc=intPSC)

        # storageinput will represent (target - stored_value), which when used
        # as input to storage will drive the stored value to target
        storageinput = net.make_array("storageinput", N, d,
                                      node_factory=HRLutils.node_fac())
        storageinput.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

        storageinput.addDecodedTermination("target",
                                           MU.diag([1.0 / radius
                                                    for _ in range(d)]),
                                           tauPSC, False)
        # note: store everything in the [-1, 1] range by dividing by radius

        # scale storageinput value by inputscale to control rate at which
        # it moves to the target
        net.connect(storageinput, storage, pstc=intPSC,
                    transform=MU.diag([inputscale * intPSC for _ in range(d)]))

        # subtract currently stored value
        net.connect(storage, storageinput, pstc=tauPSC,
                    transform=MU.diag([-1 for _ in range(d)]))

        # we want to open the input gate when the transfer signal arrives (to
        # transfer storageinput to storage). using a double inhibition setup
        # (rather than just feeding it e.g. the inverse of the transfer
        # signal) so that we get a nice clean zero

        # this inhibits the storageinput population (to block input to the
        # storage)
        transferinhib = net.make("transferinhib", N, 1,
                                 node_factory=HRLutils.node_fac())
        transferinhib.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

        transferinhib.addTermination("gate",
                                     [[-10] for _ in
                                      range(transferinhib.getNeurons())],
                                     tauPSC, False)

        net.connect(transferinhib, storageinput, pstc=tauPSC,
                    transform=[[-10] for _ in
                               range(storageinput.getNeurons())])

        # this drives the transferinhib population (so that by default it will
        # block any input). inhibiting transferinhib will thus remove the
        # inhibition on storageinput, and change the stored value
        biasinput = net.make_input("biasinput", [1])

        net.connect(biasinput, transferinhib, pstc=tauPSC)

        # output population (to undo radius scaling)
        storageoutput = net.make("storageoutput", 1, d, mode="direct")
        storageoutput.fixMode()
        net.connect(storage, storageoutput, pstc=0.001,
                    transform=MU.diag([radius for _ in range(d)]))

        self.exposeTermination(transferinhib.getTermination("gate"),
                               "transfer")
        self.exposeTermination(storageinput.getTermination("target"), "target")
        self.exposeOrigin(storageoutput.getOrigin("X"), "X")
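
A rough discrete-time sketch of the dynamics the Memory network implements (purely illustrative: it ignores the neural details, the radius scaling, and the transfer gating, and the step size stands in for intPSC):

def memory_step(stored, target, inputscale=1.0, recurweight=1.0, dt=0.1):
    # storageinput represents (target - stored); it drives storage at a rate
    # set by inputscale, while the recurrent connection (recurweight) preserves
    # the currently stored value
    return recurweight * stored + inputscale * dt * (target - stored)

value = 0.0
for _ in range(50):
    value = memory_step(value, 1.0)
# value has now converged most of the way to the target of 1.0
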
Example #43
0
def run_deliveryenvironment(navargs, ctrlargs, tag=None, seed=None):
    """Runs the model on the delivery task.

    :param navargs: kwargs for the nav_agent (see SMDPAgent.__init__)
    :param ctrlargs: kwargs for the ctrl_agent (see SMDPAgent.__init__)
    :param tag: string appended to datafiles associated with this run
    :param seed: random seed used for this run
    """

    if seed is not None:
        HRLutils.set_seed(seed)
    seed = HRLutils.SEED

    if tag is None:
        tag = str(seed)

    net = nef.Network("runDeliveryEnvironment", seed=seed)

    stateN = 1200  # number of neurons to use in state population
    contextD = 2  # dimension of context vector
    context_scale = 1.0  # relative scale of context vector vs state vector
    max_state_input = 2  # maximum length of input vector to state population

    # labels and vectors corresponding to basic actions available to the system
    actions = [("up", [0, 1]), ("right", [1, 0]),
               ("down", [0, -1]), ("left", [-1, 0])]

    if "load_weights" in navargs and navargs["load_weights"] is not None:
        navargs["load_weights"] += "_%s" % tag
    if "load_weights" in ctrlargs and ctrlargs["load_weights"] is not None:
        ctrlargs["load_weights"] += "_%s" % tag

    # ##ENVIRONMENT

    env = deliveryenvironment.DeliveryEnvironment(
        actions, HRLutils.datafile("contextmap.bmp"),
        colormap={-16777216: "wall", -1: "floor", -256: "a", -2088896: "b"},
        imgsize=(5, 5), dx=0.001, placedev=0.5)
    net.add(env)

    print "generated", len(env.placecells), "placecells"

    # ##NAV AGENT

    # generate encoders and divide them by max_state_input (so that inputs
    # will be scaled down to radius 1)
    enc = env.gen_encoders(stateN, contextD, context_scale)
    enc = MU.prod(enc, 1.0 / max_state_input)

    # read in eval points from file
    with open(HRLutils.datafile("contextbmp_evalpoints_%s.txt" % tag)) as f:
        evals = [[float(x) for x in l.split(" ")] for l in f.readlines()]

    nav_agent = smdpagent.SMDPAgent(stateN, len(env.placecells) + contextD,
                                    actions, name="NavAgent",
                                    state_encoders=enc, state_evals=evals,
                                    state_threshold=0.8,
                                    **navargs)
    net.add(nav_agent)

    print "agent neurons:", nav_agent.countNeurons()

    # output of nav_agent is what goes to the environment
    net.connect(nav_agent.getOrigin("action_output"),
                env.getTermination("action"))

    # termination node for nav_agent (just a timer that goes off regularly)
    nav_term_node = terminationnode.TerminationNode(
        {terminationnode.Timer((0.6, 0.9)): None}, env, contextD=2,
        name="NavTermNode")
    net.add(nav_term_node)

    net.connect(nav_term_node.getOrigin("reset"),
                nav_agent.getTermination("reset"))
    net.connect(nav_term_node.getOrigin("learn"),
                nav_agent.getTermination("learn"))
    net.connect(nav_term_node.getOrigin("reset"),
                nav_agent.getTermination("save_state"))
    net.connect(nav_term_node.getOrigin("reset"),
                nav_agent.getTermination("save_action"))

    # ##CTRL AGENT

    # actions corresponding to "go to A" or "go to B"
    actions = [("a", [0, 1]), ("b", [1, 0])]
    ctrl_agent = smdpagent.SMDPAgent(stateN, len(env.placecells) + contextD,
                                     actions, name="CtrlAgent",
                                     state_encoders=enc, state_evals=evals,
                                     state_threshold=0.8, **ctrlargs)
    net.add(ctrl_agent)

    print "agent neurons:", ctrl_agent.countNeurons()

    # ctrl_agent gets environmental state and reward
    net.connect(env.getOrigin("placewcontext"),
                ctrl_agent.getTermination("state_input"))
    net.connect(env.getOrigin("reward"),
                ctrl_agent.getTermination("reward"))

    # termination node for ctrl_agent (terminates whenever the agent is in the
    # state targeted by the ctrl_agent)
    # also has a long timer so that ctrl_agent doesn't get permanently stuck
    # in one action
    ctrl_term_node = terminationnode.TerminationNode(
        {"a": [0, 1], "b": [1, 0], terminationnode.Timer((30, 30)): None},
        env, contextD=2, name="CtrlTermNode", rewardval=1.5)
    net.add(ctrl_term_node)

    # reward for nav_agent is the pseudoreward from ctrl_agent termination
    net.connect(ctrl_term_node.getOrigin("pseudoreward"),
                nav_agent.getTermination("reward"))

    net.connect(ctrl_term_node.getOrigin("reset"),
                ctrl_agent.getTermination("reset"))
    net.connect(ctrl_term_node.getOrigin("learn"),
                ctrl_agent.getTermination("learn"))
    net.connect(ctrl_term_node.getOrigin("reset"),
                ctrl_agent.getTermination("save_state"))
    net.connect(ctrl_term_node.getOrigin("reset"),
                ctrl_agent.getTermination("save_action"))

    # connect ctrl_agent action to termination context
    # this is used so that ctrl_term_node knows what the current goal is (to
    # determine termination and pseudoreward)
    net.connect(ctrl_agent.getOrigin("action_output"),
                ctrl_term_node.getTermination("context"))

    # state input for nav_agent is the environmental state + the output of
    # ctrl_agent
    ctrl_output_relay = net.make("ctrl_output_relay", 1,
                                 len(env.placecells) + contextD, mode="direct")
    ctrl_output_relay.fixMode()
    trans = (list(MU.I(len(env.placecells))) +
             [[0 for _ in range(len(env.placecells))]
              for _ in range(contextD)])
    net.connect(env.getOrigin("place"), ctrl_output_relay, transform=trans)
    net.connect(ctrl_agent.getOrigin("action_output"), ctrl_output_relay,
                transform=([[0 for _ in range(contextD)]
                            for _ in range(len(env.placecells))] +
                           list(MU.I(contextD))))

    net.connect(ctrl_output_relay, nav_agent.getTermination("state_input"))

    # periodically save the weights

    # period to save weights (realtime, not simulation time)
    weight_save = 600.0

    threads = [
        HRLutils.WeightSaveThread(nav_agent.getNode("QNetwork").saveParams,
                                  os.path.join("weights", "%s_%s" %
                                               (nav_agent.name, tag)),
                                  weight_save),
        HRLutils.WeightSaveThread(ctrl_agent.getNode("QNetwork").saveParams,
                                  os.path.join("weights", "%s_%s" %
                                               (ctrl_agent.name, tag)),
                                  weight_save)]

    for t in threads:
        t.start()

    # data collection node
    data = datanode.DataNode(period=5,
                             filename=HRLutils.datafile("dataoutput_%s.txt" %
                                                        tag))
    net.add(data)
    data.record(env.getOrigin("reward"))
    q_net = ctrl_agent.getNode("QNetwork")
    data.record(q_net.getNode("actionvals").getOrigin("X"), func=max)
    data.record(q_net.getNode("actionvals").getOrigin("X"), func=min)
    data.record_sparsity(q_net.getNode("state_pop").getOrigin("AXON"))
    data.record_avg(q_net.getNode("valdiff").getOrigin("X"))
    data.record_avg(ctrl_agent.getNode("ErrorNetwork").getOrigin("error"))

#     net.add_to_nengo()
#     net.run(10000)
    net.view()

    for t in threads:
        t.stop()
Example #44
0
def run_contextenvironment(args, seed=None):
    """Runs the model on the context task.

    :param args: kwargs for the agent
    :param seed: random seed
    """

    if seed is not None:
        HRLutils.set_seed(seed)
    seed = HRLutils.SEED

    net = nef.Network("runContextEnvironment")

    if "load_weights" in args and args["load_weights"] is not None:
        args["load_weights"] += "_%s" % seed

    stateN = 1200  # number of neurons to use in state population
    contextD = 2  # dimension of context vector
    context_scale = 1.0  # scale of context representation
    max_state_input = 2  # max length of input vector for state population
    # actions (label and vector) available to the system
    actions = [("up", [0, 1]), ("right", [1, 0]), ("down", [0, -1]),
               ("left", [-1, 0])]

    # context labels and rewards for achieving those context goals
    rewards = {"a": 1.5, "b": 1.5}

    env = contextenvironment.ContextEnvironment(
        actions,
        HRLutils.datafile("contextmap.bmp"),
        contextD,
        rewards,
        colormap={
            -16777216: "wall",
            -1: "floor",
            -256: "a",
            -2088896: "b"
        },
        imgsize=(5, 5),
        dx=0.001,
        placedev=0.5)
    net.add(env)

    print "generated", len(env.placecells), "placecells"

    # termination node for agent (just goes off on some regular interval)
    term_node = terminationnode.TerminationNode(
        {terminationnode.Timer((0.6, 0.9)): 0.0}, env)
    net.add(term_node)

    # generate encoders and divide by max_state_input (so that all inputs
    # will end up being radius 1)
    enc = env.gen_encoders(stateN, contextD, context_scale)
    enc = MU.prod(enc, 1.0 / max_state_input)

    # load eval points from file
    with open(HRLutils.datafile("contextbmp_evalpoints_%s.txt" % seed)) as f:
        print "loading contextbmp_evalpoints_%s.txt" % seed
        evals = [[float(x) for x in l.split(" ")] for l in f.readlines()]

    agent = smdpagent.SMDPAgent(stateN,
                                len(env.placecells) + contextD,
                                actions,
                                state_encoders=enc,
                                state_evals=evals,
                                state_threshold=0.8,
                                **args)
    net.add(agent)

    print "agent neurons:", agent.countNeurons()

    # period to save weights (realtime, not simulation time)
    weight_save = 600.0
    t = HRLutils.WeightSaveThread(
        agent.getNode("QNetwork").saveParams,
        os.path.join("weights", "%s_%s" % (agent.name, seed)), weight_save)
    t.start()

    # data collection node
    data = datanode.DataNode(period=5,
                             filename=HRLutils.datafile("dataoutput_%s.txt" %
                                                        seed))
    net.add(data)
    q_net = agent.getNode("QNetwork")
    data.record(env.getOrigin("reward"))
    data.record(q_net.getNode("actionvals").getOrigin("X"), func=max)
    data.record(q_net.getNode("actionvals").getOrigin("X"), func=min)
    data.record_sparsity(q_net.getNode("state_pop").getOrigin("AXON"))
    data.record_avg(q_net.getNode("valdiff").getOrigin("X"))
    data.record_avg(env.getOrigin("state"))

    net.connect(env.getOrigin("placewcontext"),
                agent.getTermination("state_input"))
    net.connect(env.getOrigin("reward"), agent.getTermination("reward"))
    net.connect(term_node.getOrigin("reset"), agent.getTermination("reset"))
    net.connect(term_node.getOrigin("learn"), agent.getTermination("learn"))
    net.connect(term_node.getOrigin("reset"),
                agent.getTermination("save_state"))
    net.connect(term_node.getOrigin("reset"),
                agent.getTermination("save_action"))

    net.connect(agent.getOrigin("action_output"), env.getTermination("action"))

    #    net.add_to_nengo()
    #    net.run(2000)
    net.view()

    t.stop()
Example #45
0
    def __init__(self, gamma, rewardradius=1.0):
        """Builds the ErrorCalc network.

        :param gamma: discount factor
        :param rewardradius: expected radius of reward values
        """

        self.name = "ErrorCalc"
        tauPSC = 0.007
        intPSC = 0.1
        N = 50

        ef = HRLutils.defaultEnsembleFactory()

        # current Q input
        currQ = ef.make("currQ", 1, 1)
        currQ.addDecodedTermination("input", [[1]], 0.001, False)
        self.addNode(currQ)
        currQ.setMode(SimulationMode.DIRECT)
        currQ.fixMode()
        self.exposeTermination(currQ.getTermination("input"), "currQ")

        # input population for resetting the network
        resetef = HRLutils.defaultEnsembleFactory()
        resetef.setEncoderFactory(
            vectorgenerators.DirectedVectorGenerator([1]))
        resetef.getNodeFactory().setIntercept(IndicatorPDF(0.3, 1.0))
        reset = resetef.make("reset", N, 1)
        reset.addDecodedTermination("input", [[1]], tauPSC, False)
        self.addNode(reset)
        reset.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])
        self.exposeTermination(reset.getTermination("input"), "reset")

        # store previous value of Q
        storeQ = memory.Memory("storeQ", N * 4, 1, inputscale=50)
        self.addNode(storeQ)
        self.addProjection(reset.getOrigin("X"),
                           storeQ.getTermination("transfer"))
        self.addProjection(currQ.getOrigin("X"),
                           storeQ.getTermination("target"))

        # calculate discount
        biasInput = FunctionInput("biasinput", [ConstantFunction(1, 1)],
                                  Units.UNK)
        self.addNode(biasInput)

        discount = memory.Memory("discount",
                                 N * 4,
                                 1,
                                 inputscale=50,
                                 recurweight=gamma)
        self.addNode(discount)
        self.addProjection(biasInput.getOrigin("origin"),
                           discount.getTermination("target"))
        self.addProjection(reset.getOrigin("X"),
                           discount.getTermination("transfer"))

        # accumulate discounted reward
        # do we really need gamma to make this all work? if it proves to be a
        # problem, could try removing it, and just use un-discounted reward.
        # we can just use the fact that the reward integrator will saturate to
        # prevent rewards from going to infinity
        discountreward = eprod.Eprod("discountreward",
                                     N * 4,
                                     1,
                                     weights=[[[1.0 / rewardradius]], [[1.0]]],
                                     oneDinput=True)
        self.addNode(discountreward)
        self.exposeTermination(discountreward.getTermination("A"), "reward")
        self.addProjection(discount.getOrigin("X"),
                           discountreward.getTermination("B"))
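        # (the Eprod multiplies the radius-normalized reward by the current
        # value of the discount memory, so the integrator below accumulates
        # discounted reward)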

        reward = ef.make("reward", N * 4, 1)
        reward.addDecodedTermination("input", [[intPSC]], intPSC, False)
        reward.addDecodedTermination("feedback", [[1]], intPSC, False)
        reward.addTermination("gate",
                              [[-8] for _ in range(reward.getNodeCount())],
                              intPSC, False)
        self.addNode(reward)
        reward.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])
        self.addProjection(reward.getOrigin("X"),
                           reward.getTermination("feedback"))
        self.addProjection(discountreward.getOrigin("X"),
                           reward.getTermination("input"))
        self.addProjection(reset.getOrigin("X"), reward.getTermination("gate"))

        # weight currQ by discount
        discountcurrQ = eprod.Eprod("discountcurrQ", N * 4, 1, oneDinput=True)
        self.addNode(discountcurrQ)
        self.addProjection(currQ.getOrigin("X"),
                           discountcurrQ.getTermination("A"))
        self.addProjection(discount.getOrigin("X"),
                           discountcurrQ.getTermination("B"))

        # error calculation
        # radius of 2 since max error = maxQ + maxreward - 0 (unless we let Q
        # values go negative)
        error = ef.make("error", N * 2, [2])
        error.addDecodedTermination("currQ", [[1]], tauPSC, False)
        error.addDecodedTermination("reward", [[1]], tauPSC, False)
        error.addDecodedTermination("storeQ", [[-1]], tauPSC, False)
        self.addNode(error)
        self.addProjection(discountcurrQ.getOrigin("X"),
                           error.getTermination("currQ"))
        self.addProjection(reward.getOrigin("X"),
                           error.getTermination("reward"))
        self.addProjection(storeQ.getOrigin("X"),
                           error.getTermination("storeQ"))
        self.exposeOrigin(error.getOrigin("X"), "X")
Example #46
0
def run_flat_delivery(args, seed=None):
    """Runs the model on the delivery task with only one hierarchical level."""

    if seed is not None:
        HRLutils.set_seed(seed)
    seed = HRLutils.SEED

    net = nef.Network("run_flat_delivery")

    if "load_weights" in args and args["load_weights"] is not None:
        args["load_weights"] += "_%s" % seed

    stateN = 1200
    contextD = 2
    context_scale = 1.0
    max_state_input = 2
    actions = [("up", [0, 1]), ("right", [1, 0]),
               ("down", [0, -1]), ("left", [-1, 0])]

    # ##ENVIRONMENT

    env = deliveryenvironment.DeliveryEnvironment(
        actions, HRLutils.datafile("contextmap.bmp"),
        colormap={-16777216: "wall", -1: "floor", -256: "a", -2088896: "b"},
        imgsize=(5, 5), dx=0.001, placedev=0.5)
    net.add(env)

    print "generated", len(env.placecells), "placecells"

    # ##NAV AGENT

    enc = env.gen_encoders(stateN, contextD, context_scale)
    enc = MU.prod(enc, 1.0 / max_state_input)

    with open(HRLutils.datafile("contextbmp_evalpoints_%s.txt" % seed)) as f:
        evals = [[float(x) for x in l.split(" ")] for l in f.readlines()]

    nav_agent = smdpagent.SMDPAgent(stateN, len(env.placecells) + contextD,
                                    actions, name="NavAgent",
                                    state_encoders=enc, state_evals=evals,
                                    state_threshold=0.8, **args)
    net.add(nav_agent)

    print "agent neurons:", nav_agent.countNeurons()

    net.connect(nav_agent.getOrigin("action_output"),
                env.getTermination("action"))
    net.connect(env.getOrigin("placewcontext"),
                nav_agent.getTermination("state_input"))

    nav_term_node = terminationnode.TerminationNode(
        {terminationnode.Timer((0.6, 0.9)): None}, env, name="NavTermNode",
        contextD=2)
    net.add(nav_term_node)
    net.connect(env.getOrigin("context"),
                nav_term_node.getTermination("context"))
    net.connect(nav_term_node.getOrigin("reset"),
                nav_agent.getTermination("reset"))
    net.connect(nav_term_node.getOrigin("learn"),
                nav_agent.getTermination("learn"))
    net.connect(nav_term_node.getOrigin("reset"),
                nav_agent.getTermination("save_state"))
    net.connect(nav_term_node.getOrigin("reset"),
                nav_agent.getTermination("save_action"))

    reward_relay = net.make("reward_relay", 1, 1, mode="direct")
    reward_relay.fixMode()
    net.connect(env.getOrigin("reward"), reward_relay)
    net.connect(nav_term_node.getOrigin("pseudoreward"), reward_relay)
    net.connect(reward_relay, nav_agent.getTermination("reward"))
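    # (the relay simply sums the environment reward and the termination node's
    # pseudoreward before passing the total on to the nav agent)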

    # period to save weights (realtime, not simulation time)
    weight_save = 600.0
    HRLutils.WeightSaveThread(nav_agent.getNode("QNetwork").saveParams,
                              os.path.join("weights", "%s_%s" %
                                           (nav_agent.name, seed)),
                              weight_save).start()

    # data collection node
    data = datanode.DataNode(period=5,
                             filename=HRLutils.datafile("dataoutput_%s.txt" %
                                                        seed))
    net.add(data)
    q_net = nav_agent.getNode("QNetwork")
    data.record_avg(env.getOrigin("reward"))
    data.record_avg(q_net.getNode("actionvals").getOrigin("X"))
    data.record_sparsity(q_net.getNode("state_pop").getOrigin("AXON"))
    data.record_avg(q_net.getNode("valdiff").getOrigin("X"))
    data.record_avg(nav_agent.getNode("ErrorNetwork").getOrigin("error"))

#    net.add_to_nengo()
#    net.run(10000)
    net.view()
Example #47
0
    def __init__(self,
                 num_actions,
                 Qradius=1.0,
                 rewardradius=1.0,
                 discount=0.3):
        """Builds the ErrorNetwork.

        :param num_actions: the number of actions available to the system
        :param Qradius: expected radius of Q values
        :param rewardradius: expected radius of reward signal
        :param discount: discount factor
        """

        self.name = "ErrorNetwork"
        net = nef.Network(self, seed=HRLutils.SEED, quick=False)

        N = 50
        tauPSC = 0.007
        errorcap = 0.1  # soft cap on error magnitude (large errors seem to
        # cause problems with overly-generalizing the learning)

        # set up relays
        vals_relay = net.make("vals_relay", 1, num_actions, mode="direct")
        vals_relay.fixMode()
        vals_relay.addDecodedTermination("input", MU.I(num_actions), 0.001,
                                         False)

        old_vals_relay = net.make("old_vals_relay",
                                  1,
                                  num_actions,
                                  mode="direct")
        old_vals_relay.fixMode()
        old_vals_relay.addDecodedTermination("input", MU.I(num_actions), 0.001,
                                             False)

        curr_bg_relay = net.make("curr_bg_relay",
                                 1,
                                 num_actions,
                                 mode="direct")
        curr_bg_relay.fixMode()
        curr_bg_relay.addDecodedTermination("input", MU.I(num_actions), 0.001,
                                            False)

        saved_bg_relay = net.make("saved_bg_relay",
                                  1,
                                  num_actions,
                                  mode="direct")
        saved_bg_relay.fixMode()
        saved_bg_relay.addDecodedTermination("input", MU.I(num_actions), 0.001,
                                             False)

        # select out only the currently chosen Q value
        gatedQ = net.make_array("gatedQ",
                                N * 2,
                                num_actions,
                                node_factory=HRLutils.node_fac(),
                                radius=Qradius)
        gatedQ.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

        net.connect(vals_relay, gatedQ, pstc=tauPSC)

        net.connect(
            curr_bg_relay,
            gatedQ,
            transform=[[-3 if i != k else 0 for k in range(num_actions)]
                       for i in range(num_actions)
                       for _ in range(gatedQ.getNeurons() / num_actions)],
            pstc=tauPSC)

        currQ = net.make("currQ", 1, 1, mode="direct")
        currQ.fixMode()
        net.connect(gatedQ,
                    currQ,
                    transform=[[1 for _ in range(num_actions)]],
                    pstc=0.001)

        # select out only the previously chosen Q value
        gatedstoreQ = net.make_array("gatedstoreQ",
                                     N * 2,
                                     num_actions,
                                     node_factory=HRLutils.node_fac(),
                                     radius=Qradius)
        gatedstoreQ.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

        net.connect(old_vals_relay, gatedstoreQ, pstc=tauPSC)

        net.connect(
            saved_bg_relay,
            gatedstoreQ,
            transform=[[-3 if i != k else 0 for k in range(num_actions)]
                       for i in range(num_actions)
                       for _ in range(gatedstoreQ.getNeurons() / num_actions)],
            pstc=tauPSC)

        storeQ = net.make("storeQ", 1, 1, mode="direct")
        storeQ.fixMode()
        net.connect(gatedstoreQ,
                    storeQ,
                    transform=[[1 for _ in range(num_actions)]],
                    pstc=0.001)

        # create error calculation network
        error = errorcalc2.ErrorCalc2(discount,
                                      rewardradius=rewardradius,
                                      Qradius=Qradius)
        net.add(error)

        net.connect(currQ, error.getTermination("currQ"))
        net.connect(storeQ, error.getTermination("storeQ"))

        # gate error by learning signal and saved BG output (we only want error
        # when the system is supposed to be learning, and we only want error
        # related to the action that was selected)
        gatederror = net.make_array("gatederror",
                                    N * 2,
                                    num_actions,
                                    radius=errorcap,
                                    node_factory=HRLutils.node_fac())
        gatederror.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

        net.connect(error,
                    gatederror,
                    transform=[[1.0 / Qradius] for _ in range(num_actions)],
                    pstc=tauPSC)
        # scale the error by Qradius, so that we don't get super huge errors
        # (causes problems with the gating)

        learninggate = net.make("learninggate",
                                N,
                                1,
                                node_factory=HRLutils.node_fac())
        learninggate.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])
        learninggate.addTermination("gate", [[-10] for _ in range(N)], tauPSC,
                                    False)

        net.connect(learninggate,
                    gatederror,
                    func=lambda x: [1.0],
                    transform=[[-12] for _ in range(gatederror.getNeurons())],
                    pstc=tauPSC)

        net.connect(
            saved_bg_relay,
            gatederror,
            transform=[[-12 if i != k else 0 for k in range(num_actions)]
                       for i in range(num_actions)
                       for _ in range(gatederror.getNeurons() / num_actions)],
            pstc=tauPSC)

        # add a positive bias to the error anywhere the Q values are negative
        # (to stop Q values from getting too negative, which causes problems
        # with the action selection)
        posbias = positivebias.PositiveBias(N, num_actions)
        net.add(posbias)
        net.connect(old_vals_relay, posbias.getTermination("input"))
        net.connect(learninggate,
                    posbias.getTermination("learn"),
                    func=lambda x: [1.0])

        biasederror = net.make("biasederror", 1, num_actions, mode="direct")
        biasederror.fixMode()
        net.connect(gatederror, biasederror, pstc=0.001)
        net.connect(posbias, biasederror, pstc=0.001)

        self.exposeTermination(curr_bg_relay.getTermination("input"),
                               "curr_bg_input")
        self.exposeTermination(saved_bg_relay.getTermination("input"),
                               "saved_bg_input")
        self.exposeTermination(vals_relay.getTermination("input"), "vals")
        self.exposeTermination(old_vals_relay.getTermination("input"),
                               "old_vals")
        self.exposeTermination(error.getTermination("reward"), "reward")
        self.exposeTermination(error.getTermination("reset"), "reset")
        self.exposeTermination(learninggate.getTermination("gate"), "learn")
        self.exposeOrigin(biasederror.getOrigin("X"), "error")
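
The gating connections above all use transform matrices of the form
[-w if i != k else 0 ...], which inhibit every block of neurons in the gated
array except the block corresponding to the selected action. A minimal
standalone sketch of that layout (plain Python; build_gate_transform is a
hypothetical helper, not part of the original code):

def build_gate_transform(num_actions, neurons_per_action, weight=-3):
    # one row per neuron in the gated array; column k carries the inhibitory
    # weight whenever the neuron's action block i differs from k
    return [[weight if i != k else 0 for k in range(num_actions)]
            for i in range(num_actions)
            for _ in range(neurons_per_action)]

# e.g. 2 actions, 2 neurons per action block: neurons for action 0 are
# inhibited by action 1's output, and vice versa
assert build_gate_transform(2, 2) == [[0, -3], [0, -3], [-3, 0], [-3, 0]]
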
Example #48
0
    def __init__(self,
                 name,
                 N,
                 stateN,
                 actions,
                 learningrate,
                 Qradius=1.0,
                 init_decoders=None):
        """Build ActionValues network.

        :param name: name of Network
        :param N: base number of neurons
        :param stateN: number of neurons in state population
        :param actions: actions available to the system
            :type actions: list of tuples (action_name,action_vector)
        :param learningrate: learning rate for PES rule
        :param Qradius: expected radius of Q values
        :param init_decoders: if specified, will be used to initialize the
            connection weights to whatever function is specified by decoders
        """

        self.name = name
        net = nef.Network(self, seed=HRLutils.SEED, quick=False)

        self.N = N
        self.learningrate = learningrate
        self.supervision = 1.0  # don't use the unsupervised stuff at all

        self.tauPSC = 0.007

        modterms = []
        learnterms = []

        # relays
        output = net.make("output", 1, len(actions), mode="direct")
        output.fixMode()

        for i, action in enumerate(actions):
            # create one population corresponding to each action
            act_pop = net.make("action_" + action[0],
                               self.N * 4,
                               1,
                               node_factory=HRLutils.node_fac())
            act_pop.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

            # add error termination
            modterm = act_pop.addDecodedTermination(
                "error", [[0 if j != i else 1 for j in range(len(actions))]],
                0.005, True)
            # set modulatory transform so that it selects one dimension of
            # the error signal

            # create learning termination
            if init_decoders is not None:
                weights = MU.prod(act_pop.getEncoders(),
                                  MU.transpose(init_decoders))
            else:
                weights = [[
                    random.uniform(-1e-3, 1e-3) for j in range(stateN)
                ] for i in range(act_pop.getNeurons())]
            learningterm = act_pop.addHPESTermination("learning", weights,
                                                      0.005, False, None)

            # initialize the learning rule
            net.learn(act_pop,
                      learningterm,
                      modterm,
                      rate=self.learningrate,
                      supervisionRatio=self.supervision)

            # connect each action back to output relay
            net.connect(act_pop.getOrigin("X"),
                        output,
                        transform=[[0] if j != i else [Qradius]
                                   for j in range(len(actions))],
                        pstc=0.001)
            # note, we learn all the Q values with radius 1, then just
            # multiply by the desired Q radius here

            modterms += [modterm]
            learnterms += [learningterm]

        # use EnsembleTerminations to group the individual action terminations
        # into one multi-dimensional termination
        self.exposeTermination(EnsembleTermination(self, "state", learnterms),
                               "state")
        self.exposeTermination(EnsembleTermination(self, "error", modterms),
                               "error")

        self.exposeOrigin(output.getOrigin("X"), "X")
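
Each action population above learns its Q value with radius 1; the one-hot
"error" termination transform picks out that action's component of the error
vector, and the output connection scales the decoded value back up by
Qradius. A rough standalone sketch of those two transforms (plain Python,
illustrative sizes only):

num_actions = 3
Qradius = 2.0
i = 1  # index of this action's population

# modulatory transform: a row vector selecting dimension i of the error
error_select = [[0 if j != i else 1 for j in range(num_actions)]]

# output transform: a column vector placing this action's (radius-1) Q value
# into dimension i of the output relay, scaled up by Qradius
output_transform = [[0] if j != i else [Qradius] for j in range(num_actions)]

assert error_select == [[0, 1, 0]]
assert output_transform == [[0], [2.0], [0]]
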
Example #49
0
def run_flat_delivery(args, seed=None):
    """Runs the model on the delivery task with only one hierarchical level."""

    if seed is not None:
        HRLutils.set_seed(seed)
    seed = HRLutils.SEED

    net = nef.Network("run_flat_delivery")

    if "load_weights" in args and args["load_weights"] is not None:
        args["load_weights"] += "_%s" % seed

    stateN = 1200
    contextD = 2
    context_scale = 1.0
    max_state_input = 2
    actions = [("up", [0, 1]), ("right", [1, 0]), ("down", [0, -1]),
               ("left", [-1, 0])]

    # ##ENVIRONMENT
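    # note: the colormap keys below appear to be signed 32-bit ARGB pixel
    # values, e.g. -16777216 == 0xFF000000 (opaque black) for walls,
    # -1 == 0xFFFFFFFF (white) for floor and -256 == 0xFFFFFF00 (yellow)
    # for location "a"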

    env = deliveryenvironment.DeliveryEnvironment(
        actions,
        HRLutils.datafile("contextmap.bmp"),
        colormap={
            -16777216: "wall",
            -1: "floor",
            -256: "a",
            -2088896: "b"
        },
        imgsize=(5, 5),
        dx=0.001,
        placedev=0.5)
    net.add(env)

    print "generated", len(env.placecells), "placecells"

    # ##NAV AGENT

    enc = env.gen_encoders(stateN, contextD, context_scale)
    enc = MU.prod(enc, 1.0 / max_state_input)

    with open(HRLutils.datafile("contextbmp_evalpoints_%s.txt" % seed)) as f:
        evals = [[float(x) for x in l.split(" ")] for l in f.readlines()]

    nav_agent = smdpagent.SMDPAgent(stateN,
                                    len(env.placecells) + contextD,
                                    actions,
                                    name="NavAgent",
                                    state_encoders=enc,
                                    state_evals=evals,
                                    state_threshold=0.8,
                                    **args)
    net.add(nav_agent)

    print "agent neurons:", nav_agent.countNeurons()

    net.connect(nav_agent.getOrigin("action_output"),
                env.getTermination("action"))
    net.connect(env.getOrigin("placewcontext"),
                nav_agent.getTermination("state_input"))

    nav_term_node = terminationnode.TerminationNode(
        {terminationnode.Timer((0.6, 0.9)): None},
        env,
        name="NavTermNode",
        contextD=2)
    net.add(nav_term_node)
    net.connect(env.getOrigin("context"),
                nav_term_node.getTermination("context"))
    net.connect(nav_term_node.getOrigin("reset"),
                nav_agent.getTermination("reset"))
    net.connect(nav_term_node.getOrigin("learn"),
                nav_agent.getTermination("learn"))
    net.connect(nav_term_node.getOrigin("reset"),
                nav_agent.getTermination("save_state"))
    net.connect(nav_term_node.getOrigin("reset"),
                nav_agent.getTermination("save_action"))

    reward_relay = net.make("reward_relay", 1, 1, mode="direct")
    reward_relay.fixMode()
    net.connect(env.getOrigin("reward"), reward_relay)
    net.connect(nav_term_node.getOrigin("pseudoreward"), reward_relay)
    net.connect(reward_relay, nav_agent.getTermination("reward"))

    # period to save weights (realtime, not simulation time)
    weight_save = 600.0
    HRLutils.WeightSaveThread(
        nav_agent.getNode("QNetwork").saveParams,
        os.path.join("weights", "%s_%s" % (nav_agent.name, seed)),
        weight_save).start()

    # data collection node
    data = datanode.DataNode(period=5,
                             filename=HRLutils.datafile("dataoutput_%s.txt" %
                                                        seed))
    net.add(data)
    q_net = nav_agent.getNode("QNetwork")
    data.record_avg(env.getOrigin("reward"))
    data.record_avg(q_net.getNode("actionvals").getOrigin("X"))
    data.record_sparsity(q_net.getNode("state_pop").getOrigin("AXON"))
    data.record_avg(q_net.getNode("valdiff").getOrigin("X"))
    data.record_avg(nav_agent.getNode("ErrorNetwork").getOrigin("error"))

    #    net.add_to_nengo()
    #    net.run(10000)
    net.view()
Example #50
0
    def __init__(self,
                 stateN,
                 stateD,
                 state_encoders,
                 actions,
                 learningrate,
                 stateradius=1.0,
                 Qradius=1.0,
                 load_weights=None,
                 state_evals=None,
                 state_threshold=(0.0, 1.0),
                 statediff_threshold=0.2,
                 init_Qs=None):
        """Builds the QNetwork.

        :param stateN: number of neurons to use to represent state
        :param stateD: dimension of state vector
        :param state_encoders: encoders to use for neurons in state population
        :param actions: actions available to the system
            :type actions: list of tuples (action_name,action_vector)
        :param learningrate: learningrate for action value learning rule
        :param stateradius: expected radius of state values
        :param Qradius: expected radius of Q values
        :param load_weights: filename to load Q value weights from
        :param state_evals: evaluation points to use for state population.
            This is used when initializing the Q values (may be necessary if
            the input states don't tend to fall in the hypersphere).
        :param state_threshold: threshold range of state neurons
        :param statediff_threshold: maximum state difference for dual training
        :param init_Qs: initial Q values
        """

        self.name = "QNetwork"
        net = nef.Network(self, seed=HRLutils.SEED, quick=False)

        N = 50
        tauPSC = 0.007
        num_actions = len(actions)
        init_Qs = [0.2] * num_actions if init_Qs is None else init_Qs

        # if True, use neuron-to-neuron weight learning; otherwise, use
        # decoder learning
        self.neuron_learning = False

        # set up relays
        state_relay = net.make("state_relay", 1, stateD, mode="direct")
        state_relay.fixMode()
        state_relay.addDecodedTermination("input", MU.I(stateD), 0.001, False)

        # create state population
        state_fac = HRLutils.node_fac()
        if isinstance(state_threshold, (float, int)):
            state_threshold = (state_threshold, 1.0)
        state_fac.setIntercept(
            IndicatorPDF(state_threshold[0], state_threshold[1]))

        state_pop = net.make("state_pop",
                             stateN,
                             stateD,
                             radius=stateradius,
                             node_factory=state_fac,
                             encoders=state_encoders,
                             eval_points=state_evals)
        state_pop.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

        net.connect(state_relay, state_pop, pstc=tauPSC)

        # store the state value (used to drive population encoding previous
        # state)
        saved_state = memory.Memory("saved_state",
                                    N * 4,
                                    stateD,
                                    inputscale=50,
                                    radius=stateradius,
                                    direct_storage=True)
        net.add(saved_state)

        net.connect(state_relay, saved_state.getTermination("target"))

        # create population representing previous state
        old_state_pop = net.make("old_state_pop",
                                 stateN,
                                 stateD,
                                 radius=stateradius,
                                 node_factory=state_fac,
                                 encoders=state_encoders,
                                 eval_points=state_evals)
        old_state_pop.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

        net.connect(saved_state, old_state_pop, pstc=tauPSC)

        # set up action nodes
        if self.neuron_learning:
            # use ActionValues network to compute Q values

            # current Q values
            decoders = state_pop.addDecodedOrigin(
                "init_decoders", [ConstantFunction(stateD, init_Qs)],
                "AXON").getDecoders()
            actionvals = actionvalues.ActionValues("actionvals",
                                                   N,
                                                   stateN,
                                                   actions,
                                                   learningrate,
                                                   Qradius=Qradius,
                                                   init_decoders=decoders)
            net.add(actionvals)

            net.connect(state_pop.getOrigin("AXON"),
                        actionvals.getTermination("state"))

            # Q values of previous state
            decoders = old_state_pop.addDecodedOrigin(
                "init_decoders", [ConstantFunction(stateD, init_Qs)],
                "AXON").getDecoders()
            old_actionvals = actionvalues.ActionValues("old_actionvals",
                                                       N,
                                                       stateN,
                                                       actions,
                                                       learningrate,
                                                       Qradius=Qradius,
                                                       init_decoders=decoders)
            net.add(old_actionvals)

            net.connect(old_state_pop.getOrigin("AXON"),
                        old_actionvals.getTermination("state"))
        else:
            # just use decoder on state population to compute Q values

            # current Q values
            origin = state_pop.addDecodedOrigin("vals", [
                ConstantFunction(num_actions, init_Qs[i])
                for i in range(num_actions)
            ], "AXON")
            state_dlnode = decoderlearningnode.DecoderLearningNode(
                state_pop,
                origin,
                learningrate,
                num_actions,
                name="state_learningnode")
            net.add(state_dlnode)

            # just a little relay node, so that things match up with the rest
            # of the script when using the neuron-to-neuron learning
            actionvals = net.make("actionvals", 1, num_actions, mode="direct")
            actionvals.fixMode()
            net.connect(origin, actionvals, pstc=0.001)

            # Q values of previous state
            origin = old_state_pop.addDecodedOrigin("vals", [
                ConstantFunction(num_actions, init_Qs[i])
                for i in range(num_actions)
            ], "AXON")
            old_state_dlnode = decoderlearningnode.DecoderLearningNode(
                old_state_pop,
                origin,
                learningrate,
                num_actions,
                name="old_state_learningnode")
            net.add(old_state_dlnode)

            old_actionvals = net.make("old_actionvals",
                                      1,
                                      num_actions,
                                      mode="direct")
            old_actionvals.fixMode()
            net.connect(origin, old_actionvals, pstc=0.001)

        if load_weights is not None:
            self.loadParams(load_weights)

        # find error between old_actionvals and actionvals (this will be used
        # to drive learning on the new actionvals)
        valdiff = net.make_array("valdiff",
                                 N,
                                 num_actions,
                                 node_factory=HRLutils.node_fac())
        # doubling the values to get a bigger error signal
        net.connect(old_actionvals,
                    valdiff,
                    transform=MU.diag([2] * num_actions),
                    pstc=tauPSC)
        net.connect(actionvals,
                    valdiff,
                    transform=MU.diag([-2] * num_actions),
                    pstc=tauPSC)

        # calculate diff between curr_state and saved_state and use that to
        # gate valdiff (we only want to train the curr state based on previous
        # state when the two have similar values)
        # note: threshold > 0 so that there is a deadzone in the middle (when
        # the states are similar) where there will be no output inhibition
        statediff = net.make_array("statediff",
                                   N,
                                   stateD,
                                   intercept=(statediff_threshold, 1))

        net.connect(state_relay, statediff, pstc=tauPSC)
        net.connect(saved_state,
                    statediff,
                    transform=MU.diag([-1] * stateD),
                    pstc=tauPSC)

        net.connect(statediff,
                    valdiff,
                    func=lambda x: [abs(v) for v in x],
                    transform=[[-10] * stateD
                               for _ in range(valdiff.getNeurons())],
                    pstc=tauPSC)

        # connect up valdiff to the error signal for current Q values, and
        # expose the error signal for the previous Q values to the external
        # error
        if self.neuron_learning:
            net.connect(valdiff, actionvals.getTermination("error"))
            self.exposeTermination(old_actionvals.getTermination("error"),
                                   "error")
        else:
            net.connect(valdiff, state_dlnode.getTermination("error"))
            self.exposeTermination(old_state_dlnode.getTermination("error"),
                                   "error")

        self.exposeTermination(state_relay.getTermination("input"), "state")
        self.exposeTermination(saved_state.getTermination("transfer"),
                               "save_state")
        self.exposeOrigin(actionvals.getOrigin("X"), "vals")
        self.exposeOrigin(old_actionvals.getOrigin("X"), "old_vals")
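
The valdiff array above effectively computes 2 * (old Q - current Q), and
the statediff connection suppresses that signal whenever the current and
saved states differ by more than statediff_threshold. A rough arithmetic
sketch of the intended behaviour (plain Python, toy numbers; the real
network does this with inhibition rather than an explicit if):

old_q = [0.6, 0.2]        # Q values decoded from the saved (previous) state
cur_q = [0.4, 0.3]        # Q values decoded from the current state
state_diff = [0.05, 0.1]  # |current state - saved state|, per dimension
threshold = 0.2

# doubled difference used as the learning error signal
valdiff = [2 * o - 2 * c for o, c in zip(old_q, cur_q)]

# if any state dimension differs by more than the threshold, the statediff
# population becomes active and shuts valdiff off
if any(abs(d) > threshold for d in state_diff):
    valdiff = [0.0 for _ in valdiff]

assert [round(v, 3) for v in valdiff] == [0.4, -0.2]
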
Example #51
0
def run_badreenvironment(nav_args,
                         ctrl_args,
                         bias=0.0,
                         seed=None,
                         flat=False,
                         label="tmp"):
    """Runs the model on the Badre et al. (2010) task."""

    if seed is not None:
        HRLutils.set_seed(seed)
    seed = HRLutils.SEED

    net = nef.Network("run_badreenvironment")

    env = badreenvironment.BadreEnvironment(flat=flat)
    net.add(env)

    # ##NAV AGENT
    stateN = 500
    max_state_input = 3
    enc = env.gen_encoders(stateN, 0, 0.0)

    # generate evaluation points
    orientations = MU.I(env.num_orientations)
    shapes = MU.I(env.num_shapes)
    colours = MU.I(env.num_colours)
    evals = (
        list(MU.diag([3 for _ in range(env.stateD)])) +
        [o + s + c for o in orientations for s in shapes for c in colours])

    # create lower level
    nav_agent = smdpagent.SMDPAgent(stateN,
                                    env.stateD,
                                    env.actions,
                                    name="NavAgent",
                                    stateradius=max_state_input,
                                    state_encoders=enc,
                                    state_evals=evals,
                                    discount=0.5,
                                    **nav_args)
    net.add(nav_agent)

    print "agent neurons:", nav_agent.countNeurons()

    # actions terminate on fixed schedule (aligned with environment)
    nav_term_node = terminationnode.TerminationNode(
        {terminationnode.Timer((0.6, 0.6)): None},
        env,
        name="NavTermNode",
        state_delay=0.1,
        reset_delay=0.05,
        reset_interval=0.1)
    net.add(nav_term_node)

    net.connect(nav_term_node.getOrigin("reset"),
                nav_agent.getTermination("reset"))
    net.connect(nav_term_node.getOrigin("learn"),
                nav_agent.getTermination("learn"))
    net.connect(nav_term_node.getOrigin("reset"),
                nav_agent.getTermination("save_state"))
    net.connect(nav_term_node.getOrigin("reset"),
                nav_agent.getTermination("save_action"))

    net.connect(nav_agent.getOrigin("action_output"),
                env.getTermination("action"))

    # ##CTRL AGENT
    stateN = 500
    enc = RandomHypersphereVG().genVectors(stateN, env.stateD)
    actions = [("shape", [0, 1]), ("orientation", [1, 0]), ("null", [0, 0])]
    ctrl_agent = smdpagent.SMDPAgent(stateN,
                                     env.stateD,
                                     actions,
                                     name="CtrlAgent",
                                     state_encoders=enc,
                                     stateradius=max_state_input,
                                     state_evals=evals,
                                     discount=0.4,
                                     **ctrl_args)
    net.add(ctrl_agent)

    print "agent neurons:", ctrl_agent.countNeurons()

    net.connect(env.getOrigin("state"),
                ctrl_agent.getTermination("state_input"))

    ctrl_term_node = terminationnode.TerminationNode(
        {terminationnode.Timer((0.6, 0.6)): None},
        env,
        name="CtrlTermNode",
        state_delay=0.1,
        reset_delay=0.05,
        reset_interval=0.1)
    net.add(ctrl_term_node)

    net.connect(ctrl_term_node.getOrigin("reset"),
                ctrl_agent.getTermination("reset"))
    net.connect(ctrl_term_node.getOrigin("learn"),
                ctrl_agent.getTermination("learn"))
    net.connect(ctrl_term_node.getOrigin("reset"),
                ctrl_agent.getTermination("save_state"))
    net.connect(ctrl_term_node.getOrigin("reset"),
                ctrl_agent.getTermination("save_action"))

    # ctrl gets a slight bonus if it selects a rule (as opposed to null), to
    # encourage it to not just pick null all the time
    reward_relay = net.make("reward_relay", 1, 3, mode="direct")
    reward_relay.fixMode()
    net.connect(env.getOrigin("reward"),
                reward_relay,
                transform=[[1], [0], [0]])
    net.connect(ctrl_agent.getOrigin("action_output"),
                reward_relay,
                transform=[[0, 0], [1, 0], [0, 1]])

    net.connect(reward_relay,
                ctrl_agent.getTermination("reward"),
                func=lambda x: ((x[0] + bias * abs(x[0]))
                                if x[1] + x[2] > 0.5 else x[0]),
                origin_name="ctrl_reward")

    # ideal reward function (for testing)
    #     def ctrl_reward_func(x):
    #         if abs(x[0]) < 0.5:
    #             return 0.0
    #
    #         if flat:
    #             return 1.5 if x[1] + x[2] < 0.5 else -1.5
    #         else:
    #             if x[1] + x[2] < 0.5:
    #                 return -1.5
    #             if [round(a) for a in env.state[-2:]] == [round(b)
    #                                                       for b in x[1:]]:
    #                 return 1.5
    #             else:
    #                 return -1.5
    #     net.connect(reward_relay, ctrl_agent.getTermination("reward"),
    #                 func=ctrl_reward_func)

    # nav rewarded for picking ctrl target
    def nav_reward_func(x):
        if abs(x[0]) < 0.5 or env.action is None:
            return 0.0

        if x[1] + x[2] < 0.5:
            return x[0]

        if x[1] > x[2]:
            return (1.5 if env.action[1] == env.state[:env.num_orientations]
                    else -1.5)
        else:
            return (1.5 if env.action[1]
                    == env.state[env.num_orientations:-env.num_colours] else
                    -1.5)

    net.connect(reward_relay,
                nav_agent.getTermination("reward"),
                func=nav_reward_func)

    # state for navagent controlled by ctrlagent
    ctrl_state_inhib = net.make_array("ctrl_state_inhib",
                                      50,
                                      env.stateD,
                                      radius=2,
                                      mode=HRLutils.SIMULATION_MODE)
    ctrl_state_inhib.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

    inhib_matrix = [[0, -5]] * 50 * env.num_orientations + \
                   [[-5, 0]] * 50 * env.num_shapes + \
                   [[-5, -5]] * 50 * env.num_colours

    # ctrl output inhibits all the non-selected aspects of the state
    net.connect(env.getOrigin("state"), ctrl_state_inhib)
    net.connect(ctrl_agent.getOrigin("action_output"),
                ctrl_state_inhib,
                transform=inhib_matrix)

    # also give a boost to the selected aspects (so that neurons are roughly
    # equally activated).
    def boost_func(x):
        if x[0] > 0.5:
            return [3 * v for v in x[1:]]
        else:
            return x[1:]

    boost = net.make("boost", 1, 1 + env.stateD, mode="direct")
    boost.fixMode()
    net.connect(ctrl_state_inhib,
                boost,
                transform=([[0 for _ in range(env.stateD)]] +
                           list(MU.I(env.stateD))))
    net.connect(ctrl_agent.getOrigin("action_output"),
                boost,
                transform=[[1, 1]] + [[0, 0] for _ in range(env.stateD)])

    net.connect(boost,
                nav_agent.getTermination("state_input"),
                func=boost_func)

    # save weights
    weight_save = 1.0  # period to save weights (realtime, not simulation time)
    threads = [
        HRLutils.WeightSaveThread(
            nav_agent.getNode("QNetwork").saveParams,
            os.path.join("weights", "%s_%s" % (nav_agent.name, seed)),
            weight_save),
        HRLutils.WeightSaveThread(
            ctrl_agent.getNode("QNetwork").saveParams,
            os.path.join("weights", "%s_%s" % (ctrl_agent.name, seed)),
            weight_save)
    ]
    for t in threads:
        t.start()

    # data collection node
    data = datanode.DataNode(
        period=1,
        filename=HRLutils.datafile("dataoutput_%s.txt" % label),
        header="%s %s %s %s %s" % (nav_args, ctrl_args, bias, seed, flat))
    print "saving data to", data.filename
    print "header", data.header
    net.add(data)
    nav_q = nav_agent.getNode("QNetwork")
    ctrl_q = ctrl_agent.getNode("QNetwork")
    ctrl_bg = ctrl_agent.getNode("BGNetwork").getNode("weight_actions")
    data.record_avg(env.getOrigin("reward"))
    data.record_avg(ctrl_q.getNode("actionvals").getOrigin("X"))
    data.record_sparsity(ctrl_q.getNode("state_pop").getOrigin("AXON"))
    data.record_sparsity(nav_q.getNode("state_pop").getOrigin("AXON"))
    data.record_avg(ctrl_q.getNode("valdiff").getOrigin("X"))
    data.record_avg(ctrl_agent.getNode("ErrorNetwork").getOrigin("error"))
    data.record_avg(ctrl_bg.getNode("0").getOrigin("AXON"))
    data.record_avg(ctrl_bg.getNode("1").getOrigin("AXON"))
    data.record(env.getOrigin("score"))

    #     net.add_to_nengo()
    #     net.network.simulator.run(0, 300, 0.001)
    net.view()

    for t in threads:
        t.stop()
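
Two pieces of the wiring above are easy to check in isolation: the
inhib_matrix layout (one block of rows per state aspect, with the ctrl
agent's 2-D action output inhibiting the non-selected aspects) and the
biased reward function connected to the ctrl agent. A standalone sketch with
toy sizes (plain Python; the dimensions here are illustrative, not the real
environment's):

neurons_per_dim = 2
num_orientations, num_shapes, num_colours = 1, 1, 1

inhib_matrix = [[0, -5]] * neurons_per_dim * num_orientations + \
               [[-5, 0]] * neurons_per_dim * num_shapes + \
               [[-5, -5]] * neurons_per_dim * num_colours
# selecting "orientation" ([1, 0]) drives the [0, -5] rows to 0 (orientation
# dimensions untouched) and the [-5, 0] / [-5, -5] rows to -5 (shape and
# colour dimensions inhibited)

def ctrl_reward(x, bias=0.2):
    # x = [environment reward, rule components]; add a small bonus
    # proportional to |reward| whenever a rule (not "null") was selected
    return (x[0] + bias * abs(x[0])) if x[1] + x[2] > 0.5 else x[0]

assert round(ctrl_reward([1.0, 1.0, 0.0]), 6) == 1.2  # rule chosen: bonus
assert ctrl_reward([1.0, 0.0, 0.0]) == 1.0            # null chosen: no bonus
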
Example #52
0
    def __init__(self,
                 name,
                 N,
                 d,
                 scale=1.0,
                 weights=None,
                 maxinput=1.0,
                 oneDinput=False):
        # scale is a scale on the output of the multiplication
        # output = (input1.*input2)*scale

        # weights are optional matrices applied to each input
        # output = (C1*input1 .* C2*input2)*scale

        # maxinput is the maximum expected value of any dimension of the
        # inputs. this is used to scale the inputs internally so that the
        # length of the vectors in the intermediate populations are not
        # too small (which results in a lot of noise in the calculations)

        # oneDinput indicates that the second input is one dimensional, and is
        # just a scale on the first input rather than an element-wise product

        self.name = name
        tauPSC = 0.007

        # the size of the intermediate populations
        smallN = int(math.ceil(float(N) / d))

        # the maximum value of the vectors represented by the intermediate
        # populations. the vector is at most [maxinput maxinput], so the length
        # of that is sqrt(maxinput**2 + maxinput**2)
        maxlength = math.sqrt(2 * maxinput**2)

        if weights is not None and len(weights) != 2:
            print "Warning, other than 2 matrices given to eprod"

        if weights is None:
            weights = [MU.I(d), MU.I(d)]

        inputd = len(weights[0][0])

        ef = HRLutils.defaultEnsembleFactory()

        # create input populations
        in1 = ef.make("in1", 1, inputd)
        in1.addDecodedTermination("input", MU.I(inputd), 0.001, False)
        self.addNode(in1)
        in1.setMode(SimulationMode.DIRECT)  # since this is just a relay
        in1.fixMode()

        in2 = ef.make("in2", 1, inputd)
        if not oneDinput:
            in2.addDecodedTermination("input", MU.I(inputd), 0.001, False)
        else:
            # if it is a 1-D input we just expand it to a full vector of that
            # value so that we can treat it as an element-wise product
            in2.addDecodedTermination("input", [[1] for i in range(inputd)],
                                      0.001, False)
        self.addNode(in2)
        in2.setMode(SimulationMode.DIRECT)  # since this is just a relay
        in2.fixMode()

        # ensemble for intermediate populations
        multef = NEFEnsembleFactoryImpl()
        multef.nodeFactory.tauRC = 0.05
        multef.nodeFactory.tauRef = 0.002
        multef.nodeFactory.maxRate = IndicatorPDF(200, 500)
        multef.nodeFactory.intercept = IndicatorPDF(-1, 1)
        multef.encoderFactory = vectorgenerators.MultiplicationVectorGenerator(
        )
        multef.beQuiet()

        result = ef.make("result", 1, d)
        result.setMode(SimulationMode.DIRECT)  # since this is just a relay
        result.fixMode()
        self.addNode(result)

        resultTerm = [[0] for _ in range(d)]
        zeros = [0 for _ in range(inputd)]

        for e in range(d):
            # create a 2D population for each input dimension which will
            # combine the components from one dimension of each of the input
            # populations
            mpop = multef.make('mpop_' + str(e), smallN, 2)

            # make two connections that will select one component from each
            # of the input pops
            # we divide by maxlength to ensure that the maximum length of the
            # 2D vector is 1
            # remember that (for some reason) the convention in Nengo is that
            # the input matrices are the transpose of what they would be
            # mathematically
            mpop.addDecodedTermination('a',
                                       [[(1.0 / maxlength) * weights[0][e][i]
                                         for i in range(inputd)], zeros],
                                       tauPSC, False)
            mpop.addDecodedTermination('b', [
                zeros,
                [(1.0 / maxlength) * weights[1][e][i] for i in range(inputd)]
            ], tauPSC, False)

            # multiply the two selected components together
            mpop.addDecodedOrigin("output", [PostfixFunction('x0*x1', 2)],
                                  "AXON")

            self.addNode(mpop)
            self.addProjection(in1.getOrigin('X'), mpop.getTermination('a'))
            self.addProjection(in2.getOrigin('X'), mpop.getTermination('b'))

            # combine the 1D results back into one vector.
            # we scaled each input by 1/maxlength, then multiplied them
            # together for a total scale of 1/maxlength**2, so to undo we
            # multiply by maxlength**2
            resultTerm[e] = [maxlength**2 * scale]
            result.addDecodedTermination('in_' + str(e), resultTerm, 0.001,
                                         False)
            resultTerm[e] = [0]

            self.addProjection(mpop.getOrigin('output'),
                               result.getTermination('in_' + str(e)))

        self.exposeTermination(in1.getTermination("input"), "A")
        self.exposeTermination(in2.getTermination("input"), "B")
        self.exposeOrigin(result.getOrigin("X"), "X")
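
The scaling in the network above can be checked with plain arithmetic: each
selected pair of components is divided by maxlength before being multiplied,
and the result termination multiplies by maxlength**2 * scale to undo that.
A small numeric sketch (plain Python, no Nengo):

import math

maxinput = 2.0
scale = 0.5
maxlength = math.sqrt(2 * maxinput ** 2)

a, b = 1.5, -0.8  # one component taken from each input vector

# what one intermediate 2-D population effectively computes
product = (a / maxlength) * (b / maxlength)

# the result termination undoes the 1/maxlength**2 factor and applies scale
recovered = product * maxlength ** 2 * scale

assert abs(recovered - a * b * scale) < 1e-12
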
Example #53
0
def gen_evalpoints(filename, seed=None):
    """Runs an environment for some length of time and records state values,
    to be used as eval points for agent initialization.

    :param filename: name of file in which to save eval points
    :param seed: random seed
    """

    if seed is not None:
        HRLutils.set_seed(seed)
    seed = HRLutils.SEED

    net = nef.Network("gen_evalpoints")

    contextD = 2
    actions = [("up", [0, 1]), ("right", [1, 0]), ("down", [0, -1]),
               ("left", [-1, 0])]

    rewards = {"a": 1, "b": 1}

    env = contextenvironment.ContextEnvironment(
        actions,
        HRLutils.datafile("contextmap.bmp"),
        contextD,
        rewards,
        imgsize=(5, 5),
        dx=0.001,
        placedev=0.5,
        colormap={
            -16777216: "wall",
            -1: "floor",
            -256: "a",
            -2088896: "b"
        })

    net.add(env)

    stateD = len(env.placecells) + contextD
    actions = env.actions
    actionD = len(actions)

    class EvalRecorder(nef.SimpleNode):
        def __init__(self, evalfile):
            self.action = actions[0]
            self.evalpoints = []
            self.evalfile = evalfile

            nef.SimpleNode.__init__(self, "EvalRecorder")

        def tick(self):
            if self.t % 0.1 < 0.001:
                self.evalpoints += [self.state]

            if self.t % 10.0 < 0.001:
                if len(self.evalpoints) > 10000:
                    self.evalpoints = self.evalpoints[len(self.evalpoints) -
                                                      10000:]

                with open(self.evalfile, "w") as f:
                    f.write("\n".join([
                        " ".join([str(x) for x in e]) for e in self.evalpoints
                    ]))

        def termination_state(self, x, dimensions=stateD):
            self.state = x

        def termination_action_in(self, x, dimensions=actionD):
            self.action = actions[x.index(max(x))]

        def origin_action_out(self):
            return self.action[1]

    em = EvalRecorder(HRLutils.datafile("%s_%s.txt" % (filename, seed)))
    net.add(em)

    net.connect(em.getOrigin("action_out"), env.getTermination("action"))
    net.connect(env.getOrigin("optimal_move"), em.getTermination("action_in"))
    net.connect(env.getOrigin("placewcontext"), em.getTermination("state"))

    #     net.add_to_nengo()
    net.run(10)
Example #54
0
    def __init__(self, discount, rewardradius=1.0, Qradius=1.0):
        """Builds the ErrorCalc2 network.

        :param discount: discount factor, controls rate of integration
        :param rewardradius: expected radius of reward value
        :param Qradius: expected radius of Q values
        """

        self.name = "ErrorCalc"
        net = nef.Network(self, seed=HRLutils.SEED, quick=False)

        tauPSC = 0.007
        intPSC = 0.1
        N = 50

        # relay for current Q input
        currQ = net.make("currQ", 1, 1, node_factory=HRLutils.node_fac(),
                         mode="direct", radius=Qradius)
        currQ.fixMode()
        currQ.addDecodedTermination("input", [[1]], 0.001, False)

        # input population for resetting the network
        reset_nodefac = HRLutils.node_fac()
        reset_nodefac.setIntercept(IndicatorPDF(0.3, 1.0))
        reset = net.make("reset", N, 1, encoders=[[1]],
                         node_factory=reset_nodefac)
        reset.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])
        # this population will begin outputting a value once the reset
        # signal exceeds the threshold, and that output will then be used to
        # reset the rest of the network

        reset.addDecodedTermination("input", [[1]], tauPSC, False)

        # relay for stored previous value of Q
        storeQ = net.make("storeQ", 1, 1, node_factory=HRLutils.node_fac(),
                          mode="direct", radius=Qradius)
        storeQ.fixMode()
        storeQ.addDecodedTermination("input", [[1]], 0.001, False)

        # calculate "discount" by integrating the output of storeQ
        acc_storeQ = memory.Memory("acc_storeQ", N * 8, 1, inputscale=50)
        net.add(acc_storeQ)

        zero_input = net.make_input("zero_input", [0])

        net.connect(zero_input, acc_storeQ.getTermination("target"))
        net.connect(reset, acc_storeQ.getTermination("transfer"))

        # threshold the storeQ value so it won't go below zero. that is, if
        # we have negative Q values, we don't want to have a negative
        # discount, or that will just drive the highest (negative) Q value
        # upwards, and it will always be selected. negative Q values are
        # instead pushed upwards by the PositiveBias mechanism.
        Qthresh = net.make("Qthresh",
                           N * 2,
                           1,
                           encoders=[[1]],
                           eval_points=[[x * 0.001] for x in range(1000)],
                           radius=Qradius,
                           intercept=(0, 1))
        net.connect(storeQ, Qthresh, pstc=tauPSC)
        net.connect(Qthresh,
                    acc_storeQ,
                    pstc=intPSC,
                    transform=[[discount * intPSC]],
                    func=lambda x: max(x[0], 0.0))

        # accumulate reward
        reward = memory.Memory("reward", N * 4, 1, radius=rewardradius,
                               inputscale=50)
        net.add(reward)

        reward.addDecodedTermination("input", [[intPSC]], intPSC, False)

        net.connect(zero_input, reward.getTermination("target"))
        net.connect(reset, reward.getTermination("transfer"))

        # put reward, currQ, storeQ, and discount together to calculate error
        error = net.make("error", N * 2, 1, node_factory=HRLutils.node_fac())

        net.connect(currQ, error, pstc=tauPSC)
        net.connect(reward, error, pstc=tauPSC)
        net.connect(storeQ, error, pstc=tauPSC, transform=[[-1]])
        net.connect(acc_storeQ, error, pstc=tauPSC, transform=[[-1]])

        self.exposeTermination(reward.getTermination("input"), "reward")
        self.exposeTermination(reset.getTermination("input"), "reset")
        self.exposeTermination(currQ.getTermination("input"), "currQ")
        self.exposeTermination(storeQ.getTermination("input"), "storeQ")
        self.exposeOrigin(error.getOrigin("X"), "X")
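
Ignoring neural dynamics, the network above computes something like
error = currQ + accumulated reward - storeQ - discount * integral of
max(storeQ, 0), with the two accumulators reset at the start of each new
action. A rough discrete-time sketch of that calculation (plain Python; dt
plays the role of intPSC and all names here are illustrative):

def td_error(curr_q, store_q, rewards, discount, dt=0.001):
    # accumulate the reward and the discount term over the duration of the
    # (extended) action, roughly what the two Memory populations do
    acc_reward = sum(r * dt for r in rewards)
    acc_discount = sum(discount * max(store_q, 0.0) * dt for _ in rewards)
    return curr_q + acc_reward - store_q - acc_discount

# toy example: constant reward of 1.0 for 100 steps of 1 ms
err = td_error(curr_q=0.5, store_q=0.4, rewards=[1.0] * 100, discount=0.3)
# err ~= 0.5 + 0.1 - 0.4 - 0.3 * 0.4 * 0.1 = 0.188
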
Example #55
0
    def __init__(self, name, N, d, scale=1.0, weights=None, maxinput=1.0, oneDinput=False):
        # scale is a scale on the output of the multiplication
        # output = (input1.*input2)*scale

        # weights are optional matrices applied to each input
        # output = (C1*input1 .* C2*input2)*scale

        # maxinput is the maximum expected value of any dimension of the
        # inputs. this is used to scale the inputs internally so that the
        # length of the vectors in the intermediate populations are not
        # too small (which results in a lot of noise in the calculations)

        # oneDinput indicates that the second input is one dimensional, and is
        # just a scale on the first input rather than an element-wise product

        self.name = name
        tauPSC = 0.007

        # the size of the intermediate populations
        smallN = int(math.ceil(float(N) / d))

        # the maximum value of the vectors represented by the intermediate
        # populations. the vector is at most [maxinput maxinput], so the length
        # of that is sqrt(maxinput**2 + maxinput**2)
        maxlength = math.sqrt(2 * maxinput ** 2)

        if weights is not None and len(weights) != 2:
            print "Warning, other than 2 matrices given to eprod"

        if weights is None:
            weights = [MU.I(d), MU.I(d)]

        inputd = len(weights[0][0])

        ef = HRLutils.defaultEnsembleFactory()

        # create input populations
        in1 = ef.make("in1", 1, inputd)
        in1.addDecodedTermination("input", MU.I(inputd), 0.001, False)
        self.addNode(in1)
        in1.setMode(SimulationMode.DIRECT)  # since this is just a relay
        in1.fixMode()

        in2 = ef.make("in2", 1, inputd)
        if not oneDinput:
            in2.addDecodedTermination("input", MU.I(inputd), 0.001, False)
        else:
            # if it is a 1-D input we just expand it to a full vector of that
            # value so that we can treat it as an element-wise product
            in2.addDecodedTermination("input", [[1] for i in range(inputd)], 0.001, False)
        self.addNode(in2)
        in2.setMode(SimulationMode.DIRECT)  # since this is just a relay
        in2.fixMode()

        # ensemble for intermediate populations
        multef = NEFEnsembleFactoryImpl()
        multef.nodeFactory.tauRC = 0.05
        multef.nodeFactory.tauRef = 0.002
        multef.nodeFactory.maxRate = IndicatorPDF(200, 500)
        multef.nodeFactory.intercept = IndicatorPDF(-1, 1)
        multef.encoderFactory = vectorgenerators.MultiplicationVectorGenerator()
        multef.beQuiet()

        result = ef.make("result", 1, d)
        result.setMode(SimulationMode.DIRECT)  # since this is just a relay
        result.fixMode()
        self.addNode(result)

        resultTerm = [[0] for _ in range(d)]
        zeros = [0 for _ in range(inputd)]

        for e in range(d):
            # create a 2D population for each input dimension which will
            # combine the components from one dimension of each of the input
            # populations
            mpop = multef.make("mpop_" + str(e), smallN, 2)

            # make two connections that will select one component from each
            # of the input pops
            # we divide by maxlength to ensure that the maximum length of the
            # 2D vector is 1
            # remember that (for some reason) the convention in Nengo is that
            # the input matrices are the transpose of what they would be
            # mathematically
            mpop.addDecodedTermination(
                "a", [[(1.0 / maxlength) * weights[0][e][i] for i in range(inputd)], zeros], tauPSC, False
            )
            mpop.addDecodedTermination(
                "b", [zeros, [(1.0 / maxlength) * weights[1][e][i] for i in range(inputd)]], tauPSC, False
            )

            # multiply the two selected components together
            mpop.addDecodedOrigin("output", [PostfixFunction("x0*x1", 2)], "AXON")

            self.addNode(mpop)
            self.addProjection(in1.getOrigin("X"), mpop.getTermination("a"))
            self.addProjection(in2.getOrigin("X"), mpop.getTermination("b"))

            # combine the 1D results back into one vector.
            # we scaled each input by 1/maxlength, then multiplied them
            # together for a total scale of 1/maxlength**2, so to undo we
            # multiply by maxlength**2
            resultTerm[e] = [maxlength ** 2 * scale]
            result.addDecodedTermination("in_" + str(e), resultTerm, 0.001, False)
            resultTerm[e] = [0]

            self.addProjection(mpop.getOrigin("output"), result.getTermination("in_" + str(e)))

        self.exposeTermination(in1.getTermination("input"), "A")
        self.exposeTermination(in2.getTermination("input"), "B")
        self.exposeOrigin(result.getOrigin("X"), "X")
Example #56
0
    def tick(self):
        if self.t > self.updatetime:
            self.scale = [0.0 for _ in range(self.action_dimension)]
            # note: the least visited direction could also be found by
            # checking all the tiles and weighting them by their last visit
            # time, instead of just looking for the minimum
            min_list = []
            min_val = self.state_visited[0][0]

            # find the global minimums of the last-visit times, O(n) over
            # the whole grid
            for i in range(len(self.state_visited)):
                for j in range(len(self.state_visited[i])):
                    if self.state_visited[i][j] == min_val:
                        min_list.append([i, j])
                    elif self.state_visited[i][j] < min_val:
                        min_list = []
                        min_val = self.state_visited[i][j]
                        min_list.append([i, j])

            # take the average of the minimum vectors (O(n)) to get a single
            # least-visited target location
            total = [0.0 for _ in range(self.grid_dimension)]
            for val in min_list:
                total[0] += val[0] - self.xoffset
                total[1] += val[1] - self.yoffset
            # average over the number of minima found
            least_visited = [total[0] / len(min_list),
                             total[1] / len(min_list)]

            # find the minimum closest to the agent; its distance is used to
            # weight the noise boost below
            closest_min = self.agent_state
            min_state_dist = HRLutils.distance(self.agent_state, min_list[0])
            for min_loc in range(len(min_list)):
                state_dist = HRLutils.distance(self.agent_state,
                                               min_list[min_loc])
                if state_dist < min_state_dist:
                    closest_min = min_list[min_loc]
                    min_state_dist = state_dist

            least_visited[0] += self.xoffset
            least_visited[1] += self.yoffset
            state_diff = HRLutils.difference(self.agent_state, least_visited)
            # state_diff catches the corner case where the least visited tile
            # is the one the agent is already on (no boost in that direction)

            # the noise boost encourages the agent to move towards nearby
            # tiles that have not been visited in a long time: it grows with
            # the time since the last visit and shrinks with distance
            hor_min_dist = abs(self.agent_state[0] - closest_min[0])
            hor_noise_boost = (
                (self.t - min_val) * self.time_constant
                + (1 / (1 + hor_min_dist)) * self.distance_constant
            ) * (state_diff[0] != 0)

            vert_min_dist = abs(self.agent_state[1] - closest_min[1])
            vert_noise_boost = (
                (self.t - min_val) * self.time_constant
                + (1 / (1 + vert_min_dist)) * self.distance_constant
            ) * (state_diff[1] != 0)

            # map the state difference onto the action dimensions
            # (up, right, down, left)
            if state_diff[1] > 0:
                print("boost left")
                self.scale[3] = hor_noise_boost
            elif state_diff[1] < 0:
                print("boost right")
                self.scale[1] = hor_noise_boost

            if state_diff[0] < 0:
                print("boost down")
                self.scale[2] = vert_noise_boost
            elif state_diff[0] > 0:
                print("boost up")
                self.scale[0] = vert_noise_boost

            print("Current state %s" % self.agent_state)
            print("least_visited %s" % least_visited)
            print("state_diff %s" % state_diff)
            print("scale: %s" % self.scale)
            # pdb.set_trace()
            self.state = [self.pdf.sample()[0] * self.scale[i]
                          for i in range(len(self.state))]
            self.updatetime = self.t + self.period
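
On a tiny grid, the minimum-finding and averaging steps above behave as
follows (plain Python, toy last-visit times; offsets omitted):

state_visited = [[0.0, 3.0],
                 [0.0, 5.0]]  # tiles (0, 0) and (1, 0) are least visited

min_val = min(min(row) for row in state_visited)
min_list = [[i, j] for i, row in enumerate(state_visited)
            for j, v in enumerate(row) if v == min_val]

least_visited = [sum(p[0] for p in min_list) / float(len(min_list)),
                 sum(p[1] for p in min_list) / float(len(min_list))]

assert min_list == [[0, 0], [1, 0]]
assert least_visited == [0.5, 0.0]
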
Example #57
0
def gen_evalpoints(filename, seed=None):
    """Runs an environment for some length of time and records state values,
    to be used as eval points for agent initialization.

    :param filename: name of file in which to save eval points
    :param seed: random seed
    """

    if seed is not None:
        HRLutils.set_seed(seed)
    seed = HRLutils.SEED

    net = nef.Network("gen_evalpoints")

    contextD = 2
    actions = [("up", [0, 1]), ("right", [1, 0]),
               ("down", [0, -1]), ("left", [-1, 0])]

    rewards = {"a": 1, "b": 1}

    env = contextenvironment.ContextEnvironment(
        actions, HRLutils.datafile("contextmap.bmp"), contextD, rewards,
        imgsize=(5, 5), dx=0.001, placedev=0.5,
        colormap={-16777216: "wall", -1: "floor", -256: "a", -2088896: "b"})

    net.add(env)

    stateD = len(env.placecells) + contextD
    actions = env.actions
    actionD = len(actions)

    class EvalRecorder(nef.SimpleNode):
        def __init__(self, evalfile):
            self.action = actions[0]
            self.evalpoints = []
            self.evalfile = evalfile

            nef.SimpleNode.__init__(self, "EvalRecorder")

        def tick(self):
            if self.t % 0.1 < 0.001:
                self.evalpoints += [self.state]

            if self.t % 10.0 < 0.001:
                if len(self.evalpoints) > 10000:
                    self.evalpoints = self.evalpoints[len(self.evalpoints) -
                                                      10000:]

                with open(self.evalfile, "w") as f:
                    f.write("\n".join([" ".join([str(x) for x in e])
                                       for e in self.evalpoints]))

        def termination_state(self, x, dimensions=stateD):
            self.state = x

        def termination_action_in(self, x, dimensions=actionD):
            self.action = actions[x.index(max(x))]

        def origin_action_out(self):
            return self.action[1]

    em = EvalRecorder(HRLutils.datafile("%s_%s.txt" % (filename, seed)))
    net.add(em)

    net.connect(em.getOrigin("action_out"), env.getTermination("action"))
    net.connect(env.getOrigin("optimal_move"), em.getTermination("action_in"))
    net.connect(env.getOrigin("placewcontext"), em.getTermination("state"))

#     net.add_to_nengo()
    net.run(10)