Example #1
File: memory.py  Project: Seanny123/HRL_1.0
    def addDecodedOrigin(self, name, funcs, origin):
        net = nef.Network(self)

        o = self.getNode("storage").addDecodedOrigin(name, funcs, origin)

        #undo radius scaling
        funcout = net.make(name, 1, self.dimension, mode="direct")
        funcout.fixMode()
        net.connect(o, funcout, pstc=0.001, transform=MU.diag([self.radius for _ in range(self.dimension)]))

        self.exposeOrigin(funcout.getOrigin("X"), name)
        return self.getOrigin(name)
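A minimal sketch of how this method might be called (all names here are hypothetical; ConstantFunction is the Nengo 1.4 class already used in the QNetwork examples further down, and stands in for whatever decoding functions are actually wanted):

# add a constant-valued decoded origin to a 2-D Memory network; the origin is
# created on the underlying "storage" ensemble and exposed on the Memory
# network under the same name
mem = memory.Memory("working_memory", 100, 2)
readout = mem.addDecodedOrigin(
    "readout", [ConstantFunction(2, 0.5) for _ in range(2)], "AXON")

# it can then be connected like any other origin, e.g.
# net.connect(mem.getOrigin("readout"), some_termination)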
Example #2
    def __init__(self, name, N, d, radius=1.0, inputscale=1.0, recurweight=1.0,
                 direct_storage=False):
        """Builds the Memory network.

        :param name: name of network
        :param N: base number of neurons
        :param d: dimension of stored value
        :param radius: radius of stored value
        :param inputscale: controls how fast the stored value moves to the
            target
        :param recurweight: controls the preservation of the stored value
        :param direct_storage: if True, use direct mode for the memory storage
        """

        self.name = name
        net = nef.Network(self, seed=HRLutils.SEED, quick=False)
        self.dimension = d
        self.radius = radius

        tauPSC = 0.007
        intPSC = 0.1

        # population that will store the value
        if not direct_storage:
            storage = net.make_array("storage", N, d,
                                     node_factory=HRLutils.node_fac(),
                                     eval_points=[[x * 0.001]
                                                  for x in range(-1000, 1000)])
        else:
            storage = net.make("storage", 1, d, mode="direct")
            storage.fixMode()

        net.connect(storage, storage, transform=MU.diag([recurweight
                                                         for _ in range(d)]),
                    pstc=intPSC)

        # storageinput will represent (target - stored_value), which when used
        # as input to storage will drive the stored value to target
        storageinput = net.make_array("storageinput", N, d,
                                      node_factory=HRLutils.node_fac())
        storageinput.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

        storageinput.addDecodedTermination("target",
                                           MU.diag([1.0 / radius
                                                    for _ in range(d)]),
                                           tauPSC, False)
        # note: store everything in -1 -- 1 range by dividing by radius

        # scale storageinput value by inputscale to control rate at which
        # it moves to the target
        net.connect(storageinput, storage, pstc=intPSC,
                    transform=MU.diag([inputscale * intPSC for _ in range(d)]))

        # subtract currently stored value
        net.connect(storage, storageinput, pstc=tauPSC,
                    transform=MU.diag([-1 for _ in range(d)]))

        # we want to open the input gate when the transfer signal arrives (to
        # transfer storageinput to storage). using a double inhibition setup
        # (rather than just feeding it e.g. the inverse of the transfer
        # signal) so that we get a nice clean zero

        # this inhibits the storageinput population (to block input to the
        # storage)
        transferinhib = net.make("transferinhib", N, 1,
                                 node_factory=HRLutils.node_fac())
        transferinhib.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

        transferinhib.addTermination("gate",
                                     [[-10] for _ in
                                      range(transferinhib.getNeurons())],
                                     tauPSC, False)

        net.connect(transferinhib, storageinput, pstc=tauPSC,
                    transform=[[-10] for _ in
                               range(storageinput.getNeurons())])

        # this drives the transferinhib population (so that by default it will
        # block any input). inhibiting transferinhib will thus remove the
        # inhibition on storageinput, and change the stored value
        biasinput = net.make_input("biasinput", [1])

        net.connect(biasinput, transferinhib, pstc=tauPSC)

        # output population (to undo radius scaling)
        storageoutput = net.make("storageoutput", 1, d, mode="direct")
        storageoutput.fixMode()
        net.connect(storage, storageoutput, pstc=0.001,
                    transform=MU.diag([radius for _ in range(d)]))

        self.exposeTermination(transferinhib.getTermination("gate"),
                               "transfer")
        self.exposeTermination(storageinput.getTermination("target"), "target")
        self.exposeOrigin(storageoutput.getOrigin("X"), "X")
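A minimal wiring sketch for the Memory network above, patterned on how Example #5 connects its saved_action and saved_values memories (the network names and input values here are hypothetical):

import nef

net = nef.Network("memory_demo")

mem = memory.Memory("mem", 100, 2, radius=1.0, inputscale=50)
net.add(mem)

# value to be stored plus a gate signal: drive "transfer" to ~1 to load the
# current target into storage, leave it at 0 to hold the stored value
target = net.make_input("target", [0.5, -0.3])
gate = net.make_input("gate", [0])

net.connect(target, mem.getTermination("target"))
net.connect(gate, mem.getTermination("transfer"))

# the stored value (with radius scaling undone) is available on the exposed
# "X" origin, e.g. net.connect(mem.getOrigin("X"), ...)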
Example #3
def run_badreenvironment(nav_args,
                         ctrl_args,
                         bias=0.0,
                         seed=None,
                         flat=False,
                         label="tmp"):
    """Runs the model on the Badre et al. (2010) task."""

    if seed is not None:
        HRLutils.set_seed(seed)
    seed = HRLutils.SEED

    net = nef.Network("run_badreenvironment")

    env = badreenvironment.BadreEnvironment(flat=flat)
    net.add(env)

    # ##NAV AGENT
    stateN = 500
    max_state_input = 3
    enc = env.gen_encoders(stateN, 0, 0.0)

    # generate evaluation points
    orientations = MU.I(env.num_orientations)
    shapes = MU.I(env.num_shapes)
    colours = MU.I(env.num_colours)
    evals = (
        list(MU.diag([3 for _ in range(env.stateD)])) +
        [o + s + c for o in orientations for s in shapes for c in colours])

    # create lower level
    nav_agent = smdpagent.SMDPAgent(stateN,
                                    env.stateD,
                                    env.actions,
                                    name="NavAgent",
                                    stateradius=max_state_input,
                                    state_encoders=enc,
                                    state_evals=evals,
                                    discount=0.5,
                                    **nav_args)
    net.add(nav_agent)

    print "agent neurons:", nav_agent.countNeurons()

    # actions terminate on fixed schedule (aligned with environment)
    nav_term_node = terminationnode.TerminationNode(
        {terminationnode.Timer((0.6, 0.6)): None},
        env,
        name="NavTermNode",
        state_delay=0.1,
        reset_delay=0.05,
        reset_interval=0.1)
    net.add(nav_term_node)

    net.connect(nav_term_node.getOrigin("reset"),
                nav_agent.getTermination("reset"))
    net.connect(nav_term_node.getOrigin("learn"),
                nav_agent.getTermination("learn"))
    net.connect(nav_term_node.getOrigin("reset"),
                nav_agent.getTermination("save_state"))
    net.connect(nav_term_node.getOrigin("reset"),
                nav_agent.getTermination("save_action"))

    net.connect(nav_agent.getOrigin("action_output"),
                env.getTermination("action"))

    # ##CTRL AGENT
    stateN = 500
    enc = RandomHypersphereVG().genVectors(stateN, env.stateD)
    actions = [("shape", [0, 1]), ("orientation", [1, 0]), ("null", [0, 0])]
    ctrl_agent = smdpagent.SMDPAgent(stateN,
                                     env.stateD,
                                     actions,
                                     name="CtrlAgent",
                                     state_encoders=enc,
                                     stateradius=max_state_input,
                                     state_evals=evals,
                                     discount=0.4,
                                     **ctrl_args)
    net.add(ctrl_agent)

    print "agent neurons:", ctrl_agent.countNeurons()

    net.connect(env.getOrigin("state"),
                ctrl_agent.getTermination("state_input"))

    ctrl_term_node = terminationnode.TerminationNode(
        {terminationnode.Timer((0.6, 0.6)): None},
        env,
        name="CtrlTermNode",
        state_delay=0.1,
        reset_delay=0.05,
        reset_interval=0.1)
    net.add(ctrl_term_node)

    net.connect(ctrl_term_node.getOrigin("reset"),
                ctrl_agent.getTermination("reset"))
    net.connect(ctrl_term_node.getOrigin("learn"),
                ctrl_agent.getTermination("learn"))
    net.connect(ctrl_term_node.getOrigin("reset"),
                ctrl_agent.getTermination("save_state"))
    net.connect(ctrl_term_node.getOrigin("reset"),
                ctrl_agent.getTermination("save_action"))

    # ctrl gets a slight bonus if it selects a rule (as opposed to null), to
    # encourage it to not just pick null all the time
    reward_relay = net.make("reward_relay", 1, 3, mode="direct")
    reward_relay.fixMode()
    net.connect(env.getOrigin("reward"),
                reward_relay,
                transform=[[1], [0], [0]])
    net.connect(ctrl_agent.getOrigin("action_output"),
                reward_relay,
                transform=[[0, 0], [1, 0], [0, 1]])

    net.connect(reward_relay,
                ctrl_agent.getTermination("reward"),
                func=lambda x: ((x[0] + bias * abs(x[0]))
                                if x[1] + x[2] > 0.5 else x[0]),
                origin_name="ctrl_reward")

    # ideal reward function (for testing)
    #     def ctrl_reward_func(x):
    #         if abs(x[0]) < 0.5:
    #             return 0.0
    #
    #         if flat:
    #             return 1.5 if x[1] + x[2] < 0.5 else -1.5
    #         else:
    #             if x[1] + x[2] < 0.5:
    #                 return -1.5
    #             if [round(a) for a in env.state[-2:]] == [round(b)
    #                                                       for b in x[1:]]:
    #                 return 1.5
    #             else:
    #                 return -1.5
    #     net.connect(reward_relay, ctrl_agent.getTermination("reward"),
    #                 func=ctrl_reward_func)

    # nav rewarded for picking ctrl target
    def nav_reward_func(x):
        if abs(x[0]) < 0.5 or env.action is None:
            return 0.0

        if x[1] + x[2] < 0.5:
            return x[0]

        if x[1] > x[2]:
            return (1.5 if env.action[1] == env.state[:env.num_orientations]
                    else -1.5)
        else:
            return (1.5 if env.action[1]
                    == env.state[env.num_orientations:-env.num_colours] else
                    -1.5)

    net.connect(reward_relay,
                nav_agent.getTermination("reward"),
                func=nav_reward_func)

    # state for navagent controlled by ctrlagent
    ctrl_state_inhib = net.make_array("ctrl_state_inhib",
                                      50,
                                      env.stateD,
                                      radius=2,
                                      mode=HRLutils.SIMULATION_MODE)
    ctrl_state_inhib.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

    inhib_matrix = [[0, -5]] * 50 * env.num_orientations + \
                   [[-5, 0]] * 50 * env.num_shapes + \
                   [[-5, -5]] * 50 * env.num_colours

    # ctrl output inhibits all the non-selected aspects of the state
    net.connect(env.getOrigin("state"), ctrl_state_inhib)
    net.connect(ctrl_agent.getOrigin("action_output"),
                ctrl_state_inhib,
                transform=inhib_matrix)

    # also give a boost to the selected aspects (so that neurons are roughly
    # equally activated).
    def boost_func(x):
        if x[0] > 0.5:
            return [3 * v for v in x[1:]]
        else:
            return x[1:]

    boost = net.make("boost", 1, 1 + env.stateD, mode="direct")
    boost.fixMode()
    net.connect(ctrl_state_inhib,
                boost,
                transform=([[0 for _ in range(env.stateD)]] +
                           list(MU.I(env.stateD))))
    net.connect(ctrl_agent.getOrigin("action_output"),
                boost,
                transform=[[1, 1]] + [[0, 0] for _ in range(env.stateD)])

    net.connect(boost,
                nav_agent.getTermination("state_input"),
                func=boost_func)

    # save weights
    weight_save = 1.0  # period to save weights (realtime, not simulation time)
    threads = [
        HRLutils.WeightSaveThread(
            nav_agent.getNode("QNetwork").saveParams,
            os.path.join("weights", "%s_%s" % (nav_agent.name, seed)),
            weight_save),
        HRLutils.WeightSaveThread(
            ctrl_agent.getNode("QNetwork").saveParams,
            os.path.join("weights", "%s_%s" % (ctrl_agent.name, seed)),
            weight_save)
    ]
    for t in threads:
        t.start()

    # data collection node
    data = datanode.DataNode(
        period=1,
        filename=HRLutils.datafile("dataoutput_%s.txt" % label),
        header="%s %s %s %s %s" % (nav_args, ctrl_args, bias, seed, flat))
    print "saving data to", data.filename
    print "header", data.header
    net.add(data)
    nav_q = nav_agent.getNode("QNetwork")
    ctrl_q = ctrl_agent.getNode("QNetwork")
    ctrl_bg = ctrl_agent.getNode("BGNetwork").getNode("weight_actions")
    data.record_avg(env.getOrigin("reward"))
    data.record_avg(ctrl_q.getNode("actionvals").getOrigin("X"))
    data.record_sparsity(ctrl_q.getNode("state_pop").getOrigin("AXON"))
    data.record_sparsity(nav_q.getNode("state_pop").getOrigin("AXON"))
    data.record_avg(ctrl_q.getNode("valdiff").getOrigin("X"))
    data.record_avg(ctrl_agent.getNode("ErrorNetwork").getOrigin("error"))
    data.record_avg(ctrl_bg.getNode("0").getOrigin("AXON"))
    data.record_avg(ctrl_bg.getNode("1").getOrigin("AXON"))
    data.record(env.getOrigin("score"))

    #     net.add_to_nengo()
    #     net.network.simulator.run(0, 300, 0.001)
    net.view()

    for t in threads:
        t.stop()
Example #4
File: run.py  Project: drasmuss/nhrlmodel
def run_badreenvironment(nav_args, ctrl_args, bias=0.0, seed=None, flat=False,
                         label="tmp"):
    """Runs the model on the Badre et al. (2010) task."""

    if seed is not None:
        HRLutils.set_seed(seed)
    seed = HRLutils.SEED

    net = nef.Network("run_badreenvironment")

    env = badreenvironment.BadreEnvironment(flat=flat)
    net.add(env)

    # ##NAV AGENT
    stateN = 500
    max_state_input = 3
    enc = env.gen_encoders(stateN, 0, 0.0)

    # generate evaluation points
    orientations = MU.I(env.num_orientations)
    shapes = MU.I(env.num_shapes)
    colours = MU.I(env.num_colours)
    evals = (list(MU.diag([3 for _ in range(env.stateD)])) +
             [o + s + c
              for o in orientations for s in shapes for c in colours])

    # create lower level
    nav_agent = smdpagent.SMDPAgent(stateN, env.stateD, env.actions,
                                    name="NavAgent",
                                    stateradius=max_state_input,
                                    state_encoders=enc, state_evals=evals,
                                    discount=0.5, **nav_args)
    net.add(nav_agent)

    print "agent neurons:", nav_agent.countNeurons()

    # actions terminate on fixed schedule (aligned with environment)
    nav_term_node = terminationnode.TerminationNode(
        {terminationnode.Timer((0.6, 0.6)): None}, env, name="NavTermNode",
        state_delay=0.1, reset_delay=0.05, reset_interval=0.1)
    net.add(nav_term_node)

    net.connect(nav_term_node.getOrigin("reset"),
                nav_agent.getTermination("reset"))
    net.connect(nav_term_node.getOrigin("learn"),
                nav_agent.getTermination("learn"))
    net.connect(nav_term_node.getOrigin("reset"),
                nav_agent.getTermination("save_state"))
    net.connect(nav_term_node.getOrigin("reset"),
                nav_agent.getTermination("save_action"))

    net.connect(nav_agent.getOrigin("action_output"),
                env.getTermination("action"))

    # ##CTRL AGENT
    stateN = 500
    enc = RandomHypersphereVG().genVectors(stateN, env.stateD)
    actions = [("shape", [0, 1]), ("orientation", [1, 0]), ("null", [0, 0])]
    ctrl_agent = smdpagent.SMDPAgent(stateN, env.stateD, actions,
                                     name="CtrlAgent", state_encoders=enc,
                                     stateradius=max_state_input,
                                     state_evals=evals, discount=0.4,
                                     **ctrl_args)
    net.add(ctrl_agent)

    print "agent neurons:", ctrl_agent.countNeurons()

    net.connect(env.getOrigin("state"),
                ctrl_agent.getTermination("state_input"))

    ctrl_term_node = terminationnode.TerminationNode(
        {terminationnode.Timer((0.6, 0.6)): None}, env, name="CtrlTermNode",
        state_delay=0.1, reset_delay=0.05, reset_interval=0.1)
    net.add(ctrl_term_node)

    net.connect(ctrl_term_node.getOrigin("reset"),
                ctrl_agent.getTermination("reset"))
    net.connect(ctrl_term_node.getOrigin("learn"),
                ctrl_agent.getTermination("learn"))
    net.connect(ctrl_term_node.getOrigin("reset"),
                ctrl_agent.getTermination("save_state"))
    net.connect(ctrl_term_node.getOrigin("reset"),
                ctrl_agent.getTermination("save_action"))

    # ctrl gets a slight bonus if it selects a rule (as opposed to null), to
    # encourage it to not just pick null all the time
    reward_relay = net.make("reward_relay", 1, 3, mode="direct")
    reward_relay.fixMode()
    net.connect(env.getOrigin("reward"), reward_relay,
                transform=[[1], [0], [0]])
    net.connect(ctrl_agent.getOrigin("action_output"), reward_relay,
                transform=[[0, 0], [1, 0], [0, 1]])

    net.connect(reward_relay, ctrl_agent.getTermination("reward"),
                func=lambda x: ((x[0] + bias * abs(x[0]))
                                if x[1] + x[2] > 0.5 else x[0]),
                origin_name="ctrl_reward")

    # ideal reward function (for testing)
#     def ctrl_reward_func(x):
#         if abs(x[0]) < 0.5:
#             return 0.0
#
#         if flat:
#             return 1.5 if x[1] + x[2] < 0.5 else -1.5
#         else:
#             if x[1] + x[2] < 0.5:
#                 return -1.5
#             if [round(a) for a in env.state[-2:]] == [round(b)
#                                                       for b in x[1:]]:
#                 return 1.5
#             else:
#                 return -1.5
#     net.connect(reward_relay, ctrl_agent.getTermination("reward"),
#                 func=ctrl_reward_func)

    # nav rewarded for picking ctrl target
    def nav_reward_func(x):
        if abs(x[0]) < 0.5 or env.action is None:
            return 0.0

        if x[1] + x[2] < 0.5:
            return x[0]

        if x[1] > x[2]:
            return (1.5 if env.action[1] == env.state[:env.num_orientations]
                    else -1.5)
        else:
            return (1.5 if env.action[1] == env.state[env.num_orientations:
                                                      - env.num_colours]
                    else -1.5)
    net.connect(reward_relay, nav_agent.getTermination("reward"),
                func=nav_reward_func)

    # state for navagent controlled by ctrlagent
    ctrl_state_inhib = net.make_array("ctrl_state_inhib", 50, env.stateD,
                                      radius=2, mode=HRLutils.SIMULATION_MODE)
    ctrl_state_inhib.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

    inhib_matrix = [[0, -5]] * 50 * env.num_orientations + \
                   [[-5, 0]] * 50 * env.num_shapes + \
                   [[-5, -5]] * 50 * env.num_colours

    # ctrl output inhibits all the non-selected aspects of the state
    net.connect(env.getOrigin("state"), ctrl_state_inhib)
    net.connect(ctrl_agent.getOrigin("action_output"), ctrl_state_inhib,
                transform=inhib_matrix)

    # also give a boost to the selected aspects (so that neurons are roughly
    # equally activated).
    def boost_func(x):
        if x[0] > 0.5:
            return [3 * v for v in x[1:]]
        else:
            return x[1:]
    boost = net.make("boost", 1, 1 + env.stateD, mode="direct")
    boost.fixMode()
    net.connect(ctrl_state_inhib, boost,
                transform=([[0 for _ in range(env.stateD)]] +
                           list(MU.I(env.stateD))))
    net.connect(ctrl_agent.getOrigin("action_output"), boost,
                transform=[[1, 1]] + [[0, 0] for _ in range(env.stateD)])

    net.connect(boost, nav_agent.getTermination("state_input"),
                func=boost_func)

    # save weights
    weight_save = 1.0  # period to save weights (realtime, not simulation time)
    threads = [
        HRLutils.WeightSaveThread(nav_agent.getNode("QNetwork").saveParams,
                                  os.path.join("weights", "%s_%s" %
                                               (nav_agent.name, seed)),
                                  weight_save),
        HRLutils.WeightSaveThread(ctrl_agent.getNode("QNetwork").saveParams,
                                  os.path.join("weights", "%s_%s" %
                                               (ctrl_agent.name, seed)),
                                  weight_save)]
    for t in threads:
        t.start()

    # data collection node
    data = datanode.DataNode(period=1,
                             filename=HRLutils.datafile("dataoutput_%s.txt" %
                                                        label),
                             header="%s %s %s %s %s" % (nav_args, ctrl_args,
                                                        bias, seed, flat))
    print "saving data to", data.filename
    print "header", data.header
    net.add(data)
    nav_q = nav_agent.getNode("QNetwork")
    ctrl_q = ctrl_agent.getNode("QNetwork")
    ctrl_bg = ctrl_agent.getNode("BGNetwork").getNode("weight_actions")
    data.record_avg(env.getOrigin("reward"))
    data.record_avg(ctrl_q.getNode("actionvals").getOrigin("X"))
    data.record_sparsity(ctrl_q.getNode("state_pop").getOrigin("AXON"))
    data.record_sparsity(nav_q.getNode("state_pop").getOrigin("AXON"))
    data.record_avg(ctrl_q.getNode("valdiff").getOrigin("X"))
    data.record_avg(ctrl_agent.getNode("ErrorNetwork").getOrigin("error"))
    data.record_avg(ctrl_bg.getNode("0").getOrigin("AXON"))
    data.record_avg(ctrl_bg.getNode("1").getOrigin("AXON"))
    data.record(env.getOrigin("score"))

#     net.add_to_nengo()
#     net.network.simulator.run(0, 300, 0.001)
    net.view()

    for t in threads:
        t.stop()
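A hypothetical invocation of the function above. The keys inside nav_args and ctrl_args are simply forwarded to SMDPAgent, so both the keys and the values shown here are placeholders rather than the project's actual settings:

if __name__ == "__main__":
    run_badreenvironment(nav_args={"learningrate": 9e-10},   # placeholder
                         ctrl_args={"learningrate": 9e-10},  # placeholder
                         bias=0.25, seed=1, flat=False, label="badre_run")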
Example #5
    def __init__(self, actions, Qradius=1, noiselevel=0.03):
        """Builds the BGNetwork.

        :param actions: actions available to the system
            :type actions: list of tuples (action_name,action_vector)
        :param Qradius: expected radius of Q values
        :param noiselevel: standard deviation of noise added to Q values for
            exploration
        """

        self.name = "BGNetwork"
        net = nef.Network(self, seed=HRLutils.SEED, quick=False)

        self.N = 50
        self.d = len(actions)
        self.mut_inhib = 1.0  # mutual inhibition between actions
        self.tauPSC = 0.007

        # make basal ganglia
        netbg = nef.Network("bg")

        bginput = netbg.make("bginput", 1, self.d, mode="direct")
        bginput.fixMode()
        bginput.addDecodedTermination("input",
                                      MU.diag([1.0 / Qradius for _ in
                                               range(self.d)]), 0.001, False)
        # divide by Q radius to get values back into 0 -- 1 range

        bgoutput = netbg.make("bgoutput", 1, self.d, mode="direct")
        bgoutput.fixMode()

        basalganglia.make_basal_ganglia(netbg, bginput, bgoutput,
                                        dimensions=self.d, neurons=200)
        bg = netbg.network
        net.add(bg)
        bg.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

        bg.exposeTermination(bginput.getTermination("input"), "input")
        bg.exposeOrigin(bgoutput.getOrigin("X"), "X")

        # insert noise (used to give some randomness to drive exploration)
        noiselevel = net.make_input("noiselevel", [noiselevel])

        noise = noisenode.NoiseNode(1, dimension=len(actions))
        net.add(noise)

        net.connect(noiselevel, noise.getTermination("scale"))
        net.connect(noise.getOrigin("noise"), "bg.bginput", pstc=0.001)

        # add bias to shift everything up to 0.5--1.5
        biasinput = net.make_input("biasinput", [0.5])
        net.connect(biasinput, "bg.bginput",
                    transform=[[1] for _ in range(self.d)], pstc=0.001)

        # invert BG output (so the "selected" action will have a positive value
        # and the rest zero)
        invert = thalamus.make(net, name="invert", neurons=self.N,
                               dimensions=self.d, useQuick=False)
        invert.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])
        net.connect(bg, invert.getTermination("bg_input"))

        # add mutual inhibition
        net.connect(invert.getOrigin("xBiased"), invert, pstc=self.tauPSC,
                    transform=[[0 if i == j else -self.mut_inhib
                                for j in range(self.d)]
                               for i in range(self.d)])

        # threshold output values so that you get a nice clean 0 for
        # non-selected and 1 for selected
        threshf = HRLutils.node_fac()
        threshold = 0.1
        threshf.setIntercept(IndicatorPDF(threshold, 1.0))
        val_threshold = net.make_array("val_threshold", self.N * 2, self.d,
                                       node_factory=threshf, encoders=[[1]])
        val_threshold.addDecodedOrigin(
            "output",
            [PiecewiseConstantFunction([threshold], [0, 1])
             for _ in range(self.d)], "AXON", True)

        net.connect(invert.getOrigin("xBiased"), val_threshold,
                    pstc=self.tauPSC)

        # output action (action vectors weighted by BG output)
        weight_actions = net.make_array("weight_actions", 50,
                                        len(actions[0][1]), intercept=(0, 1))
        net.connect(val_threshold.getOrigin("output"), weight_actions,
                    transform=MU.transpose([actions[i][1]
                                            for i in range(self.d)]),
                    pstc=0.007)

        # save the BG output (selected action and selected action value)
        save_relay = net.make("save_relay", 1, 1, mode="direct")
        save_relay.fixMode()
        save_relay.addDecodedTermination("input", [[1]], 0.001, False)

        saved_action = memory.Memory("saved_action", self.N * 2,
                                     len(actions[0][1]), inputscale=75)
        net.add(saved_action)
        net.connect(weight_actions, saved_action.getTermination("target"))
        net.connect(save_relay, saved_action.getTermination("transfer"))

        saved_vals = memory.Memory("saved_values", self.N * 2, self.d,
                                   inputscale=75)
        net.add(saved_vals)
        net.connect(val_threshold.getOrigin("output"),
                    saved_vals.getTermination("target"))
        net.connect(save_relay, saved_vals.getTermination("transfer"))

        # put the saved values through a threshold (we want a nice clean
        # zero for non-selected values)
        nfac = HRLutils.node_fac()
        nfac.setIntercept(IndicatorPDF(0.2, 1))
        saved_vals_threshold = net.make_array("saved_vals_threshold", self.N,
                                              self.d, node_factory=nfac,
                                              encoders=[[1]])
        saved_vals_threshold.addDecodedOrigin(
            "output", [PiecewiseConstantFunction([0.3], [0, 1])
                       for _ in range(self.d)], "AXON", True)

        net.connect(saved_vals, saved_vals_threshold, pstc=self.tauPSC)

        self.exposeTermination(bg.getTermination("input"), "input")
        self.exposeTermination(save_relay.getTermination("input"),
                               "save_output")
        self.exposeOrigin(val_threshold.getOrigin("output"), "curr_vals")
        self.exposeOrigin(weight_actions.getOrigin("X"), "curr_action")
        self.exposeOrigin(saved_vals_threshold.getOrigin("output"),
                          "saved_vals")
        self.exposeOrigin(saved_action.getOrigin("X"), "saved_action")
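A minimal wiring sketch for the BGNetwork above (the class is assumed to be importable from a module such as bgnetwork; the action vectors, Q values, and node names are hypothetical):

import nef

net = nef.Network("bg_demo")

actions = [("left", [1, 0]), ("right", [0, 1])]
bg = bgnetwork.BGNetwork(actions, Qradius=1.0)
net.add(bg)

# Q values for the two actions, plus a save signal: drive "save_output" to ~1
# to latch the current selection into the saved_action/saved_values memories
qvals = net.make_input("qvals", [0.6, 0.4])
save = net.make_input("save", [0])

net.connect(qvals, bg.getTermination("input"))
net.connect(save, bg.getTermination("save_output"))

# outputs: bg.getOrigin("curr_vals"), bg.getOrigin("curr_action"),
#          bg.getOrigin("saved_vals"), bg.getOrigin("saved_action")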
Example #6
    def __init__(self, stateN, stateD, state_encoders, actions, learningrate,
                stateradius=1.0, Qradius=1.0,
                load_weights=None, state_evals=None, state_threshold=0.0):
        """Builds the QNetwork.

        :param stateN: number of neurons to use to represent state
        :param stateD: dimension of state vector
        :param state_encoders: encoders to use for neurons in state population
        :param actions: actions available to the system
            :type actions: list of tuples (action_name,action_vector)
        :param learningrate: learningrate for action value learning rule
        :param stateradius: expected radius of state values
        :param Qradius: expected radius of Q values
        :param load_weights: filename to load Q value weights from
        :param state_evals: evaluation points to use for state population.
            This is used when initializing the Q values (may be necessary if the
            input states don't tend to fall in the hypersphere).
        :param state_threshold: threshold of state neurons (minimum intercept)
        """

        self.name = "QNetwork"
        net = nef.Network(self, seed=HRLutils.SEED, quick=False)

        N = 50
        tauPSC = 0.007
        num_actions = len(actions)
        init_Qs = 0.2 #initial value for all Q values
        self.neuron_learning = False
        # if True, use neuron--neuron weight learning,
        # otherwise, use decoder learning

        # set up relays
        state_relay = net.make("state_relay", 1, stateD, mode="direct")
        state_relay.fixMode()  # fix the simulation mode so it isn't overridden by the global config
        state_relay.addDecodedTermination("input", MU.I(stateD), 0.001, False)

        # create state population
        state_fac = HRLutils.node_fac()
        state_fac.setIntercept(IndicatorPDF(state_threshold, 1.0))

        print("making the state_pop")
        state_pop = net.make("state_pop", stateN, stateD,
                              radius=stateradius,
                              node_factory=state_fac,
                              encoders=state_encoders,
                              eval_points=state_evals)
        state_pop.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

        net.connect(state_relay, state_pop, pstc=tauPSC)

        # store the state value (used to drive population encoding previous state)
        print("create the saved state memory")
        saved_state = memory.Memory("saved_state", N * 4, stateD, inputscale=50, radius=stateradius,
                                    direct_storage=True)
        net.add(saved_state)

        net.connect(state_relay, saved_state.getTermination("target"))

        # create population representing previous state
        old_state_pop = net.make("old_state_pop", stateN, stateD,
                              radius=stateradius,
                              node_factory=state_fac,
                              encoders=state_encoders,
                              eval_points=state_evals)
        old_state_pop.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

        net.connect(saved_state, old_state_pop, pstc=tauPSC)

        print("setup the action nodes")
        # set up action nodes
        if self.neuron_learning:
            # use ActionValues network to compute Q values

            # current Q values
            decoders = state_pop.addDecodedOrigin("init_decoders",
                                                  [ConstantFunction(stateD, init_Qs)], "AXON").getDecoders()
            actionvals = actionvalues.ActionValues("actionvals", N, stateN, actions, learningrate,
                                                   Qradius=Qradius, init_decoders=decoders)
            net.add(actionvals)

            net.connect(state_pop.getOrigin("AXON"), actionvals.getTermination("state"))

            # Q values of previous state
            decoders = old_state_pop.addDecodedOrigin("init_decoders",
                                                      [ConstantFunction(stateD, init_Qs)], "AXON").getDecoders()
            old_actionvals = actionvalues.ActionValues("old_actionvals", N, stateN, actions, learningrate,
                                                       Qradius=Qradius, init_decoders=decoders)
            net.add(old_actionvals)

            net.connect(old_state_pop.getOrigin("AXON"), old_actionvals.getTermination("state"))
        else:
            # just use decoder on state population to compute Q values

            # current Q values
            origin = state_pop.addDecodedOrigin("vals",
                                        [ConstantFunction(num_actions, init_Qs) for _ in range(num_actions)],
                                        "AXON")
            state_dlnode = decoderlearningnode.DecoderLearningNode(state_pop, origin, learningrate,
                                                                   num_actions, name="state_learningnode")
            net.add(state_dlnode)

            # just a little relay node, so that things match up for the rest of the script 
            # when you have the neuron -- neuron learning
            actionvals = net.make("actionvals", 1, num_actions, mode="direct")
            actionvals.fixMode()
            net.connect(origin, actionvals, pstc=0.001)

            # Q values of previous state
            origin = old_state_pop.addDecodedOrigin("vals",
                                        [ConstantFunction(num_actions, init_Qs) for _ in range(num_actions)],
                                        "AXON")
            old_state_dlnode = decoderlearningnode.DecoderLearningNode(old_state_pop, origin, learningrate, num_actions, name="old_state_learningnode")
            net.add(old_state_dlnode)

            old_actionvals = net.make("old_actionvals", 1, num_actions, mode="direct")
            old_actionvals.fixMode()
            net.connect(origin, old_actionvals, pstc=0.001)

        if load_weights is not None:
            self.loadParams(load_weights)

        # find error between old_actionvals and actionvals (this will be used to drive learning
        # on the new actionvals)
        valdiff = net.make_array("valdiff", N, num_actions, node_factory=HRLutils.node_fac())
        net.connect(old_actionvals, valdiff, transform=MU.diag([2] * num_actions), pstc=tauPSC)
        net.connect(actionvals, valdiff, transform=MU.diag([-2] * num_actions), pstc=tauPSC)
            # doubling the values to get a bigger error signal

        # calculate diff between curr_state and saved_state and use that to
        # gate valdiff (we only want to train the current Q values based on
        # the previous state when the two states have similar values)
        statediff = net.make_array("statediff", N, stateD, intercept=(0.2, 1))
            # note: threshold > 0 so that there is a deadzone in the middle (when the states
            # are similar) where there will be no output inhibition
        net.connect(state_relay, statediff, pstc=tauPSC)
        net.connect(saved_state, statediff, transform=MU.diag([-1] * stateD), pstc=tauPSC)

        net.connect(statediff, valdiff, func=lambda x: [abs(v) for v in x],
                    transform=[[-10] * stateD for _ in range(valdiff.getNeurons())], pstc=tauPSC)

        # connect up valdiff to the error signal for current Q values, and expose
        # the error signal for the previous Q values to the external error
        if self.neuron_learning:
            net.connect(valdiff, actionvals.getTermination("error"))
            self.exposeTermination(old_actionvals.getTermination("error"), "error")
        else:
            net.connect(valdiff, state_dlnode.getTermination("error"))
            self.exposeTermination(old_state_dlnode.getTermination("error"), "error")

        self.exposeTermination(state_relay.getTermination("input"), "state")
        self.exposeTermination(saved_state.getTermination("transfer"), "save_state")
        self.exposeOrigin(actionvals.getOrigin("X"), "vals")
        self.exposeOrigin(old_actionvals.getOrigin("X"), "old_vals")
Example #7
    def __init__(self,
                 stateN,
                 stateD,
                 state_encoders,
                 actions,
                 learningrate,
                 stateradius=1.0,
                 Qradius=1.0,
                 load_weights=None,
                 state_evals=None,
                 state_threshold=(0.0, 1.0),
                 statediff_threshold=0.2,
                 init_Qs=None):
        """Builds the QNetwork.

        :param stateN: number of neurons to use to represent state
        :param stateD: dimension of state vector
        :param state_encoders: encoders to use for neurons in state population
        :param actions: actions available to the system
            :type actions: list of tuples (action_name,action_vector)
        :param learningrate: learningrate for action value learning rule
        :param stateradius: expected radius of state values
        :param Qradius: expected radius of Q values
        :param load_weights: filename to load Q value weights from
        :param state_evals: evaluation points to use for state population.
            This is used when initializing the Q values (may be necessary if
            the input states don't tend to fall in the hypersphere).
        :param state_threshold: threshold range of state neurons
        :param statediff_threshold: maximum state difference for dual training
        :param init_Qs: initial Q values
        """

        self.name = "QNetwork"
        net = nef.Network(self, seed=HRLutils.SEED, quick=False)

        N = 50
        tauPSC = 0.007
        num_actions = len(actions)
        init_Qs = [0.2] * num_actions if init_Qs is None else init_Qs

        # if True, use neuron--neuron weight learning, otherwise, use decoder
        # learning
        self.neuron_learning = False

        # set up relays
        state_relay = net.make("state_relay", 1, stateD, mode="direct")
        state_relay.fixMode()
        state_relay.addDecodedTermination("input", MU.I(stateD), 0.001, False)

        # create state population
        state_fac = HRLutils.node_fac()
        if isinstance(state_threshold, (float, int)):
            state_threshold = (state_threshold, 1.0)
        state_fac.setIntercept(
            IndicatorPDF(state_threshold[0], state_threshold[1]))

        state_pop = net.make("state_pop",
                             stateN,
                             stateD,
                             radius=stateradius,
                             node_factory=state_fac,
                             encoders=state_encoders,
                             eval_points=state_evals)
        state_pop.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

        net.connect(state_relay, state_pop, pstc=tauPSC)

        # store the state value (used to drive population encoding previous
        # state)
        saved_state = memory.Memory("saved_state",
                                    N * 4,
                                    stateD,
                                    inputscale=50,
                                    radius=stateradius,
                                    direct_storage=True)
        net.add(saved_state)

        net.connect(state_relay, saved_state.getTermination("target"))

        # create population representing previous state
        old_state_pop = net.make("old_state_pop",
                                 stateN,
                                 stateD,
                                 radius=stateradius,
                                 node_factory=state_fac,
                                 encoders=state_encoders,
                                 eval_points=state_evals)
        old_state_pop.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

        net.connect(saved_state, old_state_pop, pstc=tauPSC)

        # set up action nodes
        if self.neuron_learning:
            # use ActionValues network to compute Q values

            # current Q values
            decoders = state_pop.addDecodedOrigin(
                "init_decoders", [ConstantFunction(stateD, init_Qs)],
                "AXON").getDecoders()
            actionvals = actionvalues.ActionValues("actionvals",
                                                   N,
                                                   stateN,
                                                   actions,
                                                   learningrate,
                                                   Qradius=Qradius,
                                                   init_decoders=decoders)
            net.add(actionvals)

            net.connect(state_pop.getOrigin("AXON"),
                        actionvals.getTermination("state"))

            # Q values of previous state
            decoders = old_state_pop.addDecodedOrigin(
                "init_decoders", [ConstantFunction(stateD, init_Qs)],
                "AXON").getDecoders()
            old_actionvals = actionvalues.ActionValues("old_actionvals",
                                                       N,
                                                       stateN,
                                                       actions,
                                                       learningrate,
                                                       Qradius=Qradius,
                                                       init_decoders=decoders)
            net.add(old_actionvals)

            net.connect(old_state_pop.getOrigin("AXON"),
                        old_actionvals.getTermination("state"))
        else:
            # just use decoder on state population to compute Q values

            # current Q values
            origin = state_pop.addDecodedOrigin("vals", [
                ConstantFunction(num_actions, init_Qs[i])
                for i in range(num_actions)
            ], "AXON")
            state_dlnode = decoderlearningnode.DecoderLearningNode(
                state_pop,
                origin,
                learningrate,
                num_actions,
                name="state_learningnode")
            net.add(state_dlnode)

            # just a little relay node, so that things match up for the rest of
            # the script when you have the neuron -- neuron learning
            actionvals = net.make("actionvals", 1, num_actions, mode="direct")
            actionvals.fixMode()
            net.connect(origin, actionvals, pstc=0.001)

            # Q values of previous state
            origin = old_state_pop.addDecodedOrigin("vals", [
                ConstantFunction(num_actions, init_Qs[i])
                for i in range(num_actions)
            ], "AXON")
            old_state_dlnode = decoderlearningnode.DecoderLearningNode(
                old_state_pop,
                origin,
                learningrate,
                num_actions,
                name="old_state_learningnode")
            net.add(old_state_dlnode)

            old_actionvals = net.make("old_actionvals",
                                      1,
                                      num_actions,
                                      mode="direct")
            old_actionvals.fixMode()
            net.connect(origin, old_actionvals, pstc=0.001)

        if load_weights is not None:
            self.loadParams(load_weights)

        # find error between old_actionvals and actionvals (this will be used
        # to drive learning on the new actionvals)
        valdiff = net.make_array("valdiff",
                                 N,
                                 num_actions,
                                 node_factory=HRLutils.node_fac())
        # doubling the values to get a bigger error signal
        net.connect(old_actionvals,
                    valdiff,
                    transform=MU.diag([2] * num_actions),
                    pstc=tauPSC)
        net.connect(actionvals,
                    valdiff,
                    transform=MU.diag([-2] * num_actions),
                    pstc=tauPSC)

        # calculate diff between curr_state and saved_state and use that to
        # gate valdiff (we only want to train the curr state based on previous
        # state when the two have similar values)
        # note: threshold > 0 so that there is a deadzone in the middle (when
        # the states are similar) where there will be no output inhibition
        statediff = net.make_array("statediff",
                                   N,
                                   stateD,
                                   intercept=(statediff_threshold, 1))

        net.connect(state_relay, statediff, pstc=tauPSC)
        net.connect(saved_state,
                    statediff,
                    transform=MU.diag([-1] * stateD),
                    pstc=tauPSC)

        net.connect(statediff,
                    valdiff,
                    func=lambda x: [abs(v) for v in x],
                    transform=[[-10] * stateD
                               for _ in range(valdiff.getNeurons())],
                    pstc=tauPSC)

        # connect up valdiff to the error signal for current Q values, and
        # expose the error signal for the previous Q values to the external
        # error
        if self.neuron_learning:
            net.connect(valdiff, actionvals.getTermination("error"))
            self.exposeTermination(old_actionvals.getTermination("error"),
                                   "error")
        else:
            net.connect(valdiff, state_dlnode.getTermination("error"))
            self.exposeTermination(old_state_dlnode.getTermination("error"),
                                   "error")

        self.exposeTermination(state_relay.getTermination("input"), "state")
        self.exposeTermination(saved_state.getTermination("transfer"),
                               "save_state")
        self.exposeOrigin(actionvals.getOrigin("X"), "vals")
        self.exposeOrigin(old_actionvals.getOrigin("X"), "old_vals")
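A minimal wiring sketch for this QNetwork (assumed to be importable from a module such as qnetwork; the encoders, actions, and learning rate are placeholders). In the full model these connections are made by SMDPAgent, with the error and save_state signals coming from the error network and the termination node:

import nef

net = nef.Network("qnetwork_demo")

stateD = 2
stateN = 100
actions = [("left", [1, 0]), ("right", [0, 1])]
encoders = [[1, 0], [0, 1], [-1, 0], [0, -1]] * 25  # placeholder encoders

q = qnetwork.QNetwork(stateN, stateD, encoders, actions, learningrate=5e-8)
net.add(q)

state = net.make_input("state", [0.3, -0.2])  # placeholder state signal
net.connect(state, q.getTermination("state"))

# constant stand-ins for the error and save signals
error = net.make_input("error", [0] * len(actions))
net.connect(error, q.getTermination("error"))
save = net.make_input("save", [0])
net.connect(save, q.getTermination("save_state"))

# current and previous Q values are exposed on q.getOrigin("vals") and
# q.getOrigin("old_vals")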
Example #8
    def __init__(self, stateN, stateD, state_encoders, actions, learningrate,
                 stateradius=1.0, Qradius=1.0, load_weights=None):
        NetworkImpl.__init__(self)
        self.name = "QNetwork"
        net = nef.Network(self, seed=HRLutils.SEED, quick=False)
        
        N = 50
        statelength = math.sqrt(2*stateradius**2)
        tauPSC = 0.007
        num_actions = len(actions)
        init_Qs = 0.0
        weight_save = 600.0 #period to save weights (realtime, not simulation time)
        
        #set up relays
        state_relay = net.make("state_relay", 1, stateD, mode="direct")
        state_relay.fixMode()
        state_relay.addDecodedTermination("input", MU.I(stateD), 0.001, False)
        
        #create state population
        state_fac = HRLutils.node_fac()
        state_fac.setIntercept(IndicatorPDF(0,1))
            
        state_pop = net.make("state_pop", stateN, stateD, 
                              radius=statelength,
                              node_factory=state_fac,
                              encoders=state_encoders)
#                              eval_points=MU.I(stateD))
#        state_pop = net.make_array("state_pop", stateN/stateD, stateD,
#                                   node_factory=state_fac)
        state_pop.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])
        
        net.connect(state_relay, state_pop, pstc=tauPSC)
        
        #create population tied to previous state (to be used in learning)
        saved_state = memory.Memory("saved_state", N*4, stateD, inputscale=50, radius=stateradius,
                                    direct_storage=True)
        net.add(saved_state)
        
        net.connect(state_relay, saved_state.getTermination("target"))
        
        old_state_pop = net.make("old_state_pop", stateN, stateD, 
                              radius=statelength,
                              node_factory=state_fac,
                              encoders=state_encoders)
#                              eval_points=MU.I(stateD))
#        old_state_pop = net.make_array("old_state_pop", stateN/stateD, stateD,
#                                   node_factory=state_fac)
        old_state_pop.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])
        
        net.connect(saved_state, old_state_pop, pstc=tauPSC)
        
        #set up action nodes
        decoders = state_pop.addDecodedOrigin("init_decoders", [ConstantFunction(stateD,init_Qs)], "AXON").getDecoders()
        actionvals = actionvalues.ActionValues("actionvals", N, stateN, actions, learningrate, Qradius=Qradius, init_decoders=decoders)
        net.add(actionvals)
        
        decoders = old_state_pop.addDecodedOrigin("init_decoders", [ConstantFunction(stateD,init_Qs)], "AXON").getDecoders()
        old_actionvals = actionvalues.ActionValues("old_actionvals", N, stateN, actions, learningrate, Qradius=Qradius, init_decoders=decoders)
        net.add(old_actionvals)
        
        net.connect(state_pop.getOrigin("AXON"), actionvals.getTermination("state"))
        net.connect(old_state_pop.getOrigin("AXON"), old_actionvals.getTermination("state"))
        
        if load_weights is not None:
            self.loadWeights(load_weights)
        
        # find error between old_actionvals and actionvals
        valdiff = net.make_array("valdiff", N, num_actions, node_factory = HRLutils.node_fac())
        net.connect(old_actionvals, valdiff, transform=MU.diag([2]*num_actions), pstc=tauPSC)
        net.connect(actionvals, valdiff, transform=MU.diag([-2]*num_actions), pstc=tauPSC)
            #doubling values to get a bigger error signal
        
            #calculate diff between curr_state and saved_state and use that to gate valdiff
        statediff = net.make_array("statediff", N, stateD, intercept=(0.2,1))
        net.connect(state_relay, statediff, pstc=tauPSC)
        net.connect(saved_state, statediff, transform=MU.diag([-1]*stateD), pstc=tauPSC)
        
        net.connect(statediff, valdiff, func=lambda x: [abs(v) for v in x], 
                    transform = [[-10]*stateD for _ in range(valdiff.getNeurons())], pstc=tauPSC)
        
        net.connect(valdiff, actionvals.getTermination("error"))
        
        #periodically save the weights
        class WeightSaveThread(threading.Thread):
            def __init__(self, func, prefix, period):
                threading.Thread.__init__(self)
                self.func = func
                self.prefix = prefix
                self.period = period
                
            def run(self):
                while True:
                    time.sleep(self.period)
                    self.func(self.prefix)
        wsn = WeightSaveThread(self.saveWeights, os.path.join("weights","tmp"), weight_save)
        wsn.start()
        
        self.exposeTermination(state_relay.getTermination("input"), "state")
        self.exposeTermination(old_actionvals.getTermination("error"), "error")
        self.exposeTermination(saved_state.getTermination("transfer"), "save_state")
        self.exposeOrigin(actionvals.getOrigin("X"), "vals")
        self.exposeOrigin(old_actionvals.getOrigin("X"), "old_vals")
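The inline WeightSaveThread above loops forever, whereas the newer code in Examples #3 and #4 uses HRLutils.WeightSaveThread and calls stop() on it. A minimal sketch of a stoppable version, under the assumption (not confirmed here) that this is roughly what HRLutils provides:

import threading
import time

class StoppableWeightSaveThread(threading.Thread):
    """Calls func(prefix) every `period` seconds until stop() is requested."""

    def __init__(self, func, prefix, period):
        threading.Thread.__init__(self)
        self.func = func
        self.prefix = prefix
        self.period = period
        self.running = True
        self.setDaemon(True)  # don't block interpreter shutdown

    def run(self):
        while self.running:
            time.sleep(self.period)
            if self.running:
                self.func(self.prefix)

    def stop(self):
        # takes effect after at most one more period
        self.running = False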