Example #1
    def __init__(self, N, d, name="PositiveBias"):
        """Builds the PositiveBias network.

        :param N: base number of neurons
        :param d: dimension of input signal
        :param name: name for network
        """

        self.name = name
        net = nef.Network(self, seed=HRLutils.SEED, quick=False)

        tauPSC = 0.007
        biaslevel = 0.03  # the value to be output for negative inputs

        # threshold the input signal to detect positive values
        nfac = HRLutils.node_fac()
        nfac.setIntercept(IndicatorPDF(0, 0.1))
        neg_thresh = net.make_array("neg_thresh",
                                    N,
                                    d,
                                    encoders=[[1]],
                                    node_factory=nfac)
        neg_thresh.addDecodedTermination("input", MU.I(d), tauPSC, False)

        # create a population that tries to output biaslevel across
        # all dimensions
        bias_input = net.make_input("bias_input", [biaslevel])
        bias_pop = net.make_array(
            "bias_pop",
            N,
            d,
            node_factory=HRLutils.node_fac(),
            eval_points=[[x * 0.01] for x in range(0, int(biaslevel * 200))])

        net.connect(bias_input, bias_pop, pstc=tauPSC)

        # the individual dimensions of bias_pop are then inhibited by the
        # output of neg_thresh (so any positive values don't get the bias)
        net.connect(neg_thresh,
                    bias_pop,
                    pstc=tauPSC,
                    func=lambda x: [1.0] if x[0] > 0 else [0.0],
                    transform=[[-10 if i == k else 0 for k in range(d)]
                               for i in range(d)
                               for _ in range(bias_pop.getNeurons() // d)])
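        # (the transform above has one row per bias_pop neuron: the rows
        # belonging to dimension i carry -10 in column i, so a detected
        # positive value in dimension i of the input inhibits exactly the
        # neurons representing dimension i of bias_pop)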

        # the whole population is inhibited by the learn signal, so that it
        # outputs 0 if the system isn't supposed to be learning
        bias_pop.addTermination("learn",
                                [[-10] for _ in range(bias_pop.getNeurons())],
                                tauPSC, False)

        self.exposeTermination(neg_thresh.getTermination("input"), "input")
        self.exposeTermination(bias_pop.getTermination("learn"), "learn")
        self.exposeOrigin(bias_pop.getOrigin("X"), "X")
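
PositiveBias thus outputs roughly biaslevel on each dimension whose input is
negative, and zero elsewhere (or on every dimension, when the learn gate is
inhibiting it). A minimal usage sketch, assuming the hrlproject helper modules
used above (positivebias, HRLutils) are importable; the input values are
illustrative:

net = nef.Network("PositiveBiasDemo", seed=HRLutils.SEED)

posbias = positivebias.PositiveBias(50, 4)  # 50 neurons per dim, 4 dims
net.add(posbias)

# dimensions 2 and 4 are negative, so only they should receive the bias
q_input = net.make_input("q_input", [0.5, -0.2, 0.1, -0.4])
net.connect(q_input, posbias.getTermination("input"))

# the learn termination is inhibitory: feed 0 to enable the bias output,
# 1 to silence it
learn_input = net.make_input("learn_input", [0])
net.connect(learn_input, posbias.getTermination("learn"))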
Example #2
    def __init__(self, N, d, name="PositiveBias"):
        """Builds the PositiveBias network.

        :param N: base number of neurons
        :param d: dimension of input signal
        :param name: name for network
        """

        self.name = name
        net = nef.Network(self, seed=HRLutils.SEED, quick=False)

        tauPSC = 0.007
        biaslevel = 0.03  # the value to be output for negative inputs

        # threshold the input signal to detect positive values
        nfac = HRLutils.node_fac()
        nfac.setIntercept(IndicatorPDF(0, 0.1))
        neg_thresh = net.make_array("neg_thresh", N, d, encoders=[[1]],
                                    node_factory=nfac)
        neg_thresh.addDecodedTermination("input", MU.I(d), tauPSC, False)

        # create a population that tries to output biaslevel across
        # all dimensions
        bias_input = net.make_input("bias_input", [biaslevel])
        bias_pop = net.make_array("bias_pop", N, d,
                                  node_factory=HRLutils.node_fac(),
                                  eval_points=[[x * 0.01] for x in
                                               range(0, int(biaslevel * 200))])

        net.connect(bias_input, bias_pop, pstc=tauPSC)

        # the individual dimensions of bias_pop are then inhibited by the
        # output of neg_thresh (so any positive values don't get the bias)
        net.connect(neg_thresh, bias_pop, pstc=tauPSC,
                    func=lambda x: [1.0] if x[0] > 0 else [0.0],
                    transform=[[-10 if i == k else 0 for k in range(d)]
                               for i in range(d) for _ in
                               range(bias_pop.getNeurons() // d)])

        # the whole population is inhibited by the learn signal, so that it
        # outputs 0 if the system isn't supposed to be learning
        bias_pop.addTermination("learn", [[-10] for _ in
                                          range(bias_pop.getNeurons())],
                                tauPSC, False)

        self.exposeTermination(neg_thresh.getTermination("input"), "input")
        self.exposeTermination(bias_pop.getTermination("learn"), "learn")
        self.exposeOrigin(bias_pop.getOrigin("X"), "X")
Example #3
def test_actionvalues():
    net = nef.Network("testActionValues")

    stateN = 200
    N = 100
    stateD = 2
    stateradius = 1.0
    statelength = math.sqrt(2 * stateradius**2)
    init_Qs = 0.5
    learningrate = 0.0
    Qradius = 1
    tauPSC = 0.007
    actions = [("up", [0, 1]), ("right", [1, 0]), ("down", [0, -1]),
               ("left", [-1, 0])]

    # state
    state_pop = net.make(
        "state_pop",
        stateN,
        stateD,
        radius=statelength,
        node_factory=HRLutils.node_fac(),
        eval_points=[[x / statelength, y / statelength]
                     for x in range(-int(stateradius), int(stateradius))
                     for y in range(-int(stateradius), int(stateradius))])
    state_pop.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])
    state_pop.addDecodedTermination("state_input", MU.I(stateD), tauPSC, False)

    # set up action nodes
    decoders = state_pop.addDecodedOrigin("init_decoders",
                                          [ConstantFunction(stateD, init_Qs)],
                                          "AXON").getDecoders()

    actionvals = actionvalues.ActionValues("testActionValues",
                                           N,
                                           stateN,
                                           actions,
                                           learningrate,
                                           Qradius=Qradius,
                                           init_decoders=decoders)
    net.add(actionvals)

    net.connect(state_pop.getOrigin("AXON"),
                actionvals.getTermination("state"))

    # input
    inp = net.make_input("input", [0, 0])
    net.connect(inp, state_pop.getTermination("state_input"))

    net.add_to_nengo()
    net.view()
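
Because the test drives state_pop with a constant [0, 0], the decoded action
values should simply hold near init_Qs. To exercise more of the state space,
the constant input can be swapped for a function of time (nef's make_input
also accepts a function returning the input vector); this variant is a
suggestion, not part of the original test:

inp = net.make_input("input", lambda t: [math.cos(t), math.sin(t)])
net.connect(inp, state_pop.getTermination("state_input"))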
Example #4
def test_actionvalues():
    net = nef.Network("testActionValues")

    stateN = 200
    N = 100
    stateD = 2
    stateradius = 1.0
    statelength = math.sqrt(2 * stateradius ** 2)
    init_Qs = 0.5
    learningrate = 0.0
    Qradius = 1
    tauPSC = 0.007
    actions = [("up", [0, 1]), ("right", [1, 0]), ("down", [0, -1]), ("left", [-1, 0])]

    # state
    state_pop = net.make(
        "state_pop",
        stateN,
        stateD,
        radius=statelength,
        node_factory=HRLutils.node_fac(),
        eval_points=[
            [x / statelength, y / statelength]
            for x in range(-int(stateradius), int(stateradius))
            for y in range(-int(stateradius), int(stateradius))
        ],
    )
    state_pop.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])
    state_pop.addDecodedTermination("state_input", MU.I(stateD), tauPSC, False)

    # set up action nodes
    decoders = state_pop.addDecodedOrigin("init_decoders", [ConstantFunction(stateD, init_Qs)], "AXON").getDecoders()

    actionvals = actionvalues.ActionValues(
        "testActionValues", N, stateN, actions, learningrate, Qradius=Qradius, init_decoders=decoders
    )
    net.add(actionvals)

    net.connect(state_pop.getOrigin("AXON"), actionvals.getTermination("state"))

    # input
    inp = net.make_input("input", [0, 0])
    net.connect(inp, state_pop.getTermination("state_input"))

    net.add_to_nengo()
    net.view()
Example #5
    def __init__(self, num_actions, Qradius=1.0, rewardradius=1.0, discount=0.3):
        """Builds the ErrorNetwork.

        :param num_actions: the number of actions available to the system
        :param Qradius: expected radius of Q values
        :param rewardradius: expected radius of reward signal
        :param discount: discount factor
        """

        self.name = "ErrorNetwork"
        net = nef.Network(self, seed=HRLutils.SEED, quick=False)

        N = 50
        tauPSC = 0.007
        errorcap = 0.1  # soft cap on error magnitude (large errors seem to
        # cause problems with overly-generalizing the learning)

        # set up relays
        vals_relay = net.make("vals_relay", 1, num_actions, mode="direct")
        vals_relay.fixMode()
        vals_relay.addDecodedTermination("input", MU.I(num_actions), 0.001, False)

        old_vals_relay = net.make("old_vals_relay", 1, num_actions, mode="direct")
        old_vals_relay.fixMode()
        old_vals_relay.addDecodedTermination("input", MU.I(num_actions), 0.001, False)

        curr_bg_relay = net.make("curr_bg_relay", 1, num_actions, mode="direct")
        curr_bg_relay.fixMode()
        curr_bg_relay.addDecodedTermination("input", MU.I(num_actions), 0.001, False)

        saved_bg_relay = net.make("saved_bg_relay", 1, num_actions, mode="direct")
        saved_bg_relay.fixMode()
        saved_bg_relay.addDecodedTermination("input", MU.I(num_actions), 0.001, False)

        # select out only the currently chosen Q value
        gatedQ = net.make_array("gatedQ", N, num_actions, node_factory=HRLutils.node_fac(), radius=Qradius)
        gatedQ.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

        net.connect(vals_relay, gatedQ, pstc=tauPSC)

        net.connect(curr_bg_relay, gatedQ,
                    transform=[[-3 if i != k else 0 for k in range(num_actions)]
                               for i in range(num_actions) for _ in range(gatedQ.getNeurons() // num_actions)],
                    pstc=tauPSC)
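        # (the transform has one row per gatedQ neuron: the rows for
        # dimension i carry -3 in every column except i, so when the BG
        # output marks action i as selected, every dimension except i is
        # inhibited and only the chosen Q value passes through)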

        currQ = net.make("currQ", 1, 1, mode="direct")
        currQ.fixMode()
        net.connect(gatedQ, currQ, transform=[[1 for _ in range(num_actions)]], pstc=0.001)

        # select out only the previously chosen Q value
        gatedstoreQ = net.make_array("gatedstoreQ", N, num_actions, node_factory=HRLutils.node_fac(), radius=Qradius)
        gatedstoreQ.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

        net.connect(old_vals_relay, gatedstoreQ, pstc=tauPSC)

        net.connect(saved_bg_relay, gatedstoreQ,
                    transform=[[-3 if i != k else 0 for k in range(num_actions)]
                               for i in range(num_actions) for _ in range(gatedstoreQ.getNeurons() // num_actions)],
                    pstc=tauPSC)

        storeQ = net.make("storeQ", 1, 1, mode="direct")
        storeQ.fixMode()
        net.connect(gatedstoreQ, storeQ, transform=[[1 for _ in range(num_actions)]], pstc=0.001)

        # create error calculation network
        error = errorcalc2.ErrorCalc2(discount, rewardradius=rewardradius, Qradius=Qradius)
        net.add(error)

        net.connect(currQ, error.getTermination("currQ"))
        net.connect(storeQ, error.getTermination("storeQ"))

        # gate error by learning signal and saved BG output (we only want
        # error when the system is supposed to be learning, and we only want
        # error related to the action that was selected)
        gatederror = net.make_array("gatederror", N * 2, num_actions, radius=errorcap, node_factory=HRLutils.node_fac())
        gatederror.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

        net.connect(error, gatederror, transform=[[1.0 / Qradius] for _ in range(num_actions)], pstc=tauPSC)
        # scale the error by Qradius, so that we don't get very large errors
        # (which cause problems with the gating)

        learninggate = net.make("learninggate", N, 1, node_factory=HRLutils.node_fac())
        learninggate.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])
        learninggate.addTermination("gate", [[-10] for _ in range(N)], tauPSC, False)

        net.connect(learninggate, gatederror, func=lambda x: [1.0],
                    transform=[[-12] for _ in range(gatederror.getNeurons())], pstc=tauPSC)

        net.connect(saved_bg_relay, gatederror,
                    transform=[[-12 if i != k else 0 for k in range(num_actions)]
                               for i in range(num_actions) for _ in range(gatederror.getNeurons() // num_actions)],
                    pstc=tauPSC)

        # add a positive bias to the error wherever the Q values are negative
        # (to stop Q values from getting too negative, which disrupts the
        # action selection)
        posbias = positivebias.PositiveBias(N, num_actions)
        net.add(posbias)
        net.connect(old_vals_relay, posbias.getTermination("input"))
        net.connect(learninggate, posbias.getTermination("learn"), func=lambda x: [1.0])

        biasederror = net.make("biasederror", 1, num_actions, mode="direct")
        biasederror.fixMode()
        net.connect(gatederror, biasederror, pstc=0.001)
        net.connect(posbias, biasederror, pstc=0.001)

        self.exposeTermination(curr_bg_relay.getTermination("input"), "curr_bg_input")
        self.exposeTermination(saved_bg_relay.getTermination("input"), "saved_bg_input")
        self.exposeTermination(vals_relay.getTermination("input"), "vals")
        self.exposeTermination(old_vals_relay.getTermination("input"), "old_vals")
        self.exposeTermination(error.getTermination("reward"), "reward")
        self.exposeTermination(error.getTermination("reset"), "reset")
        self.exposeTermination(learninggate.getTermination("gate"), "learn")
        self.exposeOrigin(biasederror.getOrigin("X"), "error")
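
In the full model this network sits between the QNetwork and the BGNetwork
shown in the other examples. A hedged wiring sketch (the module name
errornetwork and the variables net, qnet and bg are assumptions; the
termination and origin names come from the expose calls above; the reward,
reset, and learn terminations are omitted here):

errornet = errornetwork.ErrorNetwork(len(actions), Qradius=1.0,
                                     rewardradius=1.0, discount=0.3)
net.add(errornet)

# Q values for the current and previous state (from the QNetwork)
net.connect(qnet.getOrigin("vals"), errornet.getTermination("vals"))
net.connect(qnet.getOrigin("old_vals"), errornet.getTermination("old_vals"))

# current and saved action-selection output (from the BGNetwork)
net.connect(bg.getOrigin("curr_vals"),
            errornet.getTermination("curr_bg_input"))
net.connect(bg.getOrigin("saved_vals"),
            errornet.getTermination("saved_bg_input"))

# the gated, biased error then drives learning in the QNetwork
net.connect(errornet.getOrigin("error"), qnet.getTermination("error"))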
Example #6
    def __init__(self, discount, rewardradius=1.0, Qradius=1.0):
        """Builds the ErrorCalc2 network.

        :param discount: discount factor, controls rate of integration
        :param rewardradius: expected radius of reward value
        :param Qradius: expected radius of Q values
        """

        self.name = "ErrorCalc"
        net = nef.Network(self, seed=HRLutils.SEED, quick=False)

        tauPSC = 0.007
        intPSC = 0.1
        N = 50

        # relay for current Q input
        currQ = net.make("currQ", 1, 1, node_factory=HRLutils.node_fac(), mode="direct",
                         radius=Qradius)
        currQ.fixMode()
        currQ.addDecodedTermination("input", [[1]], 0.001, False)

        # input population for resetting the network
        reset_nodefac = HRLutils.node_fac()
        reset_nodefac.setIntercept(IndicatorPDF(0.3, 1.0))
        reset = net.make("reset", N, 1, encoders=[[1]], node_factory=reset_nodefac)
        reset.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])
        # this population will begin outputting a value once the reset
        # signal exceeds the threshold, and that output will then be
        # used to reset the rest of the network

        reset.addDecodedTermination("input", [[1]], tauPSC, False)

        # relay for stored previous value of Q
        storeQ = net.make("storeQ", 1, 1, node_factory=HRLutils.node_fac(), mode="direct",
                          radius=Qradius)
        storeQ.fixMode()
        storeQ.addDecodedTermination("input", [[1]], 0.001, False)

        # calculate "discount" by integrating output of storeQ
        acc_storeQ = memory.Memory("acc_storeQ", N * 8, 1, inputscale=50)
        net.add(acc_storeQ)

        zero_input = net.make_input("zero_input", [0])

        net.connect(zero_input, acc_storeQ.getTermination("target"))
        net.connect(reset, acc_storeQ.getTermination("transfer"))

        # threshold storeQ value so it won't go below zero.  that is, if we
        # have negative Q values, we don't want to have a negative discount,
        # or that will just drive the highest (negative) Q value upwards, and
        # it will always be selected.  negative Q values are instead pushed
        # upwards by the PositiveBias mechanism.
        Qthresh = net.make("Qthresh", N * 2, 1, encoders=[[1]], eval_points=[[x * 0.001] for x in range(1000)],
                           radius=Qradius, intercept=(0, 1))
        net.connect(storeQ, Qthresh, pstc=tauPSC)
        net.connect(Qthresh, acc_storeQ, pstc=intPSC,
                    transform=[[discount * intPSC]], func=lambda x: max(x[0], 0.0))

        # accumulate reward
        reward = memory.Memory("reward", N * 4, 1, radius=rewardradius, inputscale=50)
        net.add(reward)

        reward.addDecodedTermination("input", [[intPSC]], intPSC, False)

        net.connect(zero_input, reward.getTermination("target"))
        net.connect(reset, reward.getTermination("transfer"))

        # put reward, currQ, storeQ, and discount together to calculate error
        error = net.make("error", N * 2, 1, node_factory=HRLutils.node_fac())

        net.connect(currQ, error, pstc=tauPSC)
        net.connect(reward, error, pstc=tauPSC)
        net.connect(storeQ, error, pstc=tauPSC, transform=[[-1]])
        net.connect(acc_storeQ, error, pstc=tauPSC, transform=[[-1]])

        self.exposeTermination(reward.getTermination("input"), "reward")
        self.exposeTermination(reset.getTermination("input"), "reset")
        self.exposeTermination(currQ.getTermination("input"), "currQ")
        self.exposeTermination(storeQ.getTermination("input"), "storeQ")
        self.exposeOrigin(error.getOrigin("X"), "X")
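
Reading off the four connections into the error population: the reward memory
integrates its input, and acc_storeQ integrates the thresholded storeQ scaled
by discount, so between resets the value decoded from error is approximately

    error = (integral of reward) + currQ - storeQ
            - discount * (integral of max(storeQ, 0))

i.e. a continuous-time TD error, comparing the stored prediction storeQ
against the reward actually received plus the current value estimate, with
the discount term accruing over elapsed time.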
Example #7
    def __init__(self, name, N, d, radius=1.0, inputscale=1.0, recurweight=1.0,
                 direct_storage=False):
        """Builds the Memory network.

        :param name: name of network
        :param N: base number of neurons
        :param d: dimension of stored value
        :param radius: radius of stored value
        :param inputscale: controls how fast the stored value moves to the
            target
        :param recurweight: controls the preservation of the stored value
        :param direct_storage: if True, use directmode for the memory
        """

        self.name = name
        net = nef.Network(self, seed=HRLutils.SEED, quick=False)
        self.dimension = d
        self.radius = radius

        tauPSC = 0.007
        intPSC = 0.1

        # population that will store the value
        if not direct_storage:
            storage = net.make_array("storage", N, d,
                                     node_factory=HRLutils.node_fac(),
                                     eval_points=[[x * 0.001]
                                                  for x in range(-1000, 1000)])
        else:
            storage = net.make("storage", 1, d, mode="direct")
            storage.fixMode()

        net.connect(storage, storage, transform=MU.diag([recurweight
                                                         for _ in range(d)]),
                    pstc=intPSC)

        # storageinput will represent (target - stored_value), which when used
        # as input to storage will drive the stored value to target
        storageinput = net.make_array("storageinput", N, d,
                                      node_factory=HRLutils.node_fac())
        storageinput.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

        storageinput.addDecodedTermination("target",
                                           MU.diag([1.0 / radius
                                                    for _ in range(d)]),
                                           tauPSC, False)
        # note: store everything in -1 -- 1 range by dividing by radius

        # scale storageinput value by inputscale to control rate at which
        # it moves to the target
        net.connect(storageinput, storage, pstc=intPSC,
                    transform=MU.diag([inputscale * intPSC for _ in range(d)]))

        # subtract currently stored value
        net.connect(storage, storageinput, pstc=tauPSC,
                    transform=MU.diag([-1 for _ in range(d)]))

        # we want to open the input gate when the transfer signal arrives (to
        # transfer storageinput to storage). using a double inhibition setup
        # (rather than just feeding it e.g. the inverse of the transfer
        # signal) so that we get a nice clean zero

        # this inhibits the storageinput population (to block input to the
        # storage)
        transferinhib = net.make("transferinhib", N, 1,
                                 node_factory=HRLutils.node_fac())
        transferinhib.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

        transferinhib.addTermination("gate",
                                     [[-10] for _ in
                                      range(transferinhib.getNeurons())],
                                     tauPSC, False)

        net.connect(transferinhib, storageinput, pstc=tauPSC,
                    transform=[[-10] for _ in
                               range(storageinput.getNeurons())])

        # this drives the transferinhib population (so that by default it will
        # block any input). inhibiting transferinhib will thus remove the
        # inhibition on storageinput, and change the stored value
        biasinput = net.make_input("biasinput", [1])

        net.connect(biasinput, transferinhib, pstc=tauPSC)

        # output population (to undo radius scaling)
        storageoutput = net.make("storageoutput", 1, d, mode="direct")
        storageoutput.fixMode()
        net.connect(storage, storageoutput, pstc=0.001,
                    transform=MU.diag([radius for _ in range(d)]))

        self.exposeTermination(transferinhib.getTermination("gate"),
                               "transfer")
        self.exposeTermination(storageinput.getTermination("target"), "target")
        self.exposeOrigin(storageoutput.getOrigin("X"), "X")
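
A minimal usage sketch for the Memory network on its own (mirroring how
saved_state is used in the QNetwork example below; the input values are
illustrative):

net = nef.Network("MemoryDemo", seed=HRLutils.SEED)

mem = memory.Memory("mem", 50, 2, inputscale=50)
net.add(mem)

target = net.make_input("target", [0.5, -0.3])
net.connect(target, mem.getTermination("target"))

# transfer is the gate: roughly, 1 lets the stored value move to the target,
# 0 holds the current value
transfer = net.make_input("transfer", [0])
net.connect(transfer, mem.getTermination("transfer"))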
Example #8
    def __init__(self,
                 num_actions,
                 Qradius=1.0,
                 rewardradius=1.0,
                 discount=0.3):
        """Builds the ErrorNetwork.

        :param num_actions: the number of actions available to the system
        :param Qradius: expected radius of Q values
        :param rewardradius: expected radius of reward signal
        :param discount: discount factor
        """

        self.name = "ErrorNetwork"
        net = nef.Network(self, seed=HRLutils.SEED, quick=False)

        N = 50
        tauPSC = 0.007
        errorcap = 0.1  # soft cap on error magnitude (large errors seem to
        # cause problems with overly-generalizing the learning)

        # set up relays
        vals_relay = net.make("vals_relay", 1, num_actions, mode="direct")
        vals_relay.fixMode()
        vals_relay.addDecodedTermination("input", MU.I(num_actions), 0.001,
                                         False)

        old_vals_relay = net.make("old_vals_relay",
                                  1,
                                  num_actions,
                                  mode="direct")
        old_vals_relay.fixMode()
        old_vals_relay.addDecodedTermination("input", MU.I(num_actions), 0.001,
                                             False)

        curr_bg_relay = net.make("curr_bg_relay",
                                 1,
                                 num_actions,
                                 mode="direct")
        curr_bg_relay.fixMode()
        curr_bg_relay.addDecodedTermination("input", MU.I(num_actions), 0.001,
                                            False)

        saved_bg_relay = net.make("saved_bg_relay",
                                  1,
                                  num_actions,
                                  mode="direct")
        saved_bg_relay.fixMode()
        saved_bg_relay.addDecodedTermination("input", MU.I(num_actions), 0.001,
                                             False)

        # select out only the currently chosen Q value
        gatedQ = net.make_array("gatedQ",
                                N * 2,
                                num_actions,
                                node_factory=HRLutils.node_fac(),
                                radius=Qradius)
        gatedQ.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

        net.connect(vals_relay, gatedQ, pstc=tauPSC)

        net.connect(
            curr_bg_relay,
            gatedQ,
            transform=[[-3 if i != k else 0 for k in range(num_actions)]
                       for i in range(num_actions)
                       for _ in range(gatedQ.getNeurons() // num_actions)],
            pstc=tauPSC)

        currQ = net.make("currQ", 1, 1, mode="direct")
        currQ.fixMode()
        net.connect(gatedQ,
                    currQ,
                    transform=[[1 for _ in range(num_actions)]],
                    pstc=0.001)

        # select out only the previously chosen Q value
        gatedstoreQ = net.make_array("gatedstoreQ",
                                     N * 2,
                                     num_actions,
                                     node_factory=HRLutils.node_fac(),
                                     radius=Qradius)
        gatedstoreQ.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

        net.connect(old_vals_relay, gatedstoreQ, pstc=tauPSC)

        net.connect(
            saved_bg_relay,
            gatedstoreQ,
            transform=[[-3 if i != k else 0 for k in range(num_actions)]
                       for i in range(num_actions)
                       for _ in range(gatedstoreQ.getNeurons() // num_actions)],
            pstc=tauPSC)

        storeQ = net.make("storeQ", 1, 1, mode="direct")
        storeQ.fixMode()
        net.connect(gatedstoreQ,
                    storeQ,
                    transform=[[1 for _ in range(num_actions)]],
                    pstc=0.001)

        # create error calculation network
        error = errorcalc2.ErrorCalc2(discount,
                                      rewardradius=rewardradius,
                                      Qradius=Qradius)
        net.add(error)

        net.connect(currQ, error.getTermination("currQ"))
        net.connect(storeQ, error.getTermination("storeQ"))

        # gate error by learning signal and saved BG output (we only want error
        # when the system is supposed to be learning, and we only want error
        # related to the action that was selected)
        gatederror = net.make_array("gatederror",
                                    N * 2,
                                    num_actions,
                                    radius=errorcap,
                                    node_factory=HRLutils.node_fac())
        gatederror.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

        net.connect(error,
                    gatederror,
                    transform=[[1.0 / Qradius] for _ in range(num_actions)],
                    pstc=tauPSC)
        # scale the error by Qradius, so that we don't get super huge errors
        # (causes problems with the gating)

        learninggate = net.make("learninggate",
                                N,
                                1,
                                node_factory=HRLutils.node_fac())
        learninggate.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])
        learninggate.addTermination("gate", [[-10] for _ in range(N)], tauPSC,
                                    False)

        net.connect(learninggate,
                    gatederror,
                    func=lambda x: [1.0],
                    transform=[[-12] for _ in range(gatederror.getNeurons())],
                    pstc=tauPSC)

        net.connect(
            saved_bg_relay,
            gatederror,
            transform=[[-12 if i != k else 0 for k in range(num_actions)]
                       for i in range(num_actions)
                       for _ in range(gatederror.getNeurons() // num_actions)],
            pstc=tauPSC)

        # add a positive bias to the error anywhere the Q values are negative
        # (to stop Q values from getting too negative, which causes problems
        # with the action selection)
        posbias = positivebias.PositiveBias(N, num_actions)
        net.add(posbias)
        net.connect(old_vals_relay, posbias.getTermination("input"))
        net.connect(learninggate,
                    posbias.getTermination("learn"),
                    func=lambda x: [1.0])

        biasederror = net.make("biasederror", 1, num_actions, mode="direct")
        biasederror.fixMode()
        net.connect(gatederror, biasederror, pstc=0.001)
        net.connect(posbias, biasederror, pstc=0.001)

        self.exposeTermination(curr_bg_relay.getTermination("input"),
                               "curr_bg_input")
        self.exposeTermination(saved_bg_relay.getTermination("input"),
                               "saved_bg_input")
        self.exposeTermination(vals_relay.getTermination("input"), "vals")
        self.exposeTermination(old_vals_relay.getTermination("input"),
                               "old_vals")
        self.exposeTermination(error.getTermination("reward"), "reward")
        self.exposeTermination(error.getTermination("reset"), "reset")
        self.exposeTermination(learninggate.getTermination("gate"), "learn")
        self.exposeOrigin(biasederror.getOrigin("X"), "error")
Example #9
    def __init__(self, actions, Qradius=1, noiselevel=0.03):
        """Builds the BGNetwork.

        :param actions: actions available to the system
            :type actions: list of tuples (action_name,action_vector)
        :param Qradius: expected radius of Q values
        :param noiselevel: standard deviation of noise added to Q values for
            exploration
        """

        self.name = "BGNetwork"
        net = nef.Network(self, seed=HRLutils.SEED, quick=False)

        self.N = 50
        self.d = len(actions)
        self.mut_inhib = 1.0  # mutual inhibition between actions
        self.tauPSC = 0.007

        # make basal ganglia
        netbg = nef.Network("bg")

        bginput = netbg.make("bginput", 1, self.d, mode="direct")
        bginput.fixMode()
        bginput.addDecodedTermination("input",
                                      MU.diag([1.0 / Qradius for _ in
                                               range(self.d)]), 0.001, False)
        # divide by Q radius to get values back into 0 -- 1 range

        bgoutput = netbg.make("bgoutput", 1, self.d, mode="direct")
        bgoutput.fixMode()

        basalganglia.make_basal_ganglia(netbg, bginput, bgoutput,
                                        dimensions=self.d, neurons=200)
        bg = netbg.network
        net.add(bg)
        bg.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

        bg.exposeTermination(bginput.getTermination("input"), "input")
        bg.exposeOrigin(bgoutput.getOrigin("X"), "X")

        # insert noise (used to give some randomness to drive exploration)
        noiselevel = net.make_input("noiselevel", [noiselevel])

        noise = noisenode.NoiseNode(1, dimension=len(actions))
        net.add(noise)

        net.connect(noiselevel, noise.getTermination("scale"))
        net.connect(noise.getOrigin("noise"), "bg.bginput", pstc=0.001)

        # add bias to shift everything up to 0.5--1.5
        biasinput = net.make_input("biasinput", [0.5])
        net.connect(biasinput, "bg.bginput",
                    transform=[[1] for _ in range(self.d)], pstc=0.001)

        # invert BG output (so the "selected" action will have a positive value
        # and the rest zero)
        invert = thalamus.make(net, name="invert", neurons=self.N,
                               dimensions=self.d, useQuick=False)
        invert.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])
        net.connect(bg, invert.getTermination("bg_input"))

        # add mutual inhibition
        net.connect(invert.getOrigin("xBiased"), invert, pstc=self.tauPSC,
                    transform=[[0 if i == j else -self.mut_inhib
                                for j in range(self.d)]
                               for i in range(self.d)])

        # threshold output values so that you get a nice clean 0 for
        # non-selected and 1 for selected
        threshf = HRLutils.node_fac()
        threshold = 0.1
        threshf.setIntercept(IndicatorPDF(threshold, 1.0))
        val_threshold = net.make_array("val_threshold", self.N * 2, self.d,
                                       node_factory=threshf, encoders=[[1]])
        val_threshold.addDecodedOrigin(
            "output",
            [PiecewiseConstantFunction([threshold], [0, 1])
             for _ in range(self.d)], "AXON", True)

        net.connect(invert.getOrigin("xBiased"), val_threshold,
                    pstc=self.tauPSC)

        # output action (action vectors weighted by BG output)
        weight_actions = net.make_array("weight_actions", 50,
                                        len(actions[0][1]), intercept=(0, 1))
        net.connect(val_threshold.getOrigin("output"), weight_actions,
                    transform=MU.transpose([actions[i][1]
                                            for i in range(self.d)]),
                    pstc=0.007)

        # save the BG output (selected action and selected action value)
        save_relay = net.make("save_relay", 1, 1, mode="direct")
        save_relay.fixMode()
        save_relay.addDecodedTermination("input", [[1]], 0.001, False)

        saved_action = memory.Memory("saved_action", self.N * 2,
                                     len(actions[0][1]), inputscale=75)
        net.add(saved_action)
        net.connect(weight_actions, saved_action.getTermination("target"))
        net.connect(save_relay, saved_action.getTermination("transfer"))

        saved_vals = memory.Memory("saved_values", self.N * 2, self.d,
                                   inputscale=75)
        net.add(saved_vals)
        net.connect(val_threshold.getOrigin("output"),
                    saved_vals.getTermination("target"))
        net.connect(save_relay, saved_vals.getTermination("transfer"))

        # put the saved values through a threshold (we want a nice clean
        # zero for non-selected values)
        nfac = HRLutils.node_fac()
        nfac.setIntercept(IndicatorPDF(0.2, 1))
        saved_vals_threshold = net.make_array("saved_vals_threshold", self.N,
                                              self.d, node_factory=nfac,
                                              encoders=[[1]])
        saved_vals_threshold.addDecodedOrigin(
            "output", [PiecewiseConstantFunction([0.3], [0, 1])
                       for _ in range(self.d)], "AXON", True)

        net.connect(saved_vals, saved_vals_threshold, pstc=self.tauPSC)

        self.exposeTermination(bg.getTermination("input"), "input")
        self.exposeTermination(save_relay.getTermination("input"),
                               "save_output")
        self.exposeOrigin(val_threshold.getOrigin("output"), "curr_vals")
        self.exposeOrigin(weight_actions.getOrigin("X"), "curr_action")
        self.exposeOrigin(saved_vals_threshold.getOrigin("output"),
                          "saved_vals")
        self.exposeOrigin(saved_action.getOrigin("X"), "saved_action")
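
A hedged wiring sketch for the BGNetwork (the module name bgnetwork and the
variables net and qnet are assumptions; the exposed names are taken from the
code above):

actions = [("up", [0, 1]), ("right", [1, 0]), ("down", [0, -1]),
           ("left", [-1, 0])]
bg = bgnetwork.BGNetwork(actions, Qradius=1.0)
net.add(bg)

# Q values in, selected action out
net.connect(qnet.getOrigin("vals"), bg.getTermination("input"))

# pulse save_output to latch the currently selected action and its value
# into the saved_action/saved_values memories
save = net.make_input("save", [0])
net.connect(save, bg.getTermination("save_output"))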
Example #10
    def __init__(self, stateN, stateD, state_encoders, actions, learningrate,
                 stateradius=1.0, Qradius=1.0, load_weights=None,
                 state_evals=None, state_threshold=0.0):
        """Builds the QNetwork.

        :param stateN: number of neurons to use to represent state
        :param stateD: dimension of state vector
        :param state_encoders: encoders to use for neurons in state population
        :param actions: actions available to the system
            :type actions: list of tuples (action_name,action_vector)
        :param learningrate: learningrate for action value learning rule
        :param stateradius: expected radius of state values
        :param Qradius: expected radius of Q values
        :param load_weights: filename to load Q value weights from
        :param state_evals: evaluation points to use for state population.
            This is used when initializing the Q values (may be necessary if the
            input states don't tend to fall in the hypersphere).
        :param state_threshold: threshold of state neurons (minimum intercept)
        """

        self.name = "QNetwork"
        net = nef.Network(self, seed=HRLutils.SEED, quick=False)

        N = 50
        tauPSC = 0.007
        num_actions = len(actions)
        init_Qs = 0.2  # initial value for all Q values
        self.neuron_learning = False
        # if True, use neuron--neuron weight learning,
        # otherwise, use decoder learning

        # set up relays
        state_relay = net.make("state_relay", 1, stateD, mode="direct")
        # fix the simulation mode so it isn't overridden later (e.g., by an
        # over-zealous config file)
        state_relay.fixMode()
        state_relay.addDecodedTermination("input", MU.I(stateD), 0.001, False)

        # create state population
        state_fac = HRLutils.node_fac()
        state_fac.setIntercept(IndicatorPDF(state_threshold, 1.0))

        print("making the state_pop")
        state_pop = net.make("state_pop", stateN, stateD,
                              radius=stateradius,
                              node_factory=state_fac,
                              encoders=state_encoders,
                              eval_points=state_evals)
        state_pop.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

        net.connect(state_relay, state_pop, pstc=tauPSC)

        # store the state value (used to drive population encoding previous state)
        print("create the saved state memory")
        saved_state = memory.Memory("saved_state", N * 4, stateD, inputscale=50, radius=stateradius,
                                    direct_storage=True)
        net.add(saved_state)

        net.connect(state_relay, saved_state.getTermination("target"))

        # create population representing previous state
        old_state_pop = net.make("old_state_pop", stateN, stateD,
                              radius=stateradius,
                              node_factory=state_fac,
                              encoders=state_encoders,
                              eval_points=state_evals)
        old_state_pop.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

        net.connect(saved_state, old_state_pop, pstc=tauPSC)

        print("setup the action nodes")
        # set up action nodes
        if self.neuron_learning:
            # use ActionValues network to compute Q values

            # current Q values
            decoders = state_pop.addDecodedOrigin("init_decoders",
                                                  [ConstantFunction(stateD, init_Qs)], "AXON").getDecoders()
            actionvals = actionvalues.ActionValues("actionvals", N, stateN, actions, learningrate,
                                                   Qradius=Qradius, init_decoders=decoders)
            net.add(actionvals)

            net.connect(state_pop.getOrigin("AXON"), actionvals.getTermination("state"))

            # Q values of previous state
            decoders = old_state_pop.addDecodedOrigin("init_decoders",
                                                      [ConstantFunction(stateD, init_Qs)], "AXON").getDecoders()
            old_actionvals = actionvalues.ActionValues("old_actionvals", N, stateN, actions, learningrate,
                                                       Qradius=Qradius, init_decoders=decoders)
            net.add(old_actionvals)

            net.connect(old_state_pop.getOrigin("AXON"), old_actionvals.getTermination("state"))
        else:
            # just use decoder on state population to compute Q values

            # current Q values
            origin = state_pop.addDecodedOrigin("vals",
                                        [ConstantFunction(num_actions, init_Qs) for _ in range(num_actions)],
                                        "AXON")
            state_dlnode = decoderlearningnode.DecoderLearningNode(state_pop, origin, learningrate,
                                                                   num_actions, name="state_learningnode")
            net.add(state_dlnode)

            # just a little relay node, so that things match up for the rest of the script 
            # when you have the neuron -- neuron learning
            actionvals = net.make("actionvals", 1, num_actions, mode="direct")
            actionvals.fixMode()
            net.connect(origin, actionvals, pstc=0.001)

            # Q values of previous state
            origin = old_state_pop.addDecodedOrigin("vals",
                                        [ConstantFunction(num_actions, init_Qs) for _ in range(num_actions)],
                                        "AXON")
            old_state_dlnode = decoderlearningnode.DecoderLearningNode(
                old_state_pop, origin, learningrate, num_actions,
                name="old_state_learningnode")
            net.add(old_state_dlnode)

            old_actionvals = net.make("old_actionvals", 1, num_actions, mode="direct")
            old_actionvals.fixMode()
            net.connect(origin, old_actionvals, pstc=0.001)

        if load_weights is not None:
            self.loadParams(load_weights)

        # find error between old_actionvals and actionvals (this will be used to drive learning
        # on the new actionvals)
        valdiff = net.make_array("valdiff", N, num_actions, node_factory=HRLutils.node_fac())
        net.connect(old_actionvals, valdiff, transform=MU.diag([2] * num_actions), pstc=tauPSC)
        net.connect(actionvals, valdiff, transform=MU.diag([-2] * num_actions), pstc=tauPSC)
        # doubling the values to get a bigger error signal

        # calculate diff between curr_state and saved_state and use that to
        # gate valdiff (we only want to train the curr state based on the
        # previous state when the two have similar values)
        statediff = net.make_array("statediff", N, stateD, intercept=(0.2, 1))
        # note: intercept > 0 so that there is a deadzone in the middle (when
        # the states are similar) where there will be no output inhibition
        net.connect(state_relay, statediff, pstc=tauPSC)
        net.connect(saved_state, statediff, transform=MU.diag([-1] * stateD), pstc=tauPSC)

        net.connect(statediff, valdiff, func=lambda x: [abs(v) for v in x],
                    transform=[[-10] * stateD for _ in range(valdiff.getNeurons())], pstc=tauPSC)

        # connect up valdiff to the error signal for current Q values, and expose
        # the error signal for the previous Q values to the external error
        if self.neuron_learning:
            net.connect(valdiff, actionvals.getTermination("error"))
            self.exposeTermination(old_actionvals.getTermination("error"), "error")
        else:
            net.connect(valdiff, state_dlnode.getTermination("error"))
            self.exposeTermination(old_state_dlnode.getTermination("error"), "error")

        self.exposeTermination(state_relay.getTermination("input"), "state")
        self.exposeTermination(saved_state.getTermination("transfer"), "save_state")
        self.exposeOrigin(actionvals.getOrigin("X"), "vals")
        self.exposeOrigin(old_actionvals.getOrigin("X"), "old_vals")
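
A hedged construction sketch for the QNetwork (the module name qnetwork, the
learning rate, and the variables state_input, save_signal, errornet and bg
are assumptions; passing state_encoders=None falls back to nef's default
encoders):

actions = [("up", [0, 1]), ("right", [1, 0]), ("down", [0, -1]),
           ("left", [-1, 0])]
qnet = qnetwork.QNetwork(1000, 2, None, actions, 9e-10)
net.add(qnet)

net.connect(state_input, qnet.getTermination("state"))
net.connect(save_signal, qnet.getTermination("save_state"))
net.connect(errornet.getOrigin("error"), qnet.getTermination("error"))
net.connect(qnet.getOrigin("vals"), bg.getTermination("input"))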
Example #11
    def __init__(self,
                 name,
                 N,
                 stateN,
                 actions,
                 learningrate,
                 Qradius=1.0,
                 init_decoders=None):
        """Build ActionValues network.

        :param name: name of Network
        :param N: base number of neurons
        :param stateN: number of neurons in state population
        :param actions: actions available to the system
            :type actions: list of tuples (action_name,action_vector)
        :param learningrate: learning rate for PES rule
        :param Qradius: expected radius of Q values
        :param init_decoders: if specified, will be used to initialize the
            connection weights to whatever function is specified by decoders
        """

        self.name = name
        net = nef.Network(self, seed=HRLutils.SEED, quick=False)

        self.N = N
        self.learningrate = learningrate
        self.supervision = 1.0  # don't use the unsupervised stuff at all

        self.tauPSC = 0.007

        modterms = []
        learnterms = []

        # relays
        output = net.make("output", 1, len(actions), mode="direct")
        output.fixMode()

        for i, action in enumerate(actions):
            # create one population corresponding to each action
            act_pop = net.make("action_" + action[0],
                               self.N * 4,
                               1,
                               node_factory=HRLutils.node_fac())
            act_pop.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

            # add error termination
            modterm = act_pop.addDecodedTermination(
                "error", [[0 if j != i else 1 for j in range(len(actions))]],
                0.005, True)
            # set modulatory transform so that it selects one dimension of
            # the error signal

            # create learning termination
            if init_decoders is not None:
                weights = MU.prod(act_pop.getEncoders(),
                                  MU.transpose(init_decoders))
            else:
                weights = [[
                    random.uniform(-1e-3, 1e-3) for j in range(stateN)
                ] for i in range(act_pop.getNeurons())]
            learningterm = act_pop.addHPESTermination("learning", weights,
                                                      0.005, False, None)

            # initialize the learning rule
            net.learn(act_pop,
                      learningterm,
                      modterm,
                      rate=self.learningrate,
                      supervisionRatio=self.supervision)
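            # (net.learn attaches the PES rule to learningterm, with modterm
            # supplying the modulatory error signal; supervisionRatio=1.0
            # makes the update fully error-driven, with no unsupervised
            # component)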

            # connect each action back to output relay
            net.connect(act_pop.getOrigin("X"),
                        output,
                        transform=[[0] if j != i else [Qradius]
                                   for j in range(len(actions))],
                        pstc=0.001)
            # note, we learn all the Q values with radius 1, then just
            # multiply by the desired Q radius here

            modterms += [modterm]
            learnterms += [learningterm]

        # use EnsembleTerminations to group the individual action terminations
        # into one multi-dimensional termination
        self.exposeTermination(EnsembleTermination(self, "state", learnterms),
                               "state")
        self.exposeTermination(EnsembleTermination(self, "error", modterms),
                               "error")

        self.exposeOrigin(output.getOrigin("X"), "X")
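
Example #3 above shows a complete standalone setup of this network. The key
wiring detail is that the state termination expects raw neural activity
rather than a decoded value, so it is driven from the state population's
AXON origin, while error expects one error dimension per action
(error_signal below is illustrative):

net.connect(state_pop.getOrigin("AXON"), actionvals.getTermination("state"))
net.connect(error_signal, actionvals.getTermination("error"))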
Example #12
    def __init__(self, name, N, stateN, actions, learningrate, Qradius=1.0, init_decoders=None):
        """Build ActionValues network.

        :param name: name of Network
        :param N: base number of neurons
        :param stateN: number of neurons in state population
        :param actions: actions available to the system
            :type actions: list of tuples (action_name,action_vector)
        :param learningrate: learning rate for PES rule
        :param Qradius: expected radius of Q values
        :param init_decoders: if specified, will be used to initialize the connection
            weights to whatever function is specified by the decoders
        """

        self.name = name
        net = nef.Network(self, seed=HRLutils.SEED, quick=False)

        self.N = N
        self.learningrate = learningrate
        self.supervision = 1.0  # don't use the unsupervised stuff at all

        self.tauPSC = 0.007

        modterms = []
        learnterms = []

        # relays
        output = net.make("output", 1, len(actions), mode="direct")
        output.fixMode()

        for i, action in enumerate(actions):
            # create one population corresponding to each action
            act_pop = net.make("action_" + action[0], self.N * 4, 1, node_factory=HRLutils.node_fac())
            act_pop.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

            # add error termination
            modterm = act_pop.addDecodedTermination("error", [[0 if j != i else 1 for j in range(len(actions))]],
                                                    0.005, True)
            # set modulatory transform so that it selects one dimension of
            # the error signal

            # create learning termination
            if init_decoders is not None:
                weights = MU.prod(act_pop.getEncoders(), MU.transpose(init_decoders))
            else:
                weights = [[random.uniform(-1e-3, 1e-3) for j in range(stateN)] for i in range(act_pop.getNeurons())]
            learningterm = act_pop.addHPESTermination("learning", weights, 0.005, False, None)

            # initialize the learning rule
            net.learn(act_pop, learningterm, modterm, rate=self.learningrate, supervisionRatio=self.supervision)

            # connect each action back to output relay
            net.connect(act_pop.getOrigin("X"), output, transform=[[0] if j != i else [Qradius] for j in range(len(actions))],
                        pstc=0.001)
            # note, we learn all the Q values with radius 1, then just
            # multiply by the desired Q radius here

            modterms += [modterm]
            learnterms += [learningterm]

        # use EnsembleTerminations to group the individual action terminations into one multi-dimensional termination
        self.exposeTermination(EnsembleTermination(self, "state", learnterms), "state")
        self.exposeTermination(EnsembleTermination(self, "error", modterms), "error")

        self.exposeOrigin(output.getOrigin("X"), "X")
Example #13
    def __init__(self,
                 stateN,
                 stateD,
                 state_encoders,
                 actions,
                 learningrate,
                 stateradius=1.0,
                 Qradius=1.0,
                 load_weights=None,
                 state_evals=None,
                 state_threshold=(0.0, 1.0),
                 statediff_threshold=0.2,
                 init_Qs=None):
        """Builds the QNetwork.

        :param stateN: number of neurons to use to represent state
        :param stateD: dimension of state vector
        :param state_encoders: encoders to use for neurons in state population
        :param actions: actions available to the system
            :type actions: list of tuples (action_name,action_vector)
        :param learningrate: learningrate for action value learning rule
        :param stateradius: expected radius of state values
        :param Qradius: expected radius of Q values
        :param load_weights: filename to load Q value weights from
        :param state_evals: evaluation points to use for state population.
            This is used when initializing the Q values (may be necessary if
            the input states don't tend to fall in the hypersphere).
        :param state_threshold: threshold range of state neurons
        :param statediff_threshold: maximum state difference for dual training
        :param init_Qs: initial Q values
        """

        self.name = "QNetwork"
        net = nef.Network(self, seed=HRLutils.SEED, quick=False)

        N = 50
        tauPSC = 0.007
        num_actions = len(actions)
        init_Qs = [0.2] * num_actions if init_Qs is None else init_Qs

        # if True, use neuron--neuron weight learning, otherwise, use decoder
        # learning
        self.neuron_learning = False

        # set up relays
        state_relay = net.make("state_relay", 1, stateD, mode="direct")
        state_relay.fixMode()
        state_relay.addDecodedTermination("input", MU.I(stateD), 0.001, False)

        # create state population
        state_fac = HRLutils.node_fac()
        if isinstance(state_threshold, (float, int)):
            state_threshold = (state_threshold, 1.0)
        state_fac.setIntercept(
            IndicatorPDF(state_threshold[0], state_threshold[1]))

        state_pop = net.make("state_pop",
                             stateN,
                             stateD,
                             radius=stateradius,
                             node_factory=state_fac,
                             encoders=state_encoders,
                             eval_points=state_evals)
        state_pop.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

        net.connect(state_relay, state_pop, pstc=tauPSC)

        # store the state value (used to drive population encoding previous
        # state)
        saved_state = memory.Memory("saved_state",
                                    N * 4,
                                    stateD,
                                    inputscale=50,
                                    radius=stateradius,
                                    direct_storage=True)
        net.add(saved_state)

        net.connect(state_relay, saved_state.getTermination("target"))

        # create population representing previous state
        old_state_pop = net.make("old_state_pop",
                                 stateN,
                                 stateD,
                                 radius=stateradius,
                                 node_factory=state_fac,
                                 encoders=state_encoders,
                                 eval_points=state_evals)
        old_state_pop.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

        net.connect(saved_state, old_state_pop, pstc=tauPSC)

        # set up action nodes
        if self.neuron_learning:
            # use ActionValues network to compute Q values

            # current Q values
            # note: ConstantFunction takes a scalar value, so this branch
            # uses the first initial Q value for all actions
            decoders = state_pop.addDecodedOrigin(
                "init_decoders", [ConstantFunction(stateD, init_Qs[0])],
                "AXON").getDecoders()
            actionvals = actionvalues.ActionValues("actionvals",
                                                   N,
                                                   stateN,
                                                   actions,
                                                   learningrate,
                                                   Qradius=Qradius,
                                                   init_decoders=decoders)
            net.add(actionvals)

            net.connect(state_pop.getOrigin("AXON"),
                        actionvals.getTermination("state"))

            # Q values of previous state
            decoders = old_state_pop.addDecodedOrigin(
                "init_decoders", [ConstantFunction(stateD, init_Qs[0])],
                "AXON").getDecoders()
            old_actionvals = actionvalues.ActionValues("old_actionvals",
                                                       N,
                                                       stateN,
                                                       actions,
                                                       learningrate,
                                                       Qradius=Qradius,
                                                       init_decoders=decoders)
            net.add(old_actionvals)

            net.connect(old_state_pop.getOrigin("AXON"),
                        old_actionvals.getTermination("state"))
        else:
            # just use decoder on state population to compute Q values

            # current Q values
            origin = state_pop.addDecodedOrigin("vals", [
                ConstantFunction(num_actions, init_Qs[i])
                for i in range(num_actions)
            ], "AXON")
            state_dlnode = decoderlearningnode.DecoderLearningNode(
                state_pop,
                origin,
                learningrate,
                num_actions,
                name="state_learningnode")
            net.add(state_dlnode)

            # just a little relay node, so that things match up for the rest of
            # the script when you have the neuron -- neuron learning
            actionvals = net.make("actionvals", 1, num_actions, mode="direct")
            actionvals.fixMode()
            net.connect(origin, actionvals, pstc=0.001)

            # Q values of previous state
            origin = old_state_pop.addDecodedOrigin("vals", [
                ConstantFunction(num_actions, init_Qs[i])
                for i in range(num_actions)
            ], "AXON")
            old_state_dlnode = decoderlearningnode.DecoderLearningNode(
                old_state_pop,
                origin,
                learningrate,
                num_actions,
                name="old_state_learningnode")
            net.add(old_state_dlnode)

            old_actionvals = net.make("old_actionvals",
                                      1,
                                      num_actions,
                                      mode="direct")
            old_actionvals.fixMode()
            net.connect(origin, old_actionvals, pstc=0.001)

        if load_weights is not None:
            self.loadParams(load_weights)

        # find error between old_actionvals and actionvals (this will be used
        # to drive learning on the new actionvals)
        valdiff = net.make_array("valdiff",
                                 N,
                                 num_actions,
                                 node_factory=HRLutils.node_fac())
        # doubling the values to get a bigger error signal
        net.connect(old_actionvals,
                    valdiff,
                    transform=MU.diag([2] * num_actions),
                    pstc=tauPSC)
        net.connect(actionvals,
                    valdiff,
                    transform=MU.diag([-2] * num_actions),
                    pstc=tauPSC)

        # calculate diff between curr_state and saved_state and use that to
        # gate valdiff (we only want to train the curr state based on previous
        # state when the two have similar values)
        # note: threshold > 0 so that there is a deadzone in the middle (when
        # the states are similar) where there will be no output inhibition
        statediff = net.make_array("statediff",
                                   N,
                                   stateD,
                                   intercept=(statediff_threshold, 1))

        net.connect(state_relay, statediff, pstc=tauPSC)
        net.connect(saved_state,
                    statediff,
                    transform=MU.diag([-1] * stateD),
                    pstc=tauPSC)

        net.connect(statediff,
                    valdiff,
                    func=lambda x: [abs(v) for v in x],
                    transform=[[-10] * stateD
                               for _ in range(valdiff.getNeurons())],
                    pstc=tauPSC)

        # connect up valdiff to the error signal for current Q values, and
        # expose the error signal for the previous Q values to the external
        # error
        if self.neuron_learning:
            net.connect(valdiff, actionvals.getTermination("error"))
            self.exposeTermination(old_actionvals.getTermination("error"),
                                   "error")
        else:
            net.connect(valdiff, state_dlnode.getTermination("error"))
            self.exposeTermination(old_state_dlnode.getTermination("error"),
                                   "error")

        self.exposeTermination(state_relay.getTermination("input"), "state")
        self.exposeTermination(saved_state.getTermination("transfer"),
                               "save_state")
        self.exposeOrigin(actionvals.getOrigin("X"), "vals")
        self.exposeOrigin(old_actionvals.getOrigin("X"), "old_vals")
Example #14
    def __init__(self, discount, rewardradius=1.0, Qradius=1.0):
        """Builds the ErrorCalc2 network.

        :param discount: discount factor, controls rate of integration
        :param rewardradius: expected radius of reward value
        :param Qradius: expected radius of Q values
        """

        self.name = "ErrorCalc"
        net = nef.Network(self, seed=HRLutils.SEED, quick=False)

        tauPSC = 0.007
        intPSC = 0.1
        N = 50

        # relay for current Q input
        currQ = net.make("currQ", 1, 1, node_factory=HRLutils.node_fac(),
                         mode="direct", radius=Qradius)
        currQ.fixMode()
        currQ.addDecodedTermination("input", [[1]], 0.001, False)

        # input population for resetting the network
        reset_nodefac = HRLutils.node_fac()
        reset_nodefac.setIntercept(IndicatorPDF(0.3, 1.0))
        reset = net.make("reset", N, 1, encoders=[[1]],
                         node_factory=reset_nodefac)
        reset.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])
        # this population will begin outputting a value once the reset
        # signal exceeds the threshold, and that output will then be
        # used to reset the rest of the network

        reset.addDecodedTermination("input", [[1]], tauPSC, False)

        # relay for stored previous value of Q
        storeQ = net.make("storeQ", 1, 1, node_factory=HRLutils.node_fac(),
                          mode="direct", radius=Qradius)
        storeQ.fixMode()
        storeQ.addDecodedTermination("input", [[1]], 0.001, False)

        # calculate "discount" by integrating output of storeQ
        acc_storeQ = memory.Memory("acc_storeQ", N * 8, 1, inputscale=50)
        net.add(acc_storeQ)

        zero_input = net.make_input("zero_input", [0])

        net.connect(zero_input, acc_storeQ.getTermination("target"))
        net.connect(reset, acc_storeQ.getTermination("transfer"))

        # threshold storeQ value so it won't go below zero.  that is, if we
        # have negative Q values, we don't want to have a negative discount,
        # or that will just drive the highest (negative) Q value upwards, and
        # it will always be selected.  negative Q values are instead pushed
        # upwards by the PositiveBias mechanism.
        Qthresh = net.make("Qthresh", N * 2, 1, encoders=[[1]],
                           eval_points=[[x * 0.001] for x in range(1000)],
                           radius=Qradius, intercept=(0, 1))
        net.connect(storeQ, Qthresh, pstc=tauPSC)
        net.connect(Qthresh, acc_storeQ, pstc=intPSC,
                    transform=[[discount * intPSC]],
                    func=lambda x: max(x[0], 0.0))

        # accumulate reward
        reward = memory.Memory("reward", N * 4, 1, radius=rewardradius,
                               inputscale=50)
        net.add(reward)

        reward.addDecodedTermination("input", [[intPSC]], intPSC, False)

        net.connect(zero_input, reward.getTermination("target"))
        net.connect(reset, reward.getTermination("transfer"))

        # put reward, currQ, storeQ, and discount together to calculate error
        error = net.make("error", N * 2, 1, node_factory=HRLutils.node_fac())

        net.connect(currQ, error, pstc=tauPSC)
        net.connect(reward, error, pstc=tauPSC)
        net.connect(storeQ, error, pstc=tauPSC, transform=[[-1]])
        net.connect(acc_storeQ, error, pstc=tauPSC, transform=[[-1]])

        self.exposeTermination(reward.getTermination("input"), "reward")
        self.exposeTermination(reset.getTermination("input"), "reset")
        self.exposeTermination(currQ.getTermination("input"), "currQ")
        self.exposeTermination(storeQ.getTermination("input"), "storeQ")
        self.exposeOrigin(error.getOrigin("X"), "X")